一、需求
获取“起点中文网”的书名、月票数据
二、思路分析
(1)获取书名
响应得到的是html类型,可以通过xpath提取书名
(2)获取月票
同样通过xpath提取月票,发现无法正常获取
在response中发现,月票是以 &#100514; 这种数字字符引用(形如"&#十进制数字;")的格式呈现的,这是字体加密反爬操作
改用正则从响应得到的文本数据中提取月票密文,起初同样无法正常获取到数据
测试发现,原因是span标签的class属性值是会变化的,因此正则中class的值需要用.*?来匹配
获取字体加密数据包,可以根据woff后缀、size大小来判断
因为每一次请求的字体加密数据包都会发生变化,所以需要实时获取对应的url,向url发送请求,并下载字体加密数据包
解析字体加密文件,创建字体对象、转成xml明文格式
获取字体加密文件中的字体加密映射表cmap
将字体加密映射表的英文转换成阿拉伯数字
将密文的特殊字符&#去除(剩余纯数字)、将纯数字密文换成明文
(3)整理数据
三、代码分享
import requests
from lxml import etree
from fontTools.ttLib import TTFont
import re
# 1. Target URL: the monthly-ticket ranking page.
url = "https://www.qidian.com/rank/yuepiao/"
# 2. Send the request and receive the response.
# NOTE(review): the Cookie below is a captured browser session (it carries a
# _csrfToken value). It will go stale and should not be shared as-is;
# substitute your own session cookie when running the script.
headers = {
    "Cookie": "e1=%7B%22pid%22%3A%22qd_P_rank_19%22%2C%22eid%22%3A%22%22%2C%22l1%22%3A34%7D; e2=%7B%22pid%22%3A%22qd_P_rank_19%22%2C%22eid%22%3A%22%22%7D; e1=%7B%22pid%22%3A%22qd_P_rank_01%22%2C%22eid%22%3A%22qd_C45%22%2C%22l1%22%3A5%7D; e2=%7B%22pid%22%3A%22qd_P_rank_01%22%2C%22eid%22%3A%22qd_C46%22%2C%22l1%22%3A5%7D; newstatisticUUID=1636507075_361751491; _csrfToken=FY37tuXKZgpka9FClqCf2Wjbw8qNVtHeABymjqPk; qdrs=0%7C3%7C0%7C0%7C1; showSectionCommentGuide=1; qdgd=1; lrbc=1027669580%7C697684798%7C1; rcr=1027669580; hiijack=0; gender=male; fu=511383626; _gid=GA1.2.1338733297.1652162834; COOKIE_BOOKLIST_TIPS=1; _ga_D20NXNVDG2=GS1.1.1652268082.3.0.1652268082.0; _ga_VMQL7235X0=GS1.1.1652268082.3.0.1652268083.0; e1=%7B%22pid%22%3A%22qd_p_qidian%22%2C%22eid%22%3A%22qd_A16%22%2C%22l1%22%3A3%7D; e2=%7B%22pid%22%3A%22qd_p_qidian%22%2C%22eid%22%3A%22qd_A16%22%2C%22l1%22%3A3%7D; _yep_uuid=cb6a56e2-7718-b39b-848b-ae3888468f71; _ga_FZMMH98S83=GS1.1.1652352841.16.1.1652352888.0; _ga_PFYW0QLV3P=GS1.1.1652352841.16.1.1652352888.0; _ga=GA1.2.1915267229.1636507067",
    "Referer": "https://www.qidian.com/rank/",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36",
}
# requests applies no timeout by default, so an unresponsive server would
# block the script forever; bound the wait explicitly.
response = requests.get(url, headers=headers, timeout=10)
# Stop here with a clear HTTP error instead of failing later when the
# extraction patterns find nothing in an error page.
response.raise_for_status()
# 3. Extract the data.
# (For debugging, print(response.content.decode()) shows the raw HTML.)
# 3.1 Book titles: the text of every <a> that is a direct child of an <h2>.
page_source = response.content.decode()
html_data = etree.HTML(page_source)
book_names = html_data.xpath("//h2/a/text()")
print("书名:", book_names)
# 3.2 Monthly-ticket counts (still obfuscated at this point).
res = response.content.decode()
# Each count sits in markup shaped like
#   </style><span class="fYmdRxmC">&#NNNNNN;&#NNNNNN;...</span></span>月票</p>
# The class value is not stable, so it is matched with .*? rather than
# spelled out. The capture group holds the run of entities without the
# final ';' (presumably numeric character references -- the later steps
# strip "&#" and keep the digits).
ticket_pattern = re.compile(r'</style><span class=".*?">(.*?);</span></span>月票</p>')
monthly_ticket = ticket_pattern.findall(res)
print("月票密文:", monthly_ticket)
# Locate the URL of the obfuscation font inside the page. The stylesheet
# contains a fragment shaped like
#   format('eot'); src: url('https://qidian.gtimg.com/qd_anti_spider/CSfbJBtq.woff') format('woff')
# and the URL is read from the page on every run rather than hard-coded.
woff_match = re.search(r"format\('eot'\); src: url\('(.*?)'\) format\('woff'\)", res)
if woff_match is None:
    # Previously this surfaced as a bare IndexError from findall(...)[0].
    raise RuntimeError("woff font URL not found in the ranking page response")
woff_url = woff_match.group(1)
print("字体加密文件url: ", woff_url)
# Download the font file and store it next to the script.
# The timeout keeps a stalled download from hanging the script, and the
# status check prevents writing an HTTP error page to disk as "font.woff".
woff_response = requests.get(woff_url, headers=headers, timeout=10)
woff_response.raise_for_status()
with open("font.woff", "wb") as f:
    f.write(woff_response.content)
# Parse the downloaded font and dump it as XML so it can be inspected by hand.
font_obj = TTFont("font.woff")
font_obj.saveXML("new_font.xml")
# Fetch the font's code point -> glyph name table.
cmap = font_obj.getBestCmap()
print("字体加密映射表:", cmap)
if cmap is None:
    # getBestCmap() yields None when the font has no Unicode cmap subtable;
    # fail with a clear message instead of a TypeError in the code below.
    raise RuntimeError("font.woff has no usable cmap table")
# Turn the English glyph names into the characters they draw, e.g.
#   {100060: 'six', 100062: 'nine', 100063: 'five', 100064: 'zero', 100065: 'two', 100066: 'period',
#    100067: 'four', 100068: 'one', 100069: 'eight', 100070: 'three', 100071: 'seven'}
# becomes
#   {100060: "6", 100062: "9", 100063: "5", 100064: "0", 100065: "2", ...}
num_dict = {
    "one": "1", "two": "2", "three": "3", "four": "4", "five": "5", "six": "6",
    "seven": "7", "eight": "8", "nine": "9", "zero": "0", "period": ".",
}
# Build a new dict with one lookup per entry; glyph names that are not in
# num_dict are kept unchanged.
# NOTE(review): getBestCmap() appears to return the font's own table dict --
# confirm against fontTools; building a fresh dict avoids editing it in place.
cmap = {code: num_dict.get(glyph, glyph) for code, glyph in cmap.items()}
print("映射表转换后的结果:", cmap)
# Decode the obfuscated counts held in monthly_ticket.
# (1) Drop the entity markup: each captured string is replaced, in place,
#     by the list of its digit runs (\d+ = one or more consecutive digits),
#     e.g. ['100514', '100509', ...].
digit_run = re.compile(r"\d+")
monthly_ticket[:] = [digit_run.findall(raw) for raw in monthly_ticket]
print("去除特殊字符后的月票密文:", monthly_ticket)
# (2) Replace every decimal code with the character it stands for.
# cmap is keyed by int code points while the codes here are digit strings,
# so build a str-keyed lookup once instead of scanning the whole map (and
# calling str() on every key) for each single code.
plain_by_code = {str(code): plain for code, plain in cmap.items()}
for codes in monthly_ticket:  # codes = ['100514', '100509', ...]: one book's count
    # Codes that are missing from the map are left untouched.
    codes[:] = [plain_by_code.get(code, code) for code in codes]
print("解析后的月票明文:", monthly_ticket)
# Join each book's decoded characters into a single string.
# str.join builds each string in one pass instead of repeated "+" concatenation.
new_list = ["".join(chars) for chars in monthly_ticket]
print("月票明文数据结果:", new_list)
# Pair each title with its count by position. zip() stops at the shorter
# list, and a repeated title keeps the last count paired with it.
result = {title: tickets for title, tickets in zip(book_names, new_list)}
print('result:', result)
- 写在最后:有任何代码问题,欢迎交流~