PDF是一种电子文档格式,支持在正文中插入图片和排版,比纯文本更适合阅读!
(私信小编007可以获取很多本Python相关的PDF!!)
有人会说,这是黑客技术?这么强大!
NO,一点都不强大。就比如上次发布的Python无需百度积分下载百度文库一样,不过是一个API的调用!
扫描二维码关注公众号,回复:
3486583 查看本文章
好了,下面看效果:
源码:
# Python 3 script.
# Requires third-party packages: requests and BeautifulSoup (bs4), plus lxml.
# Scrapes PDF ebook listings from ifblue.net ("若兰格"), resolves each post's
# Baidu-pan share link, asks an unofficial API for the share's access code,
# and saves "title / link / code" records to down.txt.
import requests
from bs4 import BeautifulSoup as bp4
import json

# Unofficial API that returns JSON metadata (incl. access_code) for a
# Baidu-pan share id.  {0} is the share id extracted from the pan URL.
BaiDuAPI_URL = "http://ypsuperkey.meek.com.cn/api/v1/items/BDY-{0}?client_version=2018.11"


def req(url_para):
    """GET the Baidu-key API for share id *url_para*.

    Returns the response body on HTTP 200, otherwise an empty string.
    """
    # Browser-like headers so the API endpoint accepts the request.
    api_headers = {
        'Host': 'ypsuperkey.meek.com.cn',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'close',
    }
    # NOTE: renamed the local from `req` — the original shadowed this
    # function's own name.
    resp = requests.get(BaiDuAPI_URL.format(url_para), headers=api_headers)
    if resp.status_code == 200:
        return resp.text
    return ''


# =============================================================================
RUGE_HOME = "http://www.ifblue.net/"
RUGE_SEARCH_URL = "http://www.ifblue.net/search/{0}/page/{1}"

# Browser-like headers for the ifblue.net site.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'DNT': '1',
    'Host': 'www.ifblue.net',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
}


def req_html(url):
    """GET *url* from ifblue.net; return the HTML on 200, else ''."""
    resp = requests.get(url, headers=headers)
    if resp.status_code == 200:
        return resp.text
    return ""


def main():
    """Interactively search, then resolve and record download links."""
    key = input('搜索关键词:')
    page = int(input('查询页面:'))
    items = []  # collected {'title': ..., 'url': ...} search hits
    print('当前搜索为 ', key)

    # --- phase 1: walk search-result pages and collect article links ------
    count = 1
    for p_ in range(page):
        pp_ = p_ + 1  # site pages are 1-based
        print('当前页面 : ', pp_)
        html = req_html(RUGE_SEARCH_URL.format(key, pp_))
        html_obj = bp4(html, "lxml")
        # Each search hit is an <article class="excerpt"> element.
        articles = html_obj.find_all(name='article', attrs={'class': "excerpt"})
        if len(articles) < 1:
            # No more results: stop paging but still process what we have
            # (the original called exit() here, discarding everything).
            break
        for art_ in articles:
            item = {
                'title': art_.header.h2.a['title'],
                'url': art_.header.h2.a['href'],
            }
            items.append(item)
            print('{0:3} {1:40} {2}'.format(count, item['title'], item['url']))
            count += 1

    # --- phase 2: resolve each article to a Baidu-pan link + access code --
    PdfItems = []
    c_ = 1
    for d_ in items:
        print('当前下载', d_['title'], end=' ')
        # Article URL -> its download.html?pid= page on the same site.
        downurl = d_['url'].replace('.html', '').replace(
            'http://www.ifblue.net/', 'http://www.ifblue.net/download.html?pid=')
        html = req_html(downurl)
        html_obj = bp4(html, "lxml")
        # The download page carries the pan.baidu.com link in a <center><a>.
        dd_ = html_obj.find('center')
        baiduurl = dd_.a['href']
        try:
            # Strip the URL down to the bare share id and ask the API
            # for the share's access code.
            share_id = baiduurl.replace('https://', '').replace(
                'http://', '').replace('pan.baidu.com/s/1', '')
            baidukey = req(share_id)
            j_obj = json.loads(baidukey)
            PdfItem = {
                'id': c_,
                'title': d_['title'],
                'url': baiduurl,
                'code': j_obj['access_code'],
            }
            PdfItems.append(PdfItem)
            print('{0} {1}'.format(baiduurl, j_obj['access_code']))
            c_ += 1
        except Exception:
            # Best-effort: a missing/invalid API response just skips the item.
            print(' 获取失败 ')

    # --- phase 3: persist the resolved records ----------------------------
    # encoding + trailing newline added: without them the Chinese text can
    # be mis-encoded on non-UTF-8 platforms and all records run together.
    with open('down.txt', 'w', encoding='utf-8') as pf:
        for t_ in PdfItems:
            pf.write('{0:3} 名称: {1} 链接: {2} 密码: {3} \n'.format(
                t_['id'], t_['title'], t_['url'], t_['code']))


if __name__ == '__main__':
    main()