因为老师要求实验报告中附带参考文献原文,所以编写了这个脚本。(作业果然是最佳动力)
Python 的 PDF 工具还是很全的,本次使用的是 pdfkit(wkhtmltopdf 的 Python 接口)
因为基本不会报什么错,所以写成了命令行运行形式
一次只能处理一个url
这份代码是根据我自己的需要编写的,放到博客上只是给小伙伴们提供参考思路,请根据自己的需求修改~
#!/usr/bin/python
#@Author: zhongshsh
import requests
from bs4 import BeautifulSoup, NavigableString
import urllib
import pdfkit
import sys
def get_html(url, timeout=30):
    """Download a web page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.RequestException: On connection problems, timeouts, or when
            the server answers with a 4xx/5xx status.
    """
    # Browser-like User-Agent; presumably needed so the target site serves
    # the normal page to a script. The original literal was split with a
    # backslash inside the string and contained stray spaces in the Chrome
    # version, which produced a malformed header value.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/52.0.2743.116 Safari/537.36'
        ),
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail here rather than handing an error page to the HTML parser.
    response.raise_for_status()
    return response.text
def strip_tags(html, invalid_tags):
    """Remove the listed tags from *html* but keep everything inside them.

    Typical use is dropping hyperlinks (``['a']``) while preserving the link
    text and any nested markup such as images.

    Args:
        html: HTML source as a string.
        invalid_tags: Collection of tag names whose opening/closing tags
            should be removed.

    Returns:
        The parsed ``BeautifulSoup`` document with the tags unwrapped.
    """
    soup = BeautifulSoup(html, 'lxml')
    # find_all() returns a list computed up front, so modifying the tree
    # while looping over it is safe.
    for tag in soup.find_all(True):
        if tag.name in invalid_tags:
            # unwrap() replaces the tag with its own children. The previous
            # approach rebuilt the children as one string and inserted it
            # with replaceWith(), which bs4 treats as plain text: nested
            # elements ended up HTML-escaped (and wrapped in an extra
            # <html><body> from the recursive re-parse).
            tag.unwrap()
    return soup
def strip_ct(soup):
    """Detach page-chrome elements from *soup* and return the same object.

    Every element whose ``class`` matches one of the names below is removed
    from the tree with ``extract()``. The names look like blog-page widgets
    (article header bar, edit link, author message box) -- presumably CSDN
    markup; verify against the target site.

    Args:
        soup: A BeautifulSoup document or tag; it is modified in place.

    Returns:
        The same *soup* object, after the matching elements were removed.
    """
    unwanted_classes = (
        "article-bar-top",
        "href-article-edit slide-toggle",
        "person-messagebox",
    )
    for css_class in unwanted_classes:
        # Calling a soup/tag object is bs4 shorthand for find_all().
        for node in soup(class_=css_class):
            node.extract()
    return soup
def get_main(html):
    """Extract the article body from a full page and return it as HTML text.

    The element with class ``blog-content-box`` is located, cleaned with
    ``strip_ct`` and then has its ``<a>`` tags removed by ``strip_tags``.

    Args:
        html: Complete HTML source of the page.

    Returns:
        The cleaned article HTML as a string.

    Raises:
        ValueError: If the page contains no ``blog-content-box`` element
            (for example an error page or a site with different markup).
    """
    soup = BeautifulSoup(html, 'lxml')
    content = soup.find(attrs={'class': "blog-content-box"})
    if content is None:
        # Previously this fell through to strip_ct(None) and died with an
        # opaque "'NoneType' object is not callable".
        raise ValueError(
            'no element with class "blog-content-box" found in the page')
    cleaned = strip_ct(content)
    return str(strip_tags(str(cleaned), ['a']))
def html_pdf(html, output_path='data.pdf', wkhtmltopdf_path=None):
    """Render an HTML string to a PDF file using pdfkit / wkhtmltopdf.

    Args:
        html: HTML source to render.
        output_path: Destination PDF file. Defaults to ``data.pdf`` in the
            current directory, as before.
        wkhtmltopdf_path: Location of the wkhtmltopdf executable. When
            omitted, the standard Windows install location is used if it
            exists; otherwise pdfkit is left to locate the binary itself.

    Raises:
        OSError: Raised by pdfkit when the wkhtmltopdf executable cannot be
            found or the conversion fails.
    """
    import os

    default_windows_path = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
    if wkhtmltopdf_path is None and os.path.isfile(default_windows_path):
        wkhtmltopdf_path = default_windows_path

    options = {
        'page-size': 'A4',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ],
        # NOTE(review): these cookie values look like documentation
        # placeholders rather than real cookies -- confirm whether they can
        # be dropped.
        'cookie': [
            ('cookie-name1', 'cookie-value1'),
            ('cookie-name2', 'cookie-value2'),
        ],
        'no-outline': None
    }

    if wkhtmltopdf_path:
        config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path)
    else:
        # No explicit path: pdfkit searches for the executable on its own.
        config = pdfkit.configuration()
    pdfkit.from_string(html, output_path, options=options, configuration=config)
if __name__ == '__main__':
    # Command-line entry point: convert the single URL given as the first
    # argument into data.pdf.
    if len(sys.argv) < 2:
        # Show a usage hint instead of an IndexError traceback.
        sys.exit('usage: python %s <url>' % sys.argv[0])
    url = sys.argv[1]
    # Truncate any existing data.pdf first -- presumably so that a failed
    # run does not leave a stale PDF from an earlier URL behind.
    with open('data.pdf', 'w') as f:
        f.write('')
    # Example URL: https://blog.csdn.net/u013803499/article/details/82877993
    html_pdf(get_main(get_html(url)))
结果的部分截图