之前用 XPath 来匹配内容页面，匹配到的都是纯文字，遇到图片还需要特殊处理。有时候需要采集新闻，带上原来的部分样式可以更好地进行二次处理。
"""Fetch an article page and print the raw inner HTML of its content block.

Unlike a text-only XPath query, the regular expression keeps the original
markup (images, inline tags) found inside ``<div class="content">``.
"""
import re
from typing import Optional

url = 'https://www.qiushibaike.com/article/119998177'
# Alternative sample page:
# url = 'http://www.cnyifeng.net/news/show-469.html'
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"}

# Match the markup inside the target tag. ``[\s\S]`` also matches newlines,
# and the non-greedy ``*?`` stops at the first ``</div>``, so a content block
# that itself contains a nested ``<div>`` is cut off at that inner close tag.
pattern = re.compile(r'<div class="content">([\s\S]*?)</div>')


def fetch_html(page_url, request_headers=None, timeout=10):
    """Download ``page_url`` and return its body decoded as UTF-8.

    Args:
        page_url: Address of the page to download.
        request_headers: Optional mapping of HTTP headers to send.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as a string.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
    """
    # Imported here so that extract_content() stays usable (and importable)
    # on machines where the third-party ``requests`` package is missing.
    import requests

    response = requests.get(page_url, headers=request_headers, timeout=timeout)
    # Fail loudly on an error page instead of trying to parse it.
    response.raise_for_status()
    response.encoding = 'utf-8'
    return response.text


def extract_content(html_str: str) -> Optional[str]:
    """Return the inner HTML of the first ``<div class="content">`` block.

    Args:
        html_str: Full HTML source of the page.

    Returns:
        The captured markup exactly as it appears in the page (surrounding
        whitespace included), or ``None`` when no such block exists.
    """
    match = pattern.search(html_str)
    return match.group(1) if match else None


def main():
    """Fetch the configured page and print its content block."""
    html_str = fetch_html(url, headers)
    content = extract_content(html_str)
    if content is None:
        # Exit with a readable message rather than an IndexError traceback.
        raise SystemExit('No <div class="content"> block found in ' + url)
    print(content)


if __name__ == "__main__":
    main()