import requests as req
import re
from bs4 import BeautifulSoup
sougou_url = "http://weixin.sogou.com/weixin?type=1&query=新闻哥"
r1 = req.get(sougou_url)
soup=BeautifulSoup(r1.text)
data = str(soup.find_all(uigs="account_image_0"))
print(data)
reg_str = r'href="(.*?)"'
pattern = re.compile(reg_str,re.DOTALL)
items = re.findall(pattern,data)
xinwenge_url = items[0].replace('amp;','')
def get_xinwenge_content(link):
    """Fetch one article page and return the concatenated text of its body.

    link: absolute article URL (mp.weixin.qq.com).
    Returns the visible text of every element whose class attribute is
    "rich_media_content " (the trailing space is the literal string the
    original code matched - kept verbatim), or "" when the page has no
    <body> or no matching elements.
    """
    r2 = req.get(link)
    body = BeautifulSoup(r2.text, "html.parser").body
    # Error pages may have no <body>; guard instead of raising TypeError.
    if body is None:
        return ""
    # join is linear; repeated += on a string is quadratic.
    return "".join(node.get_text() for node in body(class_="rich_media_content "))
# --- Step 2: download the account profile page and harvest article links ---
r = req.get(xinwenge_url)
print(r.status_code)
data = r.text
soup = BeautifulSoup(data, "html.parser")
body = str(soup.body)
print(body)
print(type(body))

# Article metadata is embedded as JSON inside a <script> tag; extract
# (content_url, copyright_stat, title) triples with regexes rather than a
# JSON parser because the surrounding markup is not well-formed JSON.
reg_str = r'"author".*?"content_url":"(.*?)".*?"copyright_stat":(.*?),.*?"title":"(.*?)"}'
pattern = re.compile(reg_str, re.DOTALL)
items = re.findall(pattern, data)
print(type(items))
print("----------------")
# The headline article lives under "app_msg_ext_info" and needs a slightly
# different pattern.
reg_str2 = r'"app_msg_ext_info".*?"content_url":"(.*?)","copyright_stat":(.*?),.*?"is_multi".*?"subtype":9,"title":"(.*?)"},"comm'
pattern2 = re.compile(reg_str2, re.DOTALL)
items2 = re.findall(pattern2, data)

for item in items + items2:
    title = item[2]
    website = xinwenge_url  # fixed typo: was 'wibsite'
    # NOTE(review): hard-coded; copyright_stat (item[1]) is captured but
    # never checked - confirm whether it should drive this flag.
    is_original = True
    # content_url is relative and HTML-escaped; un-escape '&amp;' -> '&'.
    url = "https://mp.weixin.qq.com" + str(item[0]).replace('amp;', '')
    content = get_xinwenge_content(url)
    author = "新闻哥"
    print(url)
# Scrapes WeChat official-account articles via Sogou's official-account search.
# Adapted from: blog.csdn.net/lk7688535/article/details/78074074