这是帮一个朋友做的旅游方面的分析:需要爬取游客的所有游记笔记文本,并保存到一个 txt 文件中。
源码如下:
import requests
from lxml import etree
from bs4 import BeautifulSoup

# Browser-like User-Agent so Ctrip does not reject the requests.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
}

# Output file. BUG FIX: the original used 'G:write_data.txt' (missing
# backslash), while the word-cloud script reads 'G:\write_data.txt' —
# i.e. the two scripts pointed at different files.
filename = 'G:\\write_data.txt'

# Walk pages 1..80 of the travel-note search results for the query
# "避暑旅游" (URL-encoded in the query string below).
for num in range(1, 81):
    print(num)
    url = ('https://you.ctrip.com/searchsite/travels/'
           '?query=%e9%81%bf%e6%9a%91%e6%97%85%e6%b8%b8'
           '&isAnswered=&isRecommended=&publishDate=365'
           '&PageNo=' + str(num))
    # Send the UA header on the list page too (the original omitted it here).
    list_html = requests.get(url, headers=headers).text
    tree = etree.HTML(list_html)
    items = tree.xpath('/html/body/div[2]/div[2]/div[2]/div/div[1]/ul/li')
    for item in items:
        links = item.xpath('./dl/dt/a/@href')
        if not links:
            # List entry without a note link — skip instead of crashing.
            continue
        href_url = 'https://you.ctrip.com' + links[0]
        print(href_url)
        note_html = requests.get(url=href_url, headers=headers).text
        soup = BeautifulSoup(note_html, "html.parser")
        content = soup.find(attrs={"class": "ctd_content"})
        if content is None:
            # Note body missing (deleted note / layout change) — skip it.
            continue
        note_text = content.get_text().replace("\n", "")
        # Append mode: every note accumulates into the single output file,
        # and an interrupted crawl keeps the pages fetched so far.
        # (The redundant f.close() inside the original `with` was removed —
        # the context manager already closes the file.)
        with open(filename, 'a', encoding='utf-8') as f:
            f.write(note_text)

print("获取完毕!")
爬取结果:
词云分析:
# coding: utf-8
# Build a word cloud from the crawled travel notes.
from wordcloud import WordCloud
import cv2
import jieba
import matplotlib.pyplot as plt  # moved to the top with the other imports

# Read the text produced by the crawler script.
with open(r'G:\write_data.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Chinese has no word boundaries: jieba segments the text, and WordCloud
# expects space-separated tokens.
cut_text = " ".join(jieba.cut(text))

# Mask image that shapes the word cloud.
# BUG FIX: the original path 'G:1234.jpg' was missing the backslash.
color_mask = cv2.imread(r'G:\1234.jpg')
if color_mask is None:
    # cv2.imread silently returns None on a bad path, which would only
    # surface later as a confusing WordCloud error — fail fast instead.
    raise FileNotFoundError(r'mask image G:\1234.jpg not found')

cloud = WordCloud(
    # A CJK font is required, otherwise Chinese characters render as boxes.
    # BUG FIX: the original path started with a space
    # (" C:\\Windows\\Fonts\\STXINGKA.TTF"), which makes the font unloadable.
    font_path='C:\\Windows\\Fonts\\STXINGKA.TTF',
    # Background color of the rendered image.
    background_color='white',
    # Word-cloud silhouette follows the mask image.
    mask=color_mask,
    # Cap on the number of words drawn.
    max_words=2000,
    # Cap on the largest font size.
    max_font_size=40,
)

w_cloud = cloud.generate(cut_text)
w_cloud.to_file('cloud.jpg')

plt.imshow(w_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()
词云结果: