"""Scrape tech news articles from tech.163.com.

For each article the script prints publish time, source, top keywords and the
full body text, and appends the body to a local text file.
"""
import re
from collections import Counter
from datetime import datetime

import jieba
import requests
from bs4 import BeautifulSoup


def getNewsDetail(newsUrl):
    """Fetch one article page, print its metadata/keywords/body and append the body to disk.

    Side effects: prints to stdout and appends ``content`` to D:\\python/test.txt.
    """
    resd = requests.get(newsUrl)
    resd.encoding = 'gb2312'  # 163.com article pages are GB-encoded
    soupd = BeautifulSoup(resd.text, 'html.parser')
    content = soupd.select('#endText')[0].text
    info = soupd.select('.post_time_source')[0].text
    # Raw strings for regex; the page's timestamp looks like "2018-05-21 10:30:00",
    # which matches the strptime format below.
    date = re.search(r'(\d{4}.\d{2}.\d{2}\s\d{2}.\d{2}.\d{2})', info).group(1)
    dateTime = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
    sources = re.search(r'来源:\s*(.*)', info).group(1)
    TopWords = getTopWords(content)
    print('发布时间:{0}\n来源:{1}'.format(dateTime, sources))
    # BUG FIX: getTopWords returns (word, count) pairs; the original printed the
    # raw tuples here instead of the keywords themselves.
    print('关键词:{}、{}、{}、{}、{}'.format(
        TopWords[0][0], TopWords[1][0], TopWords[2][0], TopWords[3][0], TopWords[4][0]))
    print(content)
    print('---------------------------')
    # 'with' guarantees the file handle is closed even if a write fails.
    with open("D:\python/test.txt", 'a', encoding='utf8') as fo:
        fo.write(content)
        fo.write('\n')


def getTopWords(content):
    """Segment Chinese text with jieba and return (word, count) pairs, most frequent first."""
    # Blank out punctuation/noise characters before segmentation.
    noise = '''一!“”,。?;’"',.、:\n'''
    for ch in noise:
        content = content.replace(ch, ' ')
    wordlist = list(jieba.cut(content))
    # Stop words: function words, whitespace artifacts and markup leftovers.
    exclude = {'这', '\u3000', '\r', '\xa0', '时候', '对', '上', '与', '等', '不', '',
               '没有', '很多', '的', '大', '出来', '_', '到', ' ', '将', '在', '是', '了',
               '一', '还', '也', '《', '》', '(', ')', '和', '我', '我们', '其', '能够',
               '以', '个', '短', '中', '不是'}
    # Counter replaces the original O(n^2) wordlist.count() loop with one O(n) pass;
    # most_common() returns all (word, count) pairs sorted by count descending,
    # matching the original's sorted dict-items return value.
    counts = Counter(w for w in wordlist if w not in exclude)
    return counts.most_common()


def getListPage(listUrl):
    """Fetch a listing page, print title/link of its first article and scrape it."""
    res = requests.get(listUrl)
    res.encoding = 'gbk'  # listing pages use GBK
    soup = BeautifulSoup(res.text, 'html.parser')
    for new in soup.select('#news-flow-content')[0].select('li'):
        url = new.select('a')[0]['href']
        title = new.select('a')[0].text
        print('标题:{0}\n链接:{1}'.format(title, url))
        getNewsDetail(url)
        # NOTE(review): only the first article of each page is processed; this
        # 'break' looks like a debugging leftover — confirm before removing.
        break


if __name__ == '__main__':
    # Guarding the driver keeps importing this module side-effect free while
    # leaving script execution identical.
    listUrl = 'http://tech.163.com/internet/'
    getListPage(listUrl)
    for i in range(2, 10):
        listUrl = 'http://tech.163.com/special/it_2016_%02d/' % i
        getListPage(listUrl)
# 爬虫大作业02 — adapted from www.cnblogs.com/wxyplu/p/9069124.html