热点时事新闻文章采集:
- 仅下载当天最新、热点的时事新闻;
- 不同网站的新闻保存在不同文件夹中,并记录每篇新闻的来源、标题、发布时间、下载时间、url地址等信息;
爬虫初始种子:新浪(news.sina.com.cn)、搜狐(news.sohu.com)、凤凰(news.ifeng.com)、网易(news.163.com)、百度(news.baidu.com)
import requests
import bs4
import re
import datetime
# --- Sohu (news.sohu.com) ---------------------------------------------------
# Fetches the Sohu news front page, follows the links found inside the
# "block4" container and saves every article as "<title>.txt" in the current
# working directory. Each file holds the article paragraphs followed by the
# crawl time, publish time, source, title and URL.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}
url = 'http://news.sohu.com/'
# A timeout keeps the crawl from hanging forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)
print(response.status_code)
response.encoding = 'utf-8'
soup = bs4.BeautifulSoup(response.text, 'html.parser')

# Locate the container(s) that hold the news headlines.
p1 = soup.findAll('div', {'id': 'block4'})

# Gather the <a> tags of every matched container. Starting from an empty list
# means a missing container yields "no links" instead of a NameError.
anchors = []
for each in p1:
    anchors.extend(each.select('a'))
print(anchors)

# The hrefs are extracted from the serialized tags; only hrefs that are
# followed by another attribute match (note the trailing space in the pattern).
pattern = re.compile(r'href="(.*?)" ')
prefix = 'http://news.sohu.com'
ls = [prefix + link for link in pattern.findall(str(anchors))]
print(ls)

# Iterate over the links directly: no pre-sized bookkeeping lists, so the
# number of links is not capped.
for article_url in ls:
    print(article_url)
    try:
        response = requests.get(article_url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        # One unreachable article must not abort the remaining downloads.
        print('skipped %s: %s' % (article_url, exc))
        continue
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, 'html.parser')

    title_tag = soup.find('h1')
    time_tag = soup.find('span', class_='time')
    source_tag = soup.find('span', {'data-role': 'original-link'})
    if title_tag is None or time_tag is None or source_tag is None:
        # Pages without these elements cannot be recorded with full metadata.
        print('skipped %s: title, time or source element not found' % article_url)
        continue

    # Keep only CJK characters so the title is safe to use as a file name.
    title = ''.join(filter(lambda x: '\u4e00' <= x <= '\u9fa5', title_tag.text.strip()))
    print(title)
    data = time_tag.text
    print(data)
    source = source_tag.text.strip()

    # Collect the text of every paragraph; get_text() keeps the complete
    # paragraph even when it contains nested inline tags or line breaks.
    paragraphs = []
    for container in soup.findAll('article', {'class': 'article'}):
        paragraphs.extend(p.get_text() for p in container.select('p'))
    print(paragraphs)

    time_str = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # "with" guarantees the file is closed even if a write fails.
    with open(r'%s.txt' % title, 'w', encoding='UTF-8') as file3:
        for paragraph in paragraphs:
            file3.write(paragraph + '\n')
        file3.write("爬取时间:" + time_str + "\n")
        file3.write("发布时间: " + data + " 来源: " + source + " 标题: " + title + " 网址: " + article_url + "\n")
import requests
import bs4
import re
import datetime
# --- ifeng (news.ifeng.com) -------------------------------------------------
# Fetches the ifeng news front page, follows the protocol-relative links found
# inside the "index_content_RQk8t" container(s) and saves every article as
# "<title>.txt" in the current working directory. Each file holds the article
# paragraphs followed by the crawl time, publish time, source, title and URL.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}
url = 'https://news.ifeng.com/'
# A timeout keeps the crawl from hanging forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)
print(response.status_code)
response.encoding = 'utf-8'
soup = bs4.BeautifulSoup(response.text, 'html.parser')

# Locate the container(s) that hold the news headlines.
p1 = soup.findAll('div', {'class': 'index_content_RQk8t'})
print(p1)

# Gather the <a> tags of every matched container. Starting from an empty list
# means a missing container yields "no links" instead of a NameError.
anchors = []
for each in p1:
    anchors.extend(each.select('a'))
print(anchors)

# Only protocol-relative hrefs ("//host/path") followed by another attribute
# match; the leading "//" is dropped and replaced by an explicit scheme.
pattern = re.compile(r'href="//(.*?)" ')
prefix = 'http://'
ls = [prefix + link for link in pattern.findall(str(anchors))]
print(ls)

# Characters that cannot appear in a file name on Windows and/or POSIX.
invalid_filename_chars = re.compile(r'[\\/:*?"<>|]')

# Iterate over the links directly: no pre-sized bookkeeping lists, so the
# number of links is not capped.
for article_url in ls:
    print(article_url)
    try:
        response = requests.get(article_url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        # One unreachable article must not abort the remaining downloads.
        print('skipped %s: %s' % (article_url, exc))
        continue
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, 'html.parser')

    title_tag = soup.find('h1', class_='index_topic_5hyUE')
    time_tag = soup.find('div', class_='index_timeBref_20hzr')
    source_tag = soup.find('div', class_='index_sourceTitleText_wlTy-')
    if title_tag is None or time_tag is None or source_tag is None:
        # Pages without these elements cannot be recorded with full metadata.
        print('skipped %s: title, time or source element not found' % article_url)
        continue

    title = title_tag.get_text()
    print(title)
    data = time_tag.text
    print(data)
    source = source_tag.text
    print(source)

    # Collect the text of every paragraph; get_text() keeps the complete
    # paragraph even when it contains nested inline tags or line breaks.
    paragraphs = []
    for container in soup.findAll('div', {'class': 'index_main_content_j-HoG'}):
        paragraphs.extend(p.get_text() for p in container.select('p'))
    print(paragraphs)

    # The raw title may contain characters such as "/" or "?" that make
    # open() fail, so only the file name is sanitized; the metadata line
    # below still records the original title.
    file_name = invalid_filename_chars.sub('_', title)
    time_str = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # "with" guarantees the file is closed even if a write fails.
    with open(r'%s.txt' % file_name, 'w', encoding='UTF-8') as file3:
        for paragraph in paragraphs:
            file3.write(paragraph + '\n')
        file3.write("爬取时间:" + time_str + "\n")
        file3.write("发布时间: " + data + " 来源: " + source + " 标题: " + title + " 网址: " + article_url + "\n")
import requests
import bs4
import re
import datetime
# --- NetEase (news.163.com) -------------------------------------------------
# Fetches the NetEase news front page, follows the article links found inside
# the "news_default_news" container(s) and saves every article as
# "网易/<title>.txt". Each file holds the article paragraphs followed by the
# crawl time, publish time, source, title and URL. Articles that cannot be
# downloaded, parsed or written are skipped with a message.
import os

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}
url = 'https://news.163.com/'
# A timeout keeps the crawl from hanging forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)
print(response.status_code)
response.encoding = 'utf-8'
soup = bs4.BeautifulSoup(response.text, 'html.parser')

# Locate the container(s) that hold the news headlines.
p1 = soup.findAll('div', {'class': 'news_default_news'})

# Gather the <a> tags of every matched container. Starting from an empty list
# means a missing container yields "no links" instead of a NameError.
anchors = []
for each in p1:
    anchors.extend(each.select('a'))
print(anchors)

# Absolute URLs ending in "ml" (e.g. ".html"). The scheme class is [a-zA-Z];
# the former "A-z" range also accepted the punctuation between "Z" and "a".
pattern = re.compile(r'[a-zA-Z]+://[^\s]*ml')
ls = pattern.findall(str(anchors))

# The output folder must exist before open() is called; without this every
# article would fail with FileNotFoundError on a fresh checkout.
os.makedirs('网易', exist_ok=True)

# Iterate over the links directly: no pre-sized bookkeeping lists, so the
# number of links is not capped.
for article_url in ls:
    print(article_url)
    try:
        response = requests.get(article_url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        # Best effort: report the failure and move on to the next article.
        print('skipped %s: %s' % (article_url, exc))
        continue
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, 'html.parser')

    title_tag = soup.find('h1', class_='post_title')
    div_tag = soup.find('div', class_='post_info')
    source_tag = div_tag.find('a') if div_tag is not None else None
    if title_tag is None or div_tag is None or source_tag is None:
        # Pages without these elements cannot be recorded with full metadata.
        print('skipped %s: title, info or source element not found' % article_url)
        continue

    # Keep only CJK characters so the title is safe to use as a file name.
    title = ''.join(filter(lambda x: '\u4e00' <= x <= '\u9fa5', title_tag.get_text().strip()))
    print(title)
    # The first space-separated token of the info line is used as publish time.
    data = div_tag.get_text(strip=True).split(' ')[0]
    print(data)
    source = source_tag.text
    print(source)

    # Collect the text of every paragraph; get_text() keeps the complete
    # paragraph even when it contains nested inline tags or line breaks.
    paragraphs = []
    for container in soup.findAll('div', {'class': 'post_body'}):
        paragraphs.extend(p.get_text() for p in container.select('p'))
    print(paragraphs)

    time_str = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    try:
        # "with" guarantees the file is closed even if a write fails.
        with open(r'网易/%s.txt' % title, 'w', encoding='UTF-8') as file3:
            for paragraph in paragraphs:
                file3.write(paragraph + '\n')
            file3.write("爬取时间:" + time_str + "\n")
            file3.write("发布时间: " + data + " 来源: " + source + " 标题: " + title + " 网址: " + article_url + "\n")
    except OSError as exc:
        # Keep the best-effort behaviour, but say why the article was lost.
        print('skipped %s: cannot write file: %s' % (article_url, exc))
import re
import requests
from bs4 import BeautifulSoup
import datetime
def kaishi():
    """Crawl the Sina news front page and dump the headlines into shiyan.txt.

    The links inside the ``ct_t_01`` container(s) of news.sina.com.cn are
    printed and written to ``shiyan.txt`` as ``<link text>:<href>`` lines.
    Every link is then fetched and, per article, the publish date, the
    captured paragraphs and the crawl time are printed and appended to the
    same file. Paragraph text is written exactly as captured by the regex, so
    inline markup inside a paragraph is kept.

    Links that cannot be requested (network error, timeout, href that is not
    an absolute URL) are skipped with a message instead of aborting the crawl.

    Raises:
        requests.RequestException: if the front page itself cannot be fetched.
        OSError: if ``shiyan.txt`` cannot be opened or written.
    """
    url = 'https://news.sina.com.cn/'
    head = {
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36"
    }
    # A timeout keeps the crawl from hanging forever on an unresponsive server.
    res = requests.get(url, headers=head, timeout=10)
    res.encoding = 'UTF-8'
    content = BeautifulSoup(res.text, "html.parser")
    data = str(content.find_all('div', attrs={'class': 'ct_t_01'}))

    # Each match is an (href, link text) pair.
    findurl = re.compile('<a.*?href="(.*?)".*?target="_blank">(.*?)</a>')
    Geturl = findurl.findall(data)
    for v in Geturl:
        print(v[1] + ":" + v[0])

    # Compiled once here instead of once per article.
    date = re.compile('<span class="date">(.*?)</span>')
    txt = re.compile('<p>(.*?)</p>')

    # Explicit UTF-8 so Chinese text can be written regardless of the
    # platform's default encoding; "with" closes the file even on error.
    with open('shiyan.txt', 'w', encoding='UTF-8') as fileObject:
        for ip in Geturl:
            fileObject.write(ip[1] + ":" + ip[0])
            fileObject.write("\n")

        for v in Geturl:
            sonurl = v[0]
            try:
                sondata = requests.get(url=sonurl, headers=head, timeout=10)
            except requests.RequestException as exc:
                # One bad link must not abort the remaining downloads.
                print("skipped %s: %s" % (sonurl, exc))
                continue
            sondata.encoding = 'UTF-8'
            soup = BeautifulSoup(sondata.text, features="html.parser")

            item = str(soup.find_all("div", attrs={'class': 'article'}))
            date_spans = str(soup.find_all("span", attrs={'class': 'date'}))
            newtime = date.findall(date_spans)
            gettxt = txt.findall(item)

            # Publish date(s) of the article, concatenated.
            new_str = ''.join(newtime)
            print(new_str)
            fileObject.write(new_str)
            fileObject.write("\n")

            for x in gettxt:
                fileObject.write(x)
                fileObject.write("\n")
                print(x)

            # Crawl time, recorded per article.
            curr_time = datetime.datetime.now()
            time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d %H:%M:%S')
            print(time_str)
            fileObject.write(time_str)
            fileObject.write("\n")
            print("\n")


if __name__ == "__main__":
    kaishi()