This scrapes finance news from the ifeng.com (Phoenix New Media) site, implemented as a single function, and can fetch everything. Because the volume of content is large the crawl takes a long time and the server may start blocking requests; to guard against that, give the time.sleep() calls longer delays.
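The throttling here is plain rate limiting: pause between requests so the server never sees a burst of traffic. As a minimal sketch of the same idea (the helper name polite_get and the delay values are illustrative, not part of the original script), the pause can be randomized and combined with a simple backoff on failure:

import random
import time

import requests

def polite_get(url, session=None, min_delay=4.0, max_delay=8.0, retries=3):
    # Hypothetical helper: randomized pause before each request, plus
    # progressively longer waits if the server starts refusing us.
    session = session or requests.Session()
    for attempt in range(retries):
        time.sleep(random.uniform(min_delay, max_delay))
        resp = session.get(url, timeout=15)
        if resp.status_code == 200:
            return resp
        time.sleep(2 ** attempt * 10)   # back off: 10s, 20s, 40s, ...
    resp.raise_for_status()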

import requests
from selenium import webdriver
import time

def grasp(urlT):
    # Path to the local chromedriver executable (Selenium 3 style)
    driver = webdriver.Chrome(r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
    resAll = []   # all records collected so far
    urls = []     # links discovered inside "nested" list pages, fetched in a second pass
    res = requests.get(urlT)
    data = res.json()['data']   # parse the JSON response once instead of on every access
    for i in range(0, 29):
        print(f'News item {i+1}: start')
        rest = {}   # a fresh dict per item, so appended records are not all the same object
        print(data[i]['title'])
        try:
            print(data[i]['newsTime'])
        except KeyError:
            print('None')
        print(data[i]['source'])
        rest['title'] = data[i]['title']
        try:
            rest['newsTime'] = data[i]['newsTime']   # some items carry no timestamp
        except KeyError:
            rest['newsTime'] = 'None'
        rest['source'] = data[i]['source']
        url = data[i]['url']
        rest['url'] = url

        try:
            driver.get(url)
            time.sleep(4)
            contend = driver.find_element_by_class_name('text-3zQ3cZD4').text
            rest['contend'] = str(contend)
            print(f'News item {i+1}: done')
            time.sleep(6)
        except Exception:
            # The URL points at a "nested" list page rather than a single article:
            # collect the links it contains and fetch them in the second pass below.
            rest['contend'] = 'nested'
            time.sleep(6)
            l = driver.find_elements_by_xpath("//p[@class='text-3YbAxaNR']")    # how many link paragraphs there are
            s = driver.find_elements_by_xpath("//p[@class='text-3YbAxaNR']/a")  # all links on the current page
            for j in range(0, len(l)):
                ss = s[j].get_attribute('href')   # href is already a plain string (or None)
                try:
                    urls.append(str(ss))
                    print(urls)
                except Exception:
                    print(driver.find_element_by_class_name('topic-3bY8Hw-9').text)   # fall back to printing the headline
        resAll.append(rest)
        with open('./news.txt', 'a+', encoding='utf-8') as f:
            try:
                f.write(''.join(resAll[i].values()) + '\n')
            except Exception:
                print('write failed')

    resAll.clear()
    print(urls)
    for k in range(0, len(urls)):
        try:
            driver.get(urls[k])
            # time.sleep(3)
            rest = {}   # again, one fresh dict per record
            rest['title1'] = driver.find_element_by_class_name('topic-3bY8Hw-9').text
            rest['source1'] = driver.find_element_by_class_name('source-2pXi2vGI').text
            rest['newsTime1'] = driver.find_element_by_xpath('//p[@class="time-hm3v7ddj"]/span').text
            rest['contend1'] = driver.find_element_by_class_name('text-3zQ3cZD4').text
            resAll.append(rest)
            time.sleep(4)
            with open('./news.txt', 'a+', encoding='utf-8') as f:
                f.write(''.join(resAll[k].values()) + '\n')
        except Exception:
            print('Too much content; the server refused the request')
    driver.quit()   # release the browser once the crawl finishes


url = "https://shankapi.ifeng.com/spring/finance/index/newInfoIndex/75219"   # the finance channel API
t = grasp(url)
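One caveat: the locator calls above (find_element_by_class_name, find_elements_by_xpath) belong to the Selenium 3 API and were removed in Selenium 4. Under Selenium 4 the equivalent calls use By locators plus a Service object for the driver path; a minimal sketch of the mapping, reusing the same class names as above (the article URL is a placeholder):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

service = Service(r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
driver = webdriver.Chrome(service=service)

url = "https://finance.ifeng.com"   # placeholder: any article URL collected from the feed
driver.get(url)
contend = driver.find_element(By.CLASS_NAME, 'text-3zQ3cZD4').text        # was find_element_by_class_name
links = driver.find_elements(By.XPATH, "//p[@class='text-3YbAxaNR']/a")   # was find_elements_by_xpath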

Extraction from nested pages is already implemented, and the script can be used as-is.
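Each record lands in news.txt as one line of concatenated field values, so reading the results back is just a matter of iterating over lines; a minimal check:

with open('./news.txt', encoding='utf-8') as f:
    for line in f:
        print(line.strip()[:80])   # first 80 characters of each stored record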


Reposted from www.cnblogs.com/superSmall/p/11528066.html