import requests
from selenium import webdriver
import time

# Local path of the chromedriver binary that drives the automated browser.
CHROMEDRIVER_PATH = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
NEWS_FILE = './news.txt'


def _save_record(record):
    """Append the concatenated values of *record* to NEWS_FILE as one line.

    Failures are reported, not raised, so one bad record does not abort the
    whole scrape (same best-effort behavior as the original script).
    """
    try:
        with open(NEWS_FILE, 'a+', encoding='utf-8') as f:
            f.write(''.join(record.values()) + '\n')
    except Exception:
        print('写入失败')


def grasp(urlT, count=29):
    """Scrape news items from the ifeng JSON feed at *urlT*.

    For each of the first *count* feed entries (default 29, as before) the
    article page is opened in Chrome via Selenium and its body text is read.
    Pages without a plain text body are treated as "nested" index pages: the
    article links they contain are collected and scraped in a second pass.
    Every record is appended to NEWS_FILE; returns None.
    """
    driver = webdriver.Chrome(CHROMEDRIVER_PATH)
    try:
        # Fetch and parse the feed ONCE (the original re-called res.json()
        # for every single field, re-parsing the response each time).
        data = requests.get(urlT).json()['data']
        nested_urls = []  # links harvested from "nested" index pages

        for i in range(min(count, len(data))):
            item = data[i]
            # Fresh dict per article — the original mutated one shared dict,
            # so nested-pass records inherited stale keys from the main pass.
            record = {}
            print(f'第{i+1}条新闻开始')
            print(item['title'])
            record['title'] = item['title']
            try:
                record['newsTime'] = item['newsTime']  # some entries carry no timestamp
            except KeyError:
                record['newsTime'] = 'None'
            print(record['newsTime'])
            print(item['source'])
            record['source'] = item['source']
            record['url'] = item['url']

            try:
                driver.get(item['url'])
                time.sleep(4)  # let the page render before reading it
                record['contend'] = str(driver.find_element_by_class_name('text-3zQ3cZD4').text)
                print(f'第{i+1}条新闻结束')
            except Exception:
                # No plain body: a nested index page — harvest its article links.
                record['contend'] = '嵌套'
                for anchor in driver.find_elements_by_xpath("//p[@class='text-3YbAxaNR']/a"):
                    href = anchor.get_attribute('href')
                    if href:
                        nested_urls.append(str(href))
                print(nested_urls)
            time.sleep(6)  # throttle between page loads

            # BUG FIX: the original did resAll.clear() each pass and then wrote
            # resAll[i], so every i > 0 raised IndexError inside a bare except
            # and the record was silently dropped ("写入失败").
            _save_record(record)

        print(nested_urls)
        for nested in nested_urls:
            try:
                driver.get(nested)
                record = {}  # again one fresh dict per nested article
                record['title1'] = driver.find_element_by_class_name('topic-3bY8Hw-9').text
                record['source1'] = driver.find_element_by_class_name('source-2pXi2vGI').text
                record['newsTime1'] = driver.find_element_by_xpath('//p[@class="time-hm3v7ddj"]/span').text
                record['contend1'] = driver.find_element_by_class_name('text-3zQ3cZD4').text
                time.sleep(4)
                _save_record(record)
            except Exception:
                print('内容太多,服务器禁止')
    finally:
        driver.quit()  # the original leaked the Chrome/driver processes


if __name__ == '__main__':
    url = "https://shankapi.ifeng.com/spring/finance/index/newInfoIndex/75219"  # finance feed API
    grasp(url)
# 已经实现了嵌套网页信息的获取，可直接使用
# (Scraping of nested article pages is implemented; the script can be run as-is.)