# 一、首先爬取所有的申报通知的网址链接 (Part 1: collect the URLs of all application-notice pages)
import requests
import re
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
# Part 1: page through the department's article list and collect the title and
# URL of every article whose title contains both '申报' and '通知'
# (application + notice), then save them to 标题链接.csv for part 2.
browser = webdriver.Chrome('E:/python/chromedriver.exe')
list_url = 'http://jgxy.jhc.cn/jxkysy/list.htm'
browser.get(list_url)

titles = []       # matching article titles
links = []        # corresponding article URLs
KEYWORD_1 = '申报'
KEYWORD_2 = '通知'

try:
    # The list has 33 pages; jump to each one via the page-number input box.
    for page in range(1, 34):
        page_input = browser.find_element_by_class_name('pageNum')
        # BUG FIX: send_keys() appends to the existing text, so without clear()
        # the box accumulates digits ("1", "12", "123", ...) and jumps to the
        # wrong page on every iteration after the first.
        page_input.clear()
        page_input.send_keys(page)
        browser.find_element_by_class_name('pagingJump').click()
        # BUG FIX: wait for the new page's article list to be present instead
        # of reading the DOM immediately after the click (WebDriverWait and ec
        # were imported but never used).
        WebDriverWait(browser, 10).until(
            ec.presence_of_element_located((By.CLASS_NAME, 'Article_Title'))
        )
        for item in browser.find_elements_by_class_name('Article_Title'):
            href = item.find_element_by_tag_name('a').get_attribute('href')
            text = item.text
            if KEYWORD_1 in text and KEYWORD_2 in text:
                titles.append(text)
                links.append(str(href))
finally:
    # BUG FIX: always release the browser/chromedriver process.
    browser.quit()

the_url = pd.DataFrame({
    '标题': titles,
    '链接': links,
})
# index=False: don't write a junk row-number column; utf-8-sig adds a BOM so
# the Chinese headers open correctly in Excel.
the_url.to_csv('标题链接.csv', index=False, encoding='utf-8-sig')
# 二、读取刚刚生成的网址链接 CSV，进行爬取 (Part 2: read the CSV of links generated above and scrape each page)
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
# Part 2: read the link list produced in part 1, fetch each notice page, and
# extract its title, publish date, and body text into 作业.csv.
links_df = pd.read_csv('标题链接.csv')

title_list = []    # extracted page titles
dates = []         # extracted publish dates
the_text = []      # extracted body text
the_url = []       # URLs that were successfully scraped

for link in links_df['链接']:
    time.sleep(0.5)  # be polite to the server: throttle to ~2 requests/sec
    try:
        # BUG FIX: a timeout prevents one unresponsive server from hanging
        # the whole run forever.
        response = requests.get(link, timeout=10)
        # BUG FIX: raise on 4xx/5xx instead of parsing an error page as content.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')
        # .find() returns None when the element is missing (e.g. a page that
        # requires a login); .text then raises AttributeError, caught below.
        page_title = soup.find(name='h1', class_='actitle').text
        publish_date = soup.find(name='span', class_='Article_PublishDate').text
        body_text = soup.find(name='div', class_='Article_Content').text
    except (requests.RequestException, AttributeError):
        # BUG FIX: catch only the expected failures instead of a bare
        # `except:` that also swallows KeyboardInterrupt/SystemExit.
        print('需要权限的网址:' + str(link))
        continue
    title_list.append(page_title)
    dates.append(publish_date)
    the_text.append(body_text)
    the_url.append(link)

work = pd.DataFrame({
    '链接': the_url,
    '标题': title_list,
    '日期': dates,
    '正文': the_text,
})
# index=False: no junk row-number column; utf-8-sig so Excel renders the
# Chinese text correctly.
work.to_csv('作业.csv', index=False, encoding='utf-8-sig')
print(work)