一、AllCitiesLink (links to all cities)
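The first script requests the bendibao home page with requests, parses it with lxml, and walks every dl block under the city-list div: the dt text is the province name, and each dd/a gives a city name and its link. Every province / city / link triple is numbered, printed, and appended to AllCitiesLink.txt.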
Source code
import requests
from lxml import etree
import os

# Target page and a mobile User-Agent header
url = "http://www.bendibao.com/index.htm"
user_agent = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Mobile Safari/537.36"}
all_city_link_file_path = "D:\\SpringBear\\Code\\spider\\data\\AllCitiesLink.txt"
result_count = 0

# Fetch and parse the home page
response = requests.get(url, headers=user_agent)
content = response.content.decode('utf8')
e_obj = etree.HTML(content)

# Start from an empty output file
if os.path.exists(all_city_link_file_path):
    os.remove(all_city_link_file_path)
file = open(all_city_link_file_path, 'a', encoding='utf8')

# Each <dl> under div.city-list holds one province (<dt>) and its cities (<dd>/<a>)
divs = e_obj.xpath("//div[@class='city-list']/dl")
for div in divs:
    province_name = div.xpath("./dt/text()")
    city_list = div.xpath("./dd/a/text()")
    city_url_list = div.xpath("./dd/a/@href")
    for i in range(len(city_list)):
        result_count += 1
        city_url_str = str(result_count) + "、省份:{:<10s}城市:{:<10s}链接:{}".format(
            province_name[0], city_list[i], city_url_list[i])
        print(city_url_str)
        file.write(city_url_str + "\n")
file.close()
Result
二、CitySearchLink (search-result page links for provincial capital cities)
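The second script combines requests/lxml (to read the province and city names from the home page) with Selenium driving Edge. For the first city of each province, i.e. the provincial capital (end_city_id = 1), it clicks the city link on the home page, switches to the newly opened city-site window, submits the query 最新落户条件及人才补贴政策 in the site's search box, switches to the search-result window, and writes that window's URL to CitySearchLink.txt before closing the two extra windows and returning to the home page.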
Source code
from selenium import webdriver
import time
import requests
from lxml import etree
import os

url = "http://www.bendibao.com/index.htm"
user_agent = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Mobile Safari/537.36"}
city_name_search_file_path = "D:\\SpringBear\\Code\\spider\\data\\CitySearchLink.txt"
nums_of_province = 29
nums_of_cities = 0
result_count = 0
start_province_id = 1
end_province_id = 25
start_city_id = 1
end_city_id = 1  # only the first city of each province, i.e. the provincial capital

# Open the home page in Edge and remember the original window handle
driver = webdriver.Edge()
driver.get(url)
driver.maximize_window()
window_1 = driver.current_window_handle

# Fetch the same page with requests to read province/city names via XPath
response = requests.get(url, headers=user_agent)
content = response.content.decode('utf8')
e_obj = etree.HTML(content)

if os.path.exists(city_name_search_file_path):
    os.remove(city_name_search_file_path)
file = open(city_name_search_file_path, 'a', encoding='utf8')

for province_id in range(start_province_id, end_province_id + 1):
    province_name = e_obj.xpath(
        "//*[@id='city-list']/div/div/div[3]/dl[" + str(province_id) + "]/dt/text()")
    cities_list = e_obj.xpath(
        "//*[@id='city-list']/div/div/div[3]/dl[" + str(province_id) + "]/dd/a")
    nums_of_cities = len(cities_list)
    for city_id in range(start_city_id, end_city_id + 1):
        city_name = e_obj.xpath(
            "//*[@id='city-list']/div/div/div[3]/dl[" + str(province_id) + "]/dd/a[" + str(city_id) + "]/text()")
        # Click the city link; the city site opens in a new window
        driver.find_element_by_xpath(
            "//*[@id='city-list']/div/div/div[3]/dl[" + str(province_id) + "]/dd/a[" + str(city_id) + "]").click()
        all_windows = driver.window_handles
        for new_window in all_windows:
            if new_window != window_1:
                driver.switch_to.window(new_window)
        window_2 = driver.current_window_handle
        time.sleep(1)
        # Type the query into the city site's search box and submit it
        driver.find_element_by_xpath(
            "//*[@id='header']/div[3]/form/div/input[2]").send_keys("最新落户条件及人才补贴政策")
        driver.find_element_by_xpath(
            "//*[@id='header']/div[3]/form/button").click()
        # The search results open in a third window; switch to it
        all_windows = driver.window_handles
        for new_window in all_windows:
            if new_window != window_1 and new_window != window_2:
                driver.switch_to.window(new_window)
        window_3 = driver.current_window_handle
        time.sleep(1)
        # Record the search-result page URL for this city
        current_window_url = driver.current_url
        result_count += 1
        city_name_search_str = str(
            result_count) + "、" + province_name[0] + ":" + city_name[0] + ": " + current_window_url
        print(city_name_search_str)
        file.write(city_name_search_str + "\n")
        file.flush()
        # Close the result and city windows, then return to the home page
        driver.close()
        driver.switch_to.window(window_2)
        driver.close()
        driver.switch_to.window(window_1)
        time.sleep(1)
file.close()
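Note that both Selenium scripts in this post use the find_element_by_xpath locator style, which works under Selenium 3 but was removed in Selenium 4.3. On a current Selenium the calls would need to be rewritten roughly as below (a minimal sketch of the equivalent Selenium 4 call, not part of the original script):

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Edge()
driver.get("http://www.bendibao.com/index.htm")
# Selenium 4 replaces driver.find_element_by_xpath(xpath) with find_element(By.XPATH, xpath)
driver.find_element(By.XPATH, "//*[@id='city-list']/div/div/div[3]/dl[1]/dd/a[1]").click()
driver.quit()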
Result
三、PolicyConditionLink (policy condition links for provincial capital cities)
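The third script follows the same click-and-search flow as CitySearchLink, this time with the query 最新人才落户补贴政策. Instead of only recording the result-page URL, it downloads the search-result page with requests, walks every a.result entry under div.result-list, keeps only the results whose title contains all of 人才, 落户 and 政策, and writes the cleaned title together with its href to PolicyConditionLink.txt.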
Source code
from selenium import webdriver
import time
import requests
from lxml import etree
import os
import re

url = "http://www.bendibao.com/index.htm"
user_agent = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Mobile Safari/537.36"}
city_name_search_file_path = "D:\\SpringBear\\Code\\spider\\data\\PolicyConditionLink.txt"
nums_of_province = 29
nums_of_cities = 0
result_count = 0
start_province_id = 1
end_province_id = 25
start_city_id = 1
end_city_id = 1  # only the provincial capital of each province

# Parse the home page with requests/lxml to read province and city names
response = requests.get(url, headers=user_agent)
content = response.content.decode('utf8')
e_obj = etree.HTML(content)

# Drive the same page in Edge for clicking and searching
driver = webdriver.Edge()
driver.get(url)
driver.maximize_window()
window_1 = driver.current_window_handle

if os.path.exists(city_name_search_file_path):
    os.remove(city_name_search_file_path)
file = open(city_name_search_file_path, 'a', encoding='utf8')

for province_id in range(start_province_id, end_province_id + 1):
    province_name = e_obj.xpath(
        "//*[@id='city-list']/div/div/div[3]/dl[" + str(province_id) + "]/dt/text()")
    cities = e_obj.xpath(
        "//*[@id='city-list']/div/div/div[3]/dl[" + str(province_id) + "]/dd/a")
    nums_of_cities = len(cities)
    for city_id in range(start_city_id, end_city_id + 1):
        city_name = e_obj.xpath(
            "//*[@id='city-list']/div/div/div[3]/dl[" + str(province_id) + "]/dd/a[" + str(city_id) + "]/text()")
        # Click the city link; the city site opens in a new window
        driver.find_element_by_xpath(
            "//*[@id='city-list']/div/div/div[3]/dl[" + str(province_id) + "]/dd/a[" + str(city_id) + "]").click()
        all_windows = driver.window_handles
        for new_window in all_windows:
            if new_window != window_1:
                driver.switch_to.window(new_window)
        window_2 = driver.current_window_handle
        time.sleep(1)
        # Search the city site for talent-settlement policy articles
        driver.find_element_by_xpath(
            "//*[@id='header']/div[3]/form/div/input[2]").send_keys("最新人才落户补贴政策")
        driver.find_element_by_xpath(
            "//*[@id='header']/div[3]/form/button").click()
        all_windows = driver.window_handles
        for new_window in all_windows:
            if new_window != window_1 and new_window != window_2:
                driver.switch_to.window(new_window)
        window_3 = driver.current_window_handle
        time.sleep(1)
        # Fetch the search-result page with requests and parse it into a
        # separate tree, so the home-page tree in e_obj stays usable
        result_response = requests.get(driver.current_url, headers=user_agent)
        result_obj = etree.HTML(result_response.content.decode('utf8'))
        divs = result_obj.xpath("//div[@class='result-list']/a[@class='result']")
        for div in divs:
            titles_list = div.xpath("./div[@class='result-title']//text()")
            title_website = "".join(titles_list)
            # Keep only results whose title mentions 人才, 落户 and 政策
            if title_website.find("人才") == -1:
                continue
            if title_website.find("落户") == -1:
                continue
            if title_website.find("政策") == -1:
                continue
            result_count += 1
            website = div.xpath("./@href")
            title_website = str(result_count) + "、" + title_website + ":" + website[0]
            title_website = re.sub(r"\s+", "", title_website).strip()
            print(title_website)
            file.write(title_website + "\n")
            file.flush()
        # Close the result and city windows, then return to the home page
        driver.close()
        driver.switch_to.window(window_2)
        driver.close()
        driver.switch_to.window(window_1)
        time.sleep(1)
file.close()
Result
四、WuhanPolicyContent (detailed content of the Wuhan policy)
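The fourth script fetches a single Wuhan policy article. It extracts the title, publish time, source, and lead paragraph from the news-article element via XPath, then collects all text nodes inside the content-box div, strips whitespace and the stray showtopcontent(); script call, and writes everything to WuhanPolicyContent.txt.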
Source code
import requests
from lxml import etree
import os
import re

# A single Wuhan policy article
url = "http://wh.bendibao.com/live/202078/113158.shtm"
user_agent = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Mobile Safari/537.36"}
file_path = "D:\\SpringBear\\Code\\spider\\data\\WuhanPolicyContent.txt"

response = requests.get(url, headers=user_agent)
content = response.content.decode('utf8')
e_obj = etree.HTML(content)

if os.path.exists(file_path):
    os.remove(file_path)
file = open(file_path, 'a', encoding='utf8')

# Article metadata: title, publish time, source and lead paragraph
title = e_obj.xpath("//article[@id='news-article']/h1/text()")
publish_time = e_obj.xpath(
    "//article[@id='news-article']//span[@class='public_time']/text()")
author = e_obj.xpath(
    "//article[@id='news-article']//span[@class='author']/text()")
lead = e_obj.xpath(
    "//article[@id='news-article']//p[@class='dao']/text()")
info_str = "标题:" + title[0] + "\n" + "时间:" + publish_time[0] + "\n" + \
    "来源:" + author[0] + "\n" + "导语:" + lead[0] + "\n\n"
print(info_str)
file.write(info_str)

# Body text: strip whitespace and skip empty fragments and the inline JS call
details_lists = e_obj.xpath(
    "//article[@id='news-article']//div[@class='content-box']//text()")
for i in range(len(details_lists)):
    details_lists[i] = re.sub(r"\s+", "", details_lists[i]).strip()
    if details_lists[i] == "showtopcontent();" or details_lists[i] == "":
        continue
    print(details_lists[i] + "\n")
    file.write(details_lists[i] + "\n\n")
file.close()
Result
五、DataCleaning (data cleaning and word cloud)
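The last script reads the text saved by the previous step, strips punctuation, segments it with jieba in precise mode, drops a small stop-word list, counts word frequencies with collections.Counter, prints the ten most common words, and renders a word cloud with the wordcloud package; simfang.ttf must be available as the font so the Chinese characters display correctly. The third-party packages used throughout (requests, lxml, selenium, jieba, wordcloud, matplotlib) are assumed to be installed, e.g. via pip.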
Source code
import re
import collections
import numpy as np
import jieba
import wordcloud
from PIL import Image
import matplotlib.pyplot as plt

# Read the Wuhan policy text collected by the previous script
fn = open('D:\\SpringBear\\Code\\spider\\data\\WuhanPolicyContent.txt',
          'r', encoding='utf8')
string_data = fn.read()
fn.close()

# Remove punctuation and whitespace characters before segmentation
pattern = re.compile(u'\t|\n|\.|-|:|;|\)|\(|\?|"')
string_data = re.sub(pattern, '', string_data)

# Segment the text with jieba (precise mode) and drop stop words
seg_list_exact = jieba.cut(string_data, cut_all=False)
object_list = []
remove_words = [u'的', u',', u'和', u'是', u'随着', u'对于', u'对', u'等', u'能', u'都', u'。', u' ', u'、', u'中', u'在', u'了',
                u'通常', u'如果', u'我们', u'需要', u'0', u'1', u'2', '3', '4', '5', '6', '7', '8', '9', '《', '》', '12']
for word in seg_list_exact:
    if word not in remove_words:
        object_list.append(word)

# Word frequencies and the ten most common words
word_counts = collections.Counter(object_list)
word_counts_top10 = word_counts.most_common(10)
print(word_counts_top10)

# Build the word cloud, display it, and save it to a PNG file
wc = wordcloud.WordCloud(
    font_path='simfang.ttf',  # a font with Chinese glyphs must be available
    max_words=55,
    max_font_size=150,
    background_color='white',
    width=800, height=600,
)
wc.generate_from_frequencies(word_counts)
plt.imshow(wc)
plt.axis('off')
plt.show()
wc.to_file('D:\\SpringBear\\Code\\spider\\data\\wordCloud.png')
Result