在之前已经讲过如何获取所爬取数据的页面,现在的问题是如何获取页面中的所需信息。这一部分的主要问题是数据定位和网页翻页。数据定位主要用 XPath 语法实现:观察规律、改变语句中的参数,就可以获取同一数据模式下的所有数据。当然这一部分还分动态数据和静态数据:动态加载的数据无法直接通过 XPath 获取,通常需要使用正则表达式;静态数据使用 XPath 则没有问题。
网页翻页主要有两种方式。一种是网页的 url 会随着翻页变化,这样就能发现 url 的规律,写个函数跟着页码变化一下就可以了。麻烦的是有些网页翻页时 url 并不会改变;关于 url 不变时如何翻页的方法,可以参考这篇文章。这种情况就需要用 post 方法获取网页数据。下面的实例就是基于 post 方法的;注意 header 中所有的值都需要用字符串格式。
import requests
import pandas as pd
from lxml import etree
def _scrape_ieee_fellows(last_page=647, output_path="name.csv"):
    """Scrape fellow names and regions from the IEEE Fellows directory.

    The directory keeps the same URL while paging, so every result page
    must be fetched with a POST request that carries the page number in
    the form data.

    last_page: highest page number to request (pages 1..last_page).
    output_path: destination CSV with columns "name" and "region".
    """
    url = "https://services27.ieee.org/fellowsdirectory/getpageresultsdesk.html"
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate, br",
        # Fixed: original paste had stray spaces in q-values ("zh;q = 0.9"),
        # which is a malformed header value.
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        # Content-Length is deliberately omitted: requests computes it from
        # the encoded form body. The original hard-coded "495" would be
        # wrong whenever the body size differs.
        "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8",
        "Host": "services27.ieee.org",
        "Origin": "https://services27.ieee.org",
        "Referer": "https://services27.ieee.org/fellowsdirectory/menuALPHABETICAL.html",
        # Fixed: removed paste-mangled spaces around "=" in the ua tokens.
        "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90"',
        # Fixed: the correct structured-header value is "?0", not "/?0".
        "sec-ch-ua-mobile": "?0",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        # Placeholder kept from the original: copy the User-Agent string
        # your own browser sends.
        "User-Agent": "自己电脑显示的",
        "X-Requested-With": "XMLHttpRequest",
    }
    name_list = []
    region_list = []
    for page in range(1, last_page + 1):
        form_data = {
            # These JSON blobs mirror what the site's own JavaScript posts;
            # only "pageNum" varies between requests.
            "selectedJSON": (
                '{"alpha": "ALL", "menu": "ALPHABETICAL", "gender": "All", '
                '"currPageNum": 1,'
                '"breadCrumbs": [{"breadCrumb": "Alphabetical+Listing+"}],'
                '"helpText": "Click+on+any+of+the+alphabet+letters+to+view+a+list+of+Fellows."}'
            ),
            # Bug fix: JSON has no "False" literal — the boolean must be
            # lowercase "false" or the server-side parser will reject it.
            "inputFilterJSON": (
                '{"sortOnList": [{"sortByField": "fellow.lastName", "sortType": "ASC"}],'
                '"requestedPageNumber": "1", "typeAhead": false}'
            ),
            "pageNum": page,
        }
        response = requests.post(url=url, data=form_data, headers=headers)
        page_content = etree.HTML(response.text)
        # XPath positions were found by inspecting the returned fragment;
        # div[2] holds the name link, div[3] the region link.
        names = page_content.xpath("/html/body/div[1]/div/div[2]/a/span/text()")
        regions = page_content.xpath("/html/body/div[1]/div/div[3]/a/text()")
        name_list.extend(names)
        region_list.extend(region.strip() for region in regions)
        # Progress indicator (one line per fetched page).
        print("count", page)
    pd.DataFrame({"name": name_list, "region": region_list}).to_csv(
        output_path, index=False
    )


def _scrape_acm_fellows(output_path="ACM_name.csv"):
    """Scrape the ACM Fellows award-winners table into a CSV.

    The page is rendered server-side, so a plain GET plus XPath is enough
    (no POST pagination needed).
    """
    url = "https://awards.acm.org/fellows/award-winners"
    # Step 2: issue the request; .get returns a Response object.
    response = requests.get(url=url)
    # Step 3: .text is the response body as a string; parse it for XPath.
    page_content = etree.HTML(response.text)
    names = page_content.xpath(
        '//*[@id="SkipTarget"]/div[2]/div[1]/div/div/div/div/div/table'
        "/tbody/tr/td[1]/a/text()"
    )
    # The site separates name parts with non-breaking spaces; normalize them.
    names = [s.replace("\xa0", " ") for s in names]
    print(names)
    # Step 4: persist the result.
    pd.DataFrame({"name": names}).to_csv(output_path, index=False)


if __name__ == "__main__":
    # Preserves the original "if False:" switch: by default only the ACM
    # scraper runs; set to True to run the IEEE Fellows scraper instead.
    SCRAPE_IEEE = False
    if SCRAPE_IEEE:
        _scrape_ieee_fellows()
    else:
        _scrape_acm_fellows()