import requests
from lxml import etree
# Module-level accumulator: parse_details appends one dict per bus route here,
# and main writes the collected entries to the output file.
items = []
# NOTE(review): `time` is not referenced anywhere in the visible part of this file.
import time
# HTTP headers passed to every requests.get call in this file; supplies a
# browser-style User-Agent string.
header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}
def main():
    """Run the crawl and write every collected route to '天津公交.txt'.

    Fetches the navigation links with ``parse_first``, walks them with
    ``parse_second`` (which, through ``parse_third`` and ``parse_details``,
    appends to the module-level ``items`` list), then writes one
    ``str(item)`` per line to the output file, encoded as UTF-8.
    """
    # Collect all navigation links from the first page.
    N_list = parse_first()
    # Crawl the second-level pages reachable from those links.
    parse_second(N_list)
    # A context manager guarantees the file is closed even if a write raises.
    with open('天津公交.txt', 'w', encoding='utf8') as fp:
        for item in items:
            fp.write(str(item) + '\n')
def parse_first():
    """Fetch the site's first page and return its navigation link hrefs.

    Returns:
        A list of href strings: the links found under the ``bus_kt_r1``
        div (links starting with a digit, per the original comments)
        followed by those under the ``bus_kt_r2`` div (links starting
        with a letter).

    Raises:
        requests.exceptions.RequestException: on a network failure, a
            timeout, or an HTTP error status.
    """
    url = 'https://tianjin.8684.cn/'
    # Without a timeout, requests can block forever on an unresponsive server.
    r = requests.get(url, headers=header, timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page into an
    # empty link list, which would silently produce an empty output file.
    r.raise_for_status()
    # Parse the page and extract all navigation links.
    tree = etree.HTML(r.text)
    # Links that start with a digit.
    number_list = tree.xpath('//div[@class="bus_kt_r1"]/a/@href')
    # Links that start with a letter.
    char_list = tree.xpath('//div[@class="bus_kt_r2"]/a/@href')
    return number_list + char_list
def parse_second(list):
    """Request every navigation page and pass its HTML to ``parse_third``.

    Args:
        list: Iterable of href strings; each one is appended to
            'https://tianjin.8684.cn' to form the request URL. The name
            shadows the builtin but is kept so existing callers keep working.

    Raises:
        requests.exceptions.RequestException: on a network failure or timeout.
    """
    # Visit each navigation link in turn; the page it points to lists the
    # individual bus-route URLs.
    for N in list:
        url = 'https://tianjin.8684.cn' + N
        # Without a timeout, one stalled connection would hang the whole crawl.
        r = requests.get(url, headers=header, timeout=10)
        # Parse the page to obtain the URL of each bus route.
        parse_third(r.text)
def parse_third(r_text):
    """Extract route links from a listing page and scrape each route.

    Args:
        r_text: HTML text of a listing page.

    Each href found under the ``stie_list`` div (class name spelled as it
    appears in the XPath) is appended to 'https://tianjin.8684.cn',
    requested, and the response text is passed to ``parse_details``.

    Raises:
        requests.exceptions.RequestException: on a network failure or timeout.
    """
    tree = etree.HTML(r_text)
    route = tree.xpath('//div[@class="stie_list"]/a/@href')
    # Walk the route links one by one.
    for R in route:
        url = 'https://tianjin.8684.cn' + R
        # Without a timeout, one stalled connection would hang the whole crawl.
        r2 = requests.get(url, headers=header, timeout=10)
        # Parse the page to obtain the detailed information for this route.
        parse_details(r2.text)
def parse_details(r_text):
    """Parse one route page and append its details to the global ``items``.

    Args:
        r_text: HTML text of a single bus-route page.

    The appended dict holds the route name, working hours, fare
    information, update time (each the first text node matched by its
    XPath, or '' when nothing matches) and the list of stop names.
    """
    tree = etree.HTML(r_text)
    if tree is None:
        # Guard in case the parser yields no root element (e.g. for an empty
        # document); there is nothing to record then.
        return

    def first_text(path):
        # The original indexed [0] directly, so a single page missing one
        # field raised IndexError and aborted the crawl before any data was
        # written. Fall back to an empty string instead.
        matches = tree.xpath(path)
        return matches[0] if matches else ''

    bus_name = first_text('//div[@class="bus_i_t1"]/h1/text()')
    work_time = first_text('//div[@class="bus_i_content"]/p[1]/text()')
    Fare_information = first_text('//div[@class="bus_i_content"]/p[2]/text()')
    update_time = first_text('//div[@class="bus_i_content"]/p[4]/text()')
    # The trailing space inside the class value is part of the original selector.
    bus_stop = tree.xpath('//div[@class="bus_line_site "][1]/div[@class="bus_site_layer"]/div/a/text()')
    # Progress output: number of stops found for this route.
    print(len(bus_stop))
    item = {
        '公交车名': bus_name,
        '工作时间': work_time,
        '售票信息': Fare_information,
        '更新时间': update_time,
        '公交站点': bus_stop,
    }
    items.append(item)
# Run the crawl only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
# Python crawler - scraping bus routes
# Reposted from blog.csdn.net/fangweijiex/article/details/103788270