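# Scrape information about all Harbin bus lines using XPath,
# and store each line's stops in MongoDB, using the bus line name as the collection name.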
from lxml import etree
import requests
import os
import pymongo as py
# HTTP request headers shared by every request in this script; the User-Agent
# is a desktop Chrome (Windows 10) browser string.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
def get_stop_name(name, url, headers, path):
    """Scrape the stop names of one bus line and store them in MongoDB.

    The page at ``url`` is downloaded and the stop names are extracted with a
    fixed XPath. Each stop becomes one document ``{'序号': index, '站名': stop}``
    (zero-based index, stop name) in the ``BUS`` database on
    ``localhost:27017``, inside a collection named ``str(name)``.

    Args:
        name: Bus line name; used as the MongoDB collection name.
        url: URL of the bus line's page.
        headers: HTTP request headers to send (e.g. a browser User-Agent).
        path: Unused. Kept so existing callers keep working; it was the
            output file of an earlier text-file export.
    """
    # ``headers`` must be passed by keyword: the second positional parameter
    # of ``requests.get`` is ``params``, so a positional dict would be sent
    # as query-string arguments instead of as HTTP headers.
    wb_data = requests.get(url, headers=headers).text
    html = etree.HTML(wb_data)
    stops = html.xpath('/html/body/div[5]/div[2]/div[3]/ul[1]/li/a/text()')

    documents = [
        {'序号': index, '站名': stop}
        for index, stop in enumerate(stops)
    ]
    if not documents:
        # insert_many() rejects an empty list; nothing to store for this line.
        return

    myclient = py.MongoClient('localhost', 27017)
    try:
        # A single bulk insert replaces the per-stop Collection.insert()
        # calls; insert() is deprecated and no longer exists in PyMongo 4.
        myclient.BUS[str(name)].insert_many(documents)
    finally:
        # Release the connection even if the insert fails.
        myclient.close()
def get_bus_num(url, headers):
    """Scrape the bus line index page and store the stops of every line.

    The page at ``url`` is downloaded, the name and link of each listed bus
    line are extracted with fixed XPaths, and ``get_stop_name`` is called
    once per line.

    Args:
        url: URL of the page listing all bus lines.
        headers: HTTP request headers used for this request and passed on to
            ``get_stop_name``.
    """
    # Pass ``headers`` by keyword; positionally it would be taken as
    # ``params`` and sent as query-string arguments.
    bus_data = requests.get(url, headers=headers).text
    html = etree.HTML(bus_data)
    names = html.xpath('/html/body/div[5]/div[2]/div[1]/div[2]/ul/li/a/text()')
    urls = html.xpath('/html/body/div[5]/div[2]/div[1]/div[2]/ul/li/a/@href')

    # ``get_stop_name`` still takes a ``path`` argument; without this
    # definition the call below raised NameError.
    path = 'stop.txt'
    for line_name, line_url in zip(names, urls):
        get_stop_name(line_name, line_url, headers, path)
if __name__ == '__main__':
    # Script entry point: start the crawl from the page that lists all
    # bus lines, using the module-level request headers.
    get_bus_num('http://haerbin.gongjiao.com/lines_all.html', headers)