# -*- coding: utf-8 -*-
"""
@author: Dell Created on Thu Jan 2 11:16:08 2020
"""
import gevent
from gevent import monkey
monkey.patch_all()
from lxml import etree
from selenium import webdriver
from selenium.webdriver import PhantomJS
from selenium.webdriver.chrome.options import Options
def download(url, start_idx, end_idx, file):
    """Scrape result pages ``start_idx`` .. ``end_idx - 1`` and append rows to ``file``.

    A dedicated headless Chrome instance is started, ``url`` is loaded, and
    for every page index the page's own ``goPage(i)`` JavaScript function is
    invoked. After a fixed wait the rendered HTML is parsed and every row of
    the ``report`` table except the first is written to ``file`` as the
    ``str()`` of a 9-tuple followed by ``\\r\\n``, encoded as UTF-8.

    Args:
        url: Address of the search page to open.
        start_idx: First page index to fetch (inclusive).
        end_idx: Page index to stop at (exclusive).
        file: Writable binary file object; it is shared with the other
            greenlets and is not closed here.

    Errors raised while scraping are reported on stdout and end this
    worker's range early (best effort); the browser is always shut down.
    """
    # PhantomJS needs a phantomjs executable and is deprecated in recent
    # selenium releases, so headless Chrome is used instead.
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        for i in range(start_idx, end_idx):
            driver.execute_script("javascript:goPage('" + str(i) + "')")
            # Fixed wait for the page to finish loading; gevent.sleep yields
            # to the other greenlets in the meantime.
            gevent.sleep(5)
            print("开始解析第", i, "页")
            # driver.page_source is a str.
            html = etree.HTML(driver.page_source)
            trs = html.xpath("//table[@id='report']//tbody/tr[position()>1]")
            for tr in trs:
                line = _parse_row(tr)
                print(*line)
                file.write((str(line) + "\r\n").encode("utf-8", errors="ignore"))
            print("共有数据:", len(trs), "条")
    except Exception as exc:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / GreenletExit still propagate, and report the
        # cause instead of an anonymous "error".
        print("error", repr(exc))
    finally:
        driver.quit()  # always release the browser process


def _cell_text(row, path):
    """Return the stripped first text node matched by ``path``, or "" if none."""
    nodes = row.xpath(path)
    return nodes[0].strip() if nodes else ""


def _parse_row(row):
    """Extract the nine announcement fields of one table row as a tuple of str."""
    field_paths = (
        "./td[1]/font/text()",  # court
        "./td[2]/font/text()",  # courtroom
        "./td[3]/text()",       # hearing date
        "./td[4]/text()",       # case number
        "./td[5]/text()",       # cause of action
        "./td[6]/div/text()",   # handling department
        "./td[7]/div/text()",   # presiding judge
        "./td[8]/text()",       # plaintiff
        "./td[9]/text()",       # defendant
    )
    # A missing or empty cell yields "" instead of raising IndexError, so one
    # incomplete row no longer aborts the whole page range.
    return tuple(_cell_text(row, path) for path in field_paths)
def main():
    """Scrape pages 1-599 of the court search site into ``court.txt``.

    The page range is cut into consecutive 100-page slices and one greenlet
    (each running ``download`` with its own browser) is spawned per slice.
    All greenlets write to the same binary output file, which is closed once
    every greenlet has finished, or if spawning/joining raises.
    """
    url = "http://www.hshfy.sh.cn/shfy/gweb2017/ktgg_search.jsp"
    # Slice boundaries; each (start, end) pair is half-open like range().
    # Deriving the pairs from one list keeps the slices contiguous -- the
    # previous hand-written list skipped pages 300-399.
    bounds = [1, 100, 200, 300, 400, 500, 600]
    with open("court.txt", "wb") as file:
        gevent.joinall([
            gevent.spawn(download, url, start, end, file)
            for start, end in zip(bounds, bounds[1:])
        ])
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
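# 利用协程框架,无界面浏览器爬取上海高院开庭数据
# (Scrape Shanghai High People's Court hearing announcements using a
# coroutine framework and a headless browser.)
# 转载自 (reposted from): www.cnblogs.com/zxfei/p/12132362.html
# 猜你喜欢 / 今日推荐 / 周排行 -- navigation labels left over from the source web page.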