import scrapy
from scrapy import Request
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
# Scrapy signal machinery
from scrapy.utils.project import get_project_settings
from scrapy import signals
from pydispatch import dispatcher
# Project settings and items
from ..custom_settings import *
from ..items import ShanbaySpiderItem

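# The settings consumed below are assumptions about what ../custom_settings.py
# and settings.py provide; neither file is shown here. A minimal sketch:
#
#   # custom_settings.py
#   custom_settings_for_spider1 = {
#       'DOWNLOADER_MIDDLEWARES': {
#           # hypothetical dotted path; point it at wherever SeleniumMiddleware lives
#           'shanbay_spider.middlewares.SeleniumMiddleware': 543,
#       },
#   }
#
#   # settings.py
#   SELENIUM_TIMEOUT = 30   # page-load timeout, in seconds
#   LOAD_IMAGE = True       # whether Chrome should load images
#   WINDOW_HEIGHT = 900
#   WINDOW_WIDTH = 900
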
class ShanbaySpider(scrapy.Spider):
    name = 'shanbay'
    allowed_domains = ['shanbay.com']
    # start_urls = ['http://shanbay.com/']
    custom_settings = custom_settings_for_spider1
    # Initialize Chrome inside the spider, so the browser becomes part of the spider
    def __init__(self, timeout=30, isLoadImage=True, windowHeight=None, windowWidth=None):
        # Read configuration from settings.py, falling back to the constructor arguments
        print("Starting browser")
        self.mySetting = get_project_settings()
        self.timeout = self.mySetting.get('SELENIUM_TIMEOUT', timeout)
        self.isLoadImage = self.mySetting.get('LOAD_IMAGE', isLoadImage)
        self.windowHeight = self.mySetting.get('WINDOW_HEIGHT', windowHeight)
        self.windowWidth = self.mySetting.get('WINDOW_WIDTH', windowWidth)
        # Initialize the Chrome driver; optionally disable image loading to speed up crawling
        options = webdriver.ChromeOptions()
        if not self.isLoadImage:
            options.add_experimental_option(
                'prefs', {'profile.managed_default_content_settings.images': 2})
        self.browser = webdriver.Chrome(options=options)
        if self.windowHeight and self.windowWidth:
            self.browser.set_window_size(self.windowWidth, self.windowHeight)
        self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, self.timeout)
        # Initialize the parent class, so different spider files can run in different ways
        super(ShanbaySpider, self).__init__()
        # Connect a signal handler: when spider_closed fires, call CloseHandle to quit Chrome
        dispatcher.connect(receiver=self.CloseHandle,
                           signal=signals.spider_closed)

    # Signal handler: quit the Chrome browser when the spider closes
    def CloseHandle(self, spider):
        print("CloseHandle: quitting browser")
        self.browser.quit()

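    # Note: dispatcher.connect works, but current Scrapy wires signals through
    # from_crawler instead. A sketch of the equivalent (use one approach, not both):
    #
    #   @classmethod
    #   def from_crawler(cls, crawler, *args, **kwargs):
    #       spider = super(ShanbaySpider, cls).from_crawler(crawler, *args, **kwargs)
    #       crawler.signals.connect(spider.CloseHandle, signal=signals.spider_closed)
    #       return spider
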
    # ------------ spider execution starts here --------------
    # --- network requests ---
    def start_requests(self):
        # Word-list section ids run 540709, 540712, ... (29 sections, step 3),
        # and each section is paginated with ?page=1 .. ?page=10
        for i in range(29):
            page = 540709 + i * 3
            url_base = 'https://www.shanbay.com/wordlist/187711/' + str(page) + '/?page={}'
            for x in range(10):
                url = url_base.format(x + 1)
                yield Request(
                    url,
                    # usedSelenium is consumed by the Selenium downloader
                    # middleware (see the sketch at the end of this file)
                    meta={'usedSelenium': True, 'dont_redirect': True},
                    callback=self.parse,
                    errback=self.error,
                )

    def error(self, failure):
        # Errback: Scrapy passes a twisted Failure object here, not a Response
        self.logger.error(repr(failure))

    def parse(self, response):
        # Grab every text node inside the word-list table rows
        html_contents = response.xpath(
            '/html/body/div[3]/div/div[1]/div[2]/div/table/tbody/tr//*/text()')

        for result in html_contents:
            # Build a fresh item per text node; reusing one item instance
            # across yields can corrupt data further down the pipeline
            item = ShanbaySpiderItem()
            item['Chinese'] = result.extract()
            print(item['Chinese'])
            yield item
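

# ../items.py is not shown in this file; given the single field used in parse(),
# it presumably looks like this minimal sketch:
#
#   import scrapy
#
#   class ShanbaySpiderItem(scrapy.Item):
#       Chinese = scrapy.Field()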
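

# The usedSelenium meta flag only does something if a downloader middleware
# renders the flagged requests with the spider's browser. That middleware is not
# part of this file; what follows is one plausible sketch, not the project's
# actual implementation. It assumes registration under DOWNLOADER_MIDDLEWARES
# (see the settings sketch near the top) and would normally live in middlewares.py.
from scrapy.http import HtmlResponse

class SeleniumMiddleware(object):
    def process_request(self, request, spider):
        # Only intercept requests the spider explicitly marked for Selenium
        if not request.meta.get('usedSelenium'):
            return None
        try:
            spider.browser.get(request.url)
            # spider.wait (the WebDriverWait created in __init__) could be used
            # here to block until the word-list table has actually rendered
        except Exception as e:
            # On failure, fall back to Scrapy's normal downloader
            spider.logger.error('Selenium failed to load %s: %s', request.url, e)
            return None
        # Returning a response short-circuits the download: Scrapy hands the
        # rendered DOM straight to the spider's callback
        return HtmlResponse(url=spider.browser.current_url,
                            body=spider.browser.page_source,
                            encoding='utf-8',
                            request=request)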