一: 所要用到的包和常量
# Standard library
import threading
import time
import urllib.request
from queue import Queue

# Third-party
from lxml import etree

# Number of downloader threads to start.
DOWNLOADER_NUM = 10

# Work queue shared by all downloader threads; holds detail-page URLs.
queue = Queue()

# Thread objects that have been started, kept so they can be joined later.
threads = []

# First-level (search result) page that is fetched to discover detail links.
url = "http://sz.ganji.com/site/s/_python%20%E7%88%AC%E8%99%AB/"

# Request headers carrying a desktop-browser User-Agent string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36"}
二: 一级页面的处理
# Download the first-level page, extract every job link from it and put the
# resulting absolute URLs on the shared work queue.
req = urllib.request.Request(url=url, headers=headers)
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(req) as res:
    tree = etree.HTML(res.read())
url_list = tree.xpath('//div[@class="job-wanted"]/dl/dt/a/@href')
for href in url_list:
    # Use a separate loop name so the module-level seed `url` is not overwritten.
    queue.put("http://sz.ganji.com/" + href)
三: 二级页面的处理
def gan_spiders(url1):
    """Download one second-level page and print the salary text found on it.

    Args:
        url1: URL of the page to download.

    Returns:
        None. The list produced by the salary XPath query is printed.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36"}
    req = urllib.request.Request(url=url1, headers=headers)
    # Close the response as soon as the body is read; this function is called
    # once per queued URL, so an unclosed response would leak one socket each.
    with urllib.request.urlopen(req) as res:
        body = res.read()
    tree = etree.HTML(body)
    daiyu = tree.xpath('//div[@class="salary-line"]/b/text()')
    print(daiyu)
四: 函数的调用
def main():
    """Worker loop: take URLs off the shared queue and scrape each one.

    The loop ends when a ``None`` sentinel is received. Every item taken from
    the queue, including the sentinel, is acknowledged with ``task_done()``;
    without that acknowledgement a ``queue.join()`` call never returns.
    """
    while True:
        url1 = queue.get()
        try:
            if url1 is None:
                break
            gan_spiders(url1)
        except Exception as exc:
            # Worker boundary: report the failure and keep consuming, so one
            # bad page does not kill the thread and strand the queued items.
            print(f"failed to process {url1}: {exc!r}")
        finally:
            # Runs on success, on failure and on the sentinel `break`.
            queue.task_done()
五: 线程的启动和关闭
if __name__ == '__main__':
    start_time = time.time()

    # Start the downloader threads and remember them for the final join.
    for _ in range(DOWNLOADER_NUM):
        worker = threading.Thread(target=main)
        worker.start()
        threads.append(worker)

    # Block until every queued item has been acknowledged with task_done().
    queue.join()

    # Send one None sentinel per thread so each worker loop can exit.
    for _ in range(DOWNLOADER_NUM):
        queue.put(None)

    # Wait for all worker threads to finish.
    for worker in threads:
        worker.join()

    # Wall-clock duration of the whole run, in seconds.
    cost_seconds = time.time() - start_time
六: 整体代码示例
# Complete example: fetch a first-level listing page, queue every detail link
# found on it, then download the detail pages with a pool of worker threads.
import threading
import time
import urllib.request
from queue import Queue

from lxml import etree

# Work queue shared by all downloader threads; holds detail-page URLs.
queue = Queue()
# Number of downloader threads to start.
DOWNLOADER_NUM = 10
# Started thread objects, kept so they can be joined at shutdown.
threads = []
# First-level page that is fetched to discover detail links.
url = "http://sz.ganji.com/site/s/_python%20%E7%88%AC%E8%99%AB/"
# Request headers carrying a desktop-browser User-Agent string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36"}

# First-level page: collect the detail links and seed the queue.
req = urllib.request.Request(url=url, headers=headers)
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(req) as res:
    tree = etree.HTML(res.read())
url_list = tree.xpath('//div[@class="job-wanted"]/dl/dt/a/@href')
for href in url_list:
    # Use a separate loop name so the module-level seed `url` is not overwritten.
    queue.put("http://sz.ganji.com/" + href)


def gan_spiders(url1):
    """Download one second-level page and print the salary text found on it.

    Args:
        url1: URL of the page to download.

    Returns:
        None. The list produced by the salary XPath query is printed.
    """
    # Reuses the module-level `headers`; the value is the same User-Agent.
    req = urllib.request.Request(url=url1, headers=headers)
    # Close the response right after reading so workers do not leak sockets.
    with urllib.request.urlopen(req) as res:
        body = res.read()
    tree = etree.HTML(body)
    daiyu = tree.xpath('//div[@class="salary-line"]/b/text()')
    print(daiyu)


def main():
    """Worker loop: take URLs off the shared queue and scrape each one.

    The loop ends when a ``None`` sentinel is received. Every item taken from
    the queue, including the sentinel, is acknowledged with ``task_done()``;
    without that acknowledgement ``queue.join()`` below never returns.
    """
    while True:
        url1 = queue.get()
        try:
            if url1 is None:
                break
            gan_spiders(url1)
        except Exception as exc:
            # Worker boundary: report the failure and keep consuming, so one
            # bad page does not kill the thread and strand the queued items.
            print(f"failed to process {url1}: {exc!r}")
        finally:
            # Runs on success, on failure and on the sentinel `break`.
            queue.task_done()


if __name__ == '__main__':
    start_time = time.time()

    # Start the downloader threads and remember them for the final join.
    for _ in range(DOWNLOADER_NUM):
        t = threading.Thread(target=main)
        t.start()
        threads.append(t)

    # Block until every queued URL has been acknowledged by a worker.
    queue.join()

    # Send one None sentinel per thread so each worker loop can exit.
    for _ in range(DOWNLOADER_NUM):
        queue.put(None)
    for t in threads:
        t.join()

    # Wall-clock duration of the whole run, in seconds.
    cost_seconds = time.time() - start_time