# Alibaba job-listing scraper.
#
# Architecture: two queues and two thread pools.
#   - spider threads pull URLs from spider_queue, fetch the page with
#     requests, and push the raw HTML onto parse_queue;
#   - parse threads pull HTML from parse_queue, extract job name/detail-url
#     pairs with lxml XPath, and append them as JSON lines to a local file.
# main() joins both queues, then flips parse_exit_flag so parsers exit,
# and the output file is closed by its `with` block.
#
# Known site difficulties (from the original author):
#   1. the real "next page" requires a click (JS pagination), so only the
#      first page is reachable from the raw HTML;
#   2. JS-rendered content is absent from the fetched source, so some
#      selectors that work in the browser do not match here.
import json
import queue
import random
import threading
import time

import requests
from lxml import etree

# Work queues: URLs waiting to be fetched, and HTML waiting to be parsed.
spider_queue = queue.Queue()
parse_queue = queue.Queue()
# Set to True once both queues are fully processed; parse threads poll it.
parse_exit_flag = False
# Serializes writes to the shared output file across parse threads.
lock = threading.Lock()

# Pool of User-Agent strings; one is chosen at random per request to look
# less like a bot.
agent_list = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
    # NOTE(review): this entry was split across a line break in the original
    # paste (invalid Python); rejoined into a single literal here.
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
]


class SpiderThread(threading.Thread):
    """Worker that fetches URLs from a queue and forwards the raw HTML.

    Exits when the URL queue is empty.
    """

    def __init__(self, s_id, s_queue, p_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.sid = s_id              # numeric id, used only in log output
        self.spider_queue = s_queue  # queue of URLs to fetch
        self.parse_queue = p_queue   # queue receiving fetched HTML

    def run(self):
        """Drain the URL queue, fetching each page with a random UA."""
        while True:
            time.sleep(3)  # crude rate limiting between requests
            if self.spider_queue.empty():
                break
            try:
                url = self.spider_queue.get(block=False)
            except queue.Empty:
                # Another spider thread grabbed the last URL between our
                # empty() check and the get() — nothing left to do.
                break
            headers = {'User-Agent': random.choice(agent_list)}
            try:
                response = requests.get(url=url, headers=headers)
                # Hand the raw HTML to the parse pool.
                self.parse_queue.put(response.text)
                print('%d线程爬取数据%s' % (self.sid, url))
            except requests.RequestException as e:
                # BUG FIX: requests raises RequestException (and subclasses),
                # never urllib.error.URLError as the original caught — the
                # old handler was dead code and network errors killed the
                # thread without calling task_done().
                print(e, '网络请求错误')
            finally:
                self.spider_queue.task_done()


class ParseThread(threading.Thread):
    """Worker that parses fetched HTML and appends job items to a file."""

    def __init__(self, p_id, fp, p_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.pid = p_id              # numeric id, used only in log output
        self.fp = fp                 # shared output file; writes guarded by `lock`
        self.parse_queue = p_queue   # queue of raw HTML to parse

    def run(self):
        """Consume HTML from the queue until parse_exit_flag is set."""
        while True:
            if parse_exit_flag:
                break
            try:
                data = self.parse_queue.get(block=False)
            except queue.Empty:
                # BUG FIX: the original bare `except Exception: pass`
                # hot-spun on an empty queue; back off briefly instead.
                time.sleep(0.1)
                continue
            try:
                self.parse(data)
                print('%d线程解析数据成功' % self.pid)
            except Exception as e:
                # Keep the worker alive on a malformed page, but don't
                # swallow the error silently like the original did.
                print(e)
            finally:
                # BUG FIX: always acknowledge the task. In the original a
                # parse error skipped task_done(), so parse_queue.join()
                # in main() would hang forever.
                self.parse_queue.task_done()

    def parse(self, data):
        """Extract (job_name, detail_url) pairs and append them as a JSON line.

        NOTE: the fetched source is the pre-JS HTML, so these XPath
        selectors may not match what the browser devtools show.
        """
        my_tree = etree.HTML(data)
        job_name_list = my_tree.xpath('//tbody[@id="J-list-box"]/tr/td/span/a/text()')
        detail_url_list = my_tree.xpath('//tbody[@id="J-list-box"]/tr/td/span/a/@href')
        items = [
            {'job_name': name, 'detail_url': url}
            for name, url in zip(job_name_list, detail_url_list)
        ]
        print(items)
        # The file handle is shared by all parse threads; serialize writes.
        with lock:
            self.fp.write(json.dumps(items, ensure_ascii=False) + '\n')


def main():
    """Enqueue page URLs, run spider and parse pools, and save results."""
    # BUG FIX: the original base URL already ended in "#page/1" and then had
    # the page number appended, producing "#page/11"; end it at "#page/" so
    # "base_url + str(page)" yields the intended fragment.
    base_url = ('https://job.alibaba.com/zhaopin/positionList.htm'
                '?keyWord=cHl0aG9u&_input_charset=UTF-8#page/')
    for page in range(1, 2):
        spider_queue.put(base_url + str(page))

    time.sleep(5)
    for i in range(4):
        SpiderThread(i, spider_queue, parse_queue).start()

    # `with` guarantees the file is closed even if a join raises
    # (the original opened/closed it manually).
    with open('./data.txt', 'w', encoding='utf-8') as fp:
        for i in range(4):
            ParseThread(i, fp, parse_queue).start()

        # Block until every URL has been fetched and every page parsed.
        spider_queue.join()
        parse_queue.join()

        # All work acknowledged; tell the parse threads to exit.
        global parse_exit_flag
        parse_exit_flag = True


if __name__ == '__main__':
    main()
# Scrapes Alibaba recruitment (job posting) listings.
# Adapted from: blog.csdn.net/jiangwei1102/article/details/80790141