初学 Python 爬虫时,通常会从最简单的方法入手,以下是几种常见的基础做法。
"""Basic ways to download several pages with ``requests``.

Five small examples, run one after another when the file is executed as a
script:

1. a plain sequential loop,
2. a thread pool,
3. a thread pool with a completion callback,
4. a process pool,
5. a process pool with a completion callback.

Every example that performs work sits behind an ``if __name__ ==
"__main__"`` guard.  Process-pool workers started with the ``spawn`` method
import this module again, and without the guard they would re-run the
earlier examples.
"""
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

import requests

# Seconds to wait for a server before giving up.  ``requests`` applies no
# timeout by default, so a stalled connection would otherwise block forever.
REQUEST_TIMEOUT = 10


def fetch_request(url):
    """Download *url* and print the response body.

    Args:
        url: Address of the page to fetch.

    Raises:
        requests.RequestException: If the request fails or times out.  When
            this function runs inside an executor, the exception is stored
            on the future returned by ``submit`` instead of propagating.
    """
    result = requests.get(url, timeout=REQUEST_TIMEOUT)
    print(result.text)


# Backward-compatible alias: the process-pool example originally defined an
# identical function under this misspelled name.
fetch_requst = fetch_request


def fetch_async(url):
    """Download *url* and return the ``requests`` response.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response object produced by ``requests.get``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    return requests.get(url, timeout=REQUEST_TIMEOUT)


def callback(future):
    """Print the body of the response carried by a finished *future*.

    Intended for ``Future.add_done_callback``.  A cancelled future is
    ignored and a failed one is reported, because an exception raised
    inside a done-callback is only logged by the executor machinery.

    Args:
        future: A completed future whose result is a ``requests`` response.
    """
    if future.cancelled():
        return
    error = future.exception()
    if error is not None:
        print("request failed:", error)
        return
    print(future.result().text)


# --- Simple sequential loop ---
url_list = [
    "https://www.baidu.com",
    "https://www.cnblogs.com/",
]

if __name__ == "__main__":
    for url in url_list:
        result = requests.get(url, timeout=REQUEST_TIMEOUT)
        print(result.text)


# URLs shared by all of the pool-based examples below.
url_list = [
    "https://www.baidu.com/",
    "https://www.cnblogs.com/",
]

# --- Thread pool ---
if __name__ == "__main__":
    # Leaving the with-block shuts the pool down after every submitted task
    # has finished, even if submitting one of them raises.
    with ThreadPoolExecutor(10) as pool:
        for url in url_list:
            # Take a thread from the pool and run fetch_request on it.
            pool.submit(fetch_request, url)

# --- Thread pool + callback ---
if __name__ == "__main__":
    with ThreadPoolExecutor(10) as pool:
        for url in url_list:
            v = pool.submit(fetch_async, url)
            # Register the function to call once the download completes.
            v.add_done_callback(callback)

# --- Process pool ---
if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers=10) as pool:
        for url in url_list:
            pool.submit(fetch_request, url)

# --- Process pool + callback ---
if __name__ == "__main__":
    with ProcessPoolExecutor(10) as pool:
        for url in url_list:
            v = pool.submit(fetch_async, url)
            # The callback is invoked in this (parent) process; the worker's
            # return value has to be pickled to get back here.
            v.add_done_callback(callback)