[python] 爬虫笔记（九）异步爬虫

目的

在爬虫中使用异步实现高性能的数据爬取

异步爬虫的方式

多线程，多进程（不建议）
- 好处：可以为相关阻塞的操作单独开启线程或者进程，阻塞操作就可以异步执行。
- 弊端：无法无限制开启多线程或者多进程。
线程池、进程池（适当使用）
- 好处：降低系统创建和销毁进程或线程的频率，从而有效降低系统的开销。
- 弊端：池中线程或进程的数量是有上限的。

基本使用：

import time
from multiprocessing.dummy import Pool

def get(str):
    """Simulate a blocking download of one item.

    Prints a start message, blocks the calling thread for two seconds, then
    prints a completion message. Returns nothing.

    Args:
        str: Label of the item being "downloaded". Note that this parameter
            name shadows the ``str`` builtin inside the function.
    """
    started, finished = "正在下载:", "下载成功:"
    print(started, str)
    # Stand-in for a slow, blocking operation such as a network transfer.
    time.sleep(2)
    print(finished, str)

name_list = ['sh', 'ss', 'hh', 'sg']
start_time = time.time()
# Create a pool of 4 worker threads. Using it as a context manager releases
# the workers once the batch is finished instead of leaving them alive.
with Pool(4) as pool:
    # Pass every element of the list to get(); map() blocks until all calls
    # have returned.
    pool.map(get, name_list)
end_time = time.time()

# Elapsed wall-clock time in seconds (end minus start, so no abs() needed).
print(end_time - start_time)

然而，在实际爬虫中并不能把所有的操作都交给线程池处理。
线程池应当只用来处理阻塞且耗时的操作。

import requests
from lxml import etree
from multiprocessing.dummy import Pool
import time
def get(url):
    """Pretend to crawl ``url``: log the start, block for one second, log the end.

    No request is actually sent; the sleep stands in for the blocking work.

    Args:
        url: Address that is reported in the two progress messages.
    """
    begin_msg = "开始爬取:"
    done_msg = "爬取成功:"
    print(begin_msg, url)
    # Placeholder for the real blocking request / download work.
    time.sleep(1)
    print(done_msg, url)

if __name__ == "__main__":
    # Browser-like User-Agent sent with the listing-page request.
    ua = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4209.400"

    headers = {
        "User-Agent": ua,
    }

    url = "https://www.pearvideo.com/category_5"

    # Fetch the category page and parse it so the entries can be located.
    response = requests.get(url=url, headers=headers).text
    tree = etree.HTML(response)
    li_list = tree.xpath('//*[@id="categoryList"]/li')
    url_2 = []
    for li in li_list:
        # The XPath must be relative to the current <li>. An absolute
        # '//*[@id="categoryList"]/li[1]/...' expression is evaluated against
        # the whole document and yields the first entry's link every time.
        hrefs = li.xpath('./div/a/@href')
        if hrefs:  # skip entries that carry no link instead of raising IndexError
            url_2.append("https://www.pearvideo.com/" + hrefs[0])
    # Run the blocking get() calls on worker threads; the context manager
    # releases the threads once map() has returned.
    with Pool(9) as pool:
        pool.map(get, url_2)

单线程+异步协程（推荐）

async
用 async 修饰一个函数后，调用该函数不会立即执行函数体，而是返回一个协程对象

基本使用：

import asyncio

async def request(url):
    """Coroutine that prints a marker and hands back its argument.

    Calling ``request(...)`` only creates a coroutine object; the body runs
    once that object is driven by an event loop.

    Args:
        url: Any value; it is returned unchanged.

    Returns:
        The ``url`` argument.
    """
    print("sh")
    result = url
    return result

# Calling a function declared with `async` does not run its body; it returns
# a coroutine object.
c = request(url='www.baidu.com')

# Alternative 1: create an event loop object
# loop = asyncio.get_event_loop()
#
# # register the coroutine object with the loop, then start the loop
# loop.run_until_complete(c)

# Alternative 2: using a task
# create a task object from the loop
# loop = asyncio.get_event_loop()
# task = loop.create_task(c)
# print(task)
#
# loop.run_until_complete(task)
# print(task)

# Alternative 3: using a future
# loop = asyncio.get_event_loop()
# task = asyncio.ensure_future(c)
# print(task)
# loop.run_until_complete(task)
# print(task)

#绑定回调
def callback_func(task):
    """Done-callback that prints the result carried by a finished task.

    Args:
        task: The completed task/future object passed in by
            ``add_done_callback``; only its ``result()`` method is used.
    """
    # result() yields the return value of the coroutine function that the
    # task wrapped.
    outcome = task.result()
    print(outcome)

# Create the event loop explicitly: asyncio.get_event_loop() and
# asyncio.ensure_future() rely on an implicit current loop, which is
# deprecated when no loop is running (and an error on the newest Python
# releases).
loop = asyncio.new_event_loop()
try:
    # Wrap the coroutine object in a Task bound to this loop.
    task = loop.create_task(c)
    # Register the callback; it is invoked with the task once it has finished.
    task.add_done_callback(callback_func)
    loop.run_until_complete(task)
finally:
    # Always release the loop's resources, even if the task raised.
    loop.close()

多任务异步协程

import asyncio
import time

#多任务异步协程

async def request(url):
    """Simulate downloading ``url`` without blocking the event loop.

    Declared with ``async`` so that ``await`` is legal inside the body; a
    plain ``def`` containing ``await`` is a SyntaxError.

    Args:
        url: Address that is reported in the two progress messages.
    """
    print("正在下载:", url)
    # Synchronous calls such as time.sleep(2) would block the whole event
    # loop and defeat the concurrency, so suspend with asyncio.sleep instead.
    await asyncio.sleep(2)
    print("下载完毕", url)

urls = ['www.baidu.com', 'www.4399.com', 'www.12345.com']

# Create the event loop explicitly instead of relying on the implicit current
# loop used by asyncio.get_event_loop() / asyncio.ensure_future(), which is
# deprecated when no loop is running.
loop = asyncio.new_event_loop()
try:
    # The task list holds one Task per URL so that all of them are scheduled
    # on the loop before it starts running.
    stasks = [loop.create_task(request(url)) for url in urls]
    # The task list has to be wrapped in asyncio.wait() so the loop runs
    # until every task has finished.
    loop.run_until_complete(asyncio.wait(stasks))
finally:
    # Always release the loop's resources, even if a task raised.
    loop.close()

需要注意，requests 模块发送的请求是同步（阻塞）的，在协程中使用会使异步失效，因此需要改用基于异步的网络请求模块（如 aiohttp）向指定 url 发送请求

import requests
from lxml import etree
import time
import asyncio
#使用该模块中的ClientSession对象进行网络请求发送
import aiohttp

async def get(url, headers=None):
    """Fetch ``url`` asynchronously with aiohttp and print the response body.

    Args:
        url: Address to request with HTTP GET.
        headers: Optional mapping of request headers (for example a
            User-Agent). Defaults to ``None``, which sends no extra headers.
    """
    async with aiohttp.ClientSession() as session:
        # session.get() / session.post() return an async context manager, so
        # it is entered directly with `async with`; no extra `await` is needed.
        async with session.get(url, headers=headers) as response:
            # Reading the body is itself a coroutine and must be awaited
            # before the data can be used.
            page_text = await response.text()
            print(page_text)

if __name__ == "__main__":
    # Browser-like User-Agent sent with the listing-page request.
    ua = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4209.400"

    headers = {
        "User-Agent": ua,
    }

    url = "https://www.pearvideo.com/category_5"

    # The listing page is one up-front request, so the synchronous requests
    # client is used here; the per-entry fetches below go through get().
    response = requests.get(url=url, headers=headers).text

    tree = etree.HTML(response)
    li_list = tree.xpath('//*[@id="categoryList"]/li')

    # Collect each entry's own link. Scheduling get(url) once per <li> would
    # just download the same category page over and over.
    detail_urls = []
    for li in li_list:
        hrefs = li.xpath('./div/a/@href')  # relative to the current <li>
        if hrefs:  # skip entries that carry no link
            detail_urls.append("https://www.pearvideo.com/" + hrefs[0])

    # Create the event loop explicitly instead of relying on the implicit
    # current loop, which is deprecated when no loop is running.
    loop = asyncio.new_event_loop()
    try:
        tasks = [loop.create_task(get(detail_url)) for detail_url in detail_urls]
        # asyncio.wait() raises ValueError on an empty collection, so only
        # run the loop when there is something to fetch.
        if tasks:
            loop.run_until_complete(asyncio.wait(tasks))
    finally:
        # Always release the loop's resources, even if a task raised.
        loop.close()

[ python] 爬虫笔记（九) 异步爬虫

猜你喜欢