版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/qq_39591494/article/details/89324739
AIOHTTP
简单获取:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import asyncio

import aiohttp


async def main():
    """Fetch http://www.httpbin.org/get and print the status, JSON body and raw text."""
    async with aiohttp.ClientSession() as session:
        async with session.get('http://www.httpbin.org/get') as response:
            print(response.status)
            print(await response.json())
            print(await response.text())


if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    # print(loop.run_until_complete(main()))
200
{'args': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Host': 'www.httpbin.org', 'User-Agent': 'Python/3.6 aiohttp/3.5.4'}, 'origin': '221.218.215.124, 221.218.215.124', 'url': 'https://www.httpbin.org/get'}
{
"args": {},
"headers": {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Host": "www.httpbin.org",
"User-Agent": "Python/3.6 aiohttp/3.5.4"
},
"origin": "221.218.215.124, 221.218.215.124",
"url": "https://www.httpbin.org/get"
}
多进程+协程下载图片:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import asyncio
import aiohttp
import string
import requests
import time
import random
import logging
from multiprocessing import Process, Pool, get_logger, log_to_stderr
from concurrent.futures import ProcessPoolExecutor
# Project root: two directory levels above this file.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Debug-level logging; each record is prefixed with the thread name.
logging.basicConfig(
level=logging.DEBUG,
format='%(threadName)-10s:%(message)s'
)
# Sample image URLs used to exercise the downloader.
TEST_URLS = [
'https://source.unsplash.com/random',
'https://source.unsplash.com/user/erondu/1600x900',
'http://via.placeholder.com/350x150',
'http://via.placeholder.com/350x150/1c2b3c/999',
]
# Downloaded images are written here (BASE_DIR/download).
DOWNLOAD_DIR = os.path.join(BASE_DIR, "download")
def make_temp_name(count=5, f='.jpg'):
    """Return a random file name: *count* alphanumeric characters plus suffix *f*."""
    logging.debug('make_temp_name is start....')
    alphabet = string.ascii_letters + string.digits
    return ''.join(random.choice(alphabet) for _ in range(count)) + f
async def download_image(url):
    """Stream the image at *url* into DOWNLOAD_DIR under a random file name.

    The response body is read in 1 KiB chunks so large images are never
    held fully in memory.
    """
    logging.debug('download_image start.....')
    # Make sure the target directory exists before opening the file;
    # the original raised FileNotFoundError on a fresh checkout.
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            filename = os.path.join(DOWNLOAD_DIR, make_temp_name(f='-a.jpg'))
            with open(filename, "wb") as f:
                while True:
                    chunk = await resp.content.read(1024)
                    if not chunk:
                        break
                    f.write(chunk)
def main():
    """Download all TEST_URLS and report the elapsed time.

    NOTE(review): the original created a ProcessPoolExecutor but never
    submitted any work to it -- every download ran serially on the main
    process (the log output shows only MainThread).  The downloads are
    now scheduled concurrently on the event loop via asyncio.gather.
    """
    start_time = time.time()
    loop = asyncio.get_event_loop()
    try:
        tasks = [download_image(url) for url in TEST_URLS]
        loop.run_until_complete(asyncio.gather(*tasks))
    except Exception as e:
        print(e)
    print(f"爬取完成,用时时间:{time.time() - start_time}秒....")
if __name__ == "__main__":
    # Route multiprocessing's internal logging to stderr for debugging.
    log_to_stderr()
    get_logger()
    main()
>>>
MainThread:Using selector: SelectSelector
[DEBUG/MainProcess] created semlock with handle 460
[DEBUG/MainProcess] created semlock with handle 420
[DEBUG/MainProcess] Queue._after_fork()
[DEBUG/MainProcess] created semlock with handle 756
MainThread:download_image start.....
MainThread:make_temp_name is start....
MainThread:download_image start.....
MainThread:make_temp_name is start....
MainThread:download_image start.....
MainThread:make_temp_name is start....
MainThread:download_image start.....
MainThread:make_temp_name is start....
爬取完成,用时时间:9.381403923034668秒....
[INFO/MainProcess] process shutting down
[DEBUG/MainProcess] running all "atexit" finalizers with priority >= 0
[DEBUG/MainProcess] running the remaining "atexit" finalizers
多进程+协程(Mark2)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import asyncio
import aiohttp
import string
import requests
import time
import random
import logging
from multiprocessing import Process, Pool, get_logger, log_to_stderr
from concurrent.futures import ProcessPoolExecutor
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
logging.basicConfig(
level=logging.DEBUG,
format='%(threadName)-10s:%(message)s'
)
TEST_URLS = [
'https://source.unsplash.com/random',
'https://source.unsplash.com/user/erondu/1600x900',
'http://via.placeholder.com/350x150',
'http://via.placeholder.com/350x150/1c2b3c/999',
'http://img5.imgtn.bdimg.com/it/u=796460492,3306564261&fm=26&gp=0.jpg',
'http://img2.imgtn.bdimg.com/it/u=1782917320,24227842&fm=26&gp=0.jpg',
'http://img1.imgtn.bdimg.com/it/u=2665441243,1857925582&fm=26&gp=0.jpg',
'http://img2.imgtn.bdimg.com/it/u=2931291472,233235010&fm=26&gp=0.jpg',
'http://img5.imgtn.bdimg.com/it/u=744077169,3705624624&fm=26&gp=0.jpg',
'http://img2.imgtn.bdimg.com/it/u=872389273,3559301897&fm=26&gp=0.jpg',
'http://img0.imgtn.bdimg.com/it/u=1404919429,1733398877&fm=26&gp=0.jpg',
'http://img1.imgtn.bdimg.com/it/u=2082542552,294635837&fm=26&gp=0.jpg',
'https://www.baidu.com/img/bd_logo1.png'
]
DOWNLOAD_DIR = os.path.join(BASE_DIR, "download")
def make_temp_name(count=5, f='.jpg'):
    """Build a random temporary file name of *count* alphanumerics ending in *f*."""
    logging.debug('make_temp_name is start....')
    pool = string.ascii_letters + string.digits
    chars = [random.choice(pool) for _ in range(count)]
    return ''.join(chars) + f
def afetch_url():
    """Return the URLs to crawl (currently the static TEST_URLS list)."""
    return TEST_URLS
async def download_image(url, loop):
    """Stream the image at *url* into DOWNLOAD_DIR under a random file name.

    *loop* is accepted for interface compatibility with acrawler() but is
    not used inside the coroutine.
    """
    logging.debug('download_image start.....')
    async with aiohttp.ClientSession() as session:
        # BUG FIX: the original read "async with session.get(url) as 2 resp:",
        # which is a SyntaxError and made the whole module unimportable.
        async with session.get(url) as resp:
            filename = os.path.join(DOWNLOAD_DIR, make_temp_name(f='-a.jpg'))
            with open(filename, "wb") as f:
                while True:
                    image = await resp.content.read(1024)
                    if not image:
                        break
                    f.write(image)
async def acrawler(loop):
    """Download every URL reported by afetch_url() concurrently."""
    logging.debug('starting async crawler...')
    downloads = [download_image(url, loop) for url in afetch_url()]
    await asyncio.gather(*downloads)
def download_images_2():
    """Synchronous baseline: download TEST_URLS one by one with requests."""
    start_time = time.time()
    for url in TEST_URLS:
        target = os.path.join(DOWNLOAD_DIR, make_temp_name(f='-a.jpg'))
        payload = requests.get(url).content
        if payload:
            with open(target, 'wb') as f:
                f.write(payload)
    print(f"爬取完成,用时时间:{time.time() - start_time}秒....")
# def run_async_crawler():
# loop = asyncio.get_event_loop()
# loop.run_until_complete(acrawler(loop))
def get_download():
    """Run the async crawler to completion and report the elapsed time.

    NOTE(review): the original wrapped the event loop call in a
    ProcessPoolExecutor that never received any work -- concurrency
    already comes from asyncio.gather inside acrawler(), so the executor
    was dead code and has been removed.
    """
    start_time = time.time()
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(acrawler(loop))
    except Exception as e:
        print(e)
    print(f"爬取完成,用时时间:{time.time() - start_time}秒....")
if __name__ == "__main__":
    # Surface multiprocessing's internal logging on stderr, then run both
    # the async crawl and the synchronous baseline for comparison.
    log_to_stderr()
    get_logger()
    get_download()
    download_images_2()
>>>
MainThread:Using selector: SelectSelector
[DEBUG/MainProcess] created semlock with handle 568
[DEBUG/MainProcess] created semlock with handle 848
[DEBUG/MainProcess] Queue._after_fork()
[DEBUG/MainProcess] created semlock with handle 476
MainThread:starting async crawler...
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
爬取完成,用时时间:6.389447927474976秒....
加强封装,Class:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import asyncio
import aiohttp
import sys
import string
import requests
import time
import random
import logging
from multiprocessing import Process, Pool, get_logger, log_to_stderr
from concurrent.futures import ProcessPoolExecutor
# Project root: two directory levels above this file.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Debug-level logging; each record is prefixed with the thread name.
logging.basicConfig(
level=logging.DEBUG,
format='%(threadName)-10s:%(message)s'
)
# Sample image URLs used to exercise the downloader.
TEST_URLS = [
'https://source.unsplash.com/random',
'https://source.unsplash.com/user/erondu/1600x900',
'http://via.placeholder.com/350x150',
'http://via.placeholder.com/350x150/1c2b3c/999',
'http://img5.imgtn.bdimg.com/it/u=796460492,3306564261&fm=26&gp=0.jpg',
'http://img2.imgtn.bdimg.com/it/u=1782917320,24227842&fm=26&gp=0.jpg',
'http://img1.imgtn.bdimg.com/it/u=2665441243,1857925582&fm=26&gp=0.jpg',
'http://img2.imgtn.bdimg.com/it/u=2931291472,233235010&fm=26&gp=0.jpg',
'http://img5.imgtn.bdimg.com/it/u=744077169,3705624624&fm=26&gp=0.jpg',
'http://img2.imgtn.bdimg.com/it/u=872389273,3559301897&fm=26&gp=0.jpg',
'http://img0.imgtn.bdimg.com/it/u=1404919429,1733398877&fm=26&gp=0.jpg',
'http://img1.imgtn.bdimg.com/it/u=2082542552,294635837&fm=26&gp=0.jpg',
'https://www.baidu.com/img/bd_logo1.png'
]
# Downloaded images are written here (BASE_DIR/download).
DOWNLOAD_DIR = os.path.join(BASE_DIR, "download")
class Async:
    """Image downloader exposing both an async (aiohttp) and a
    synchronous (requests) crawl over the module-level TEST_URLS.

    count -- number of random characters in generated file names.
    filem -- file-name suffix, e.g. '.jpg'.
    """

    def __init__(self, count, filem):
        self.count = count
        self.filem = filem
        self.loop = asyncio.get_event_loop()

    def make_temp_name(self):
        """Return a random file name: self.count alphanumerics plus self.filem."""
        logging.debug('make_temp_name is start....')
        alphabet = string.ascii_letters + string.digits
        return ''.join(random.choice(alphabet) for _ in range(self.count)) + self.filem

    def afetch_url(self):
        """Return the URLs to crawl (currently the static TEST_URLS list)."""
        return TEST_URLS

    async def download_image(self, url):
        """Stream one image into DOWNLOAD_DIR in 1 KiB chunks."""
        logging.debug('download_image start.....')
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                # BUG FIX: make_temp_name() already appends self.filem, so the
                # original "make_temp_name() + self.filem" produced a doubled
                # suffix such as "Ab3xZ.jpg.jpg".
                filename = os.path.join(DOWNLOAD_DIR, self.make_temp_name())
                with open(filename, "wb") as f:
                    while True:
                        image = await resp.content.read(1024)
                        if not image:
                            break
                        f.write(image)

    async def acrawler(self, loop):
        """Download all URLs concurrently via asyncio.gather."""
        logging.debug('starting async crawler...')
        tasks = [self.download_image(url) for url in self.afetch_url()]
        await asyncio.gather(*tasks)

    def get_download(self):
        """Run the async crawl to completion and report the elapsed time.

        NOTE(review): the original created a ProcessPoolExecutor that never
        received any work -- concurrency comes from asyncio.gather, so the
        executor was dead code and has been removed.
        """
        start_time = time.time()
        try:
            self.loop.run_until_complete(self.acrawler(self.loop))
        except Exception as e:
            print(e)
        print(f"爬取完成,用时时间:{time.time() - start_time}秒....")

    def download_images_2(self):
        """Synchronous baseline crawl with requests."""
        start_time = time.time()
        for url in TEST_URLS:
            # Same doubled-suffix fix as download_image() above.
            image_name = os.path.join(DOWNLOAD_DIR, self.make_temp_name())
            url_data = requests.get(url).content
            if url_data:
                with open(image_name, 'wb') as f:
                    f.write(url_data)
        print(f"爬取完成,用时时间:{time.time() - start_time}秒....")
def main():
    """Interactive menu loop: pick a crawl strategy or quit."""
    log_to_stderr()
    get_logger()
    menu = {
        "1" : "普通爬取",
        "2" : "多进程+协程爬取",
        "Q" : "退出"
    }
    while True:
        for key, label in menu.items():
            print(f"{key} : {label}")
        choice = input("请您输入:").strip().upper()
        if choice == "1":
            Async(5, '.jpg').download_images_2()
        elif choice == "2":
            Async(5, '.jpg').get_download()
        elif choice == "Q":
            sys.exit()


if __name__ == "__main__":
    main()
# TODO: 测试笔记(Mark)
#-------------------------------------
# def download_images_2():
# start_time = time.time()
# for i in TEST_URLS:
# image_name = os.path.join(DOWNLOAD_DIR, make_temp_name(f='-a.jpg'))
# url_data = requests.get(i).content
# if url_data:
# with open(image_name, 'wb') as f:
# f.write(url_data)
# print(f"爬取完成,用时时间:{time.time() - start_time}秒....")
# def run_async_crawler():
# loop = asyncio.get_event_loop()
# loop.run_until_complete(acrawler(loop))
# TODO: 测试笔记(Mark)
# ------------------------------
# for url in TEST_URLS:
# look = asyncio.get_event_loop()
# look.run_until_complete(download_image(url))
# try:
# with ProcessPoolExecutor(4) as executor:
# for url in TEST_URLS:
# look.run_until_complete(download_image(url))
# except Exception as e:
# print(e)
Mark(html)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import asyncio
import aiohttp
import logging
from bs4 import BeautifulSoup
from multiprocessing import Process, Pool, get_logger, log_to_stderr
from concurrent.futures import ProcessPoolExecutor
# Debug-level logging; each record is prefixed with the thread name.
logging.basicConfig(
level=logging.DEBUG,
format='%(threadName)-10s:%(message)s'
)
class Async:
    """Fetch a single URL with aiohttp and save the response body to a file.

    URL       -- page to fetch.
    file_name -- path the HTML text is written to.
    """

    def __init__(self, URL, file_name):
        self.URL = URL
        self.file_name = file_name
        self.loop = asyncio.get_event_loop()

    async def get_url(self):
        """GET self.URL and hand the decoded body to save_html()."""
        logging.debug('get_url start....')
        async with aiohttp.ClientSession() as session:
            async with session.get(self.URL) as resp:
                data = await resp.text()
                self.save_html(data)

    def save_html(self, data):
        """Write *data* to self.file_name; return False for empty data.

        BUG FIX: the original opened (and truncated) the file before
        checking *data*, leaving an empty file behind whenever data was
        falsy.  The file is now opened only when there is something to
        write, with an explicit UTF-8 encoding so arbitrary HTML can be
        written regardless of the platform's default encoding.
        """
        logging.debug('save_html function start.....')
        if not data:
            return False
        with open(self.file_name, 'w', encoding='utf-8') as f:
            f.write(data)

    def run_url(self):
        """Drive get_url() to completion on the instance's event loop.

        NOTE(review): the original wrapped this in a ProcessPoolExecutor
        that never received any work -- the executor was dead code and has
        been removed.
        """
        logging.debug('run_url function start.....')
        try:
            self.loop.run_until_complete(self.get_url())
        except Exception as e:
            print(e)
if __name__ == "__main__":
    # Surface multiprocessing's internal logging on stderr, then fetch
    # the sample page and save it to disk.
    log_to_stderr()
    get_logger()
    crawler = Async('https://edu.51cto.com/t/user/ianswer/id-3096.html', 'yankai.txt')
    crawler.run_url()