# 漫画批量下载 (batch comic downloader)

#!/usr/bin/env python  
# encoding: utf-8  
#!/usr/bin/env python
# encoding: utf-8
from requests_html import HTMLSession
import aiohttp
import asyncio
import hashlib
import os
import re
from traceback import format_exc
from multiprocessing import Pool as ThreadPool
import base64
from cryptography.fernet import Fernet
# First comic index to crawl (inclusive).
strat_num=227000
# Last comic index to crawl (inclusive).
end_num=250606
# Base64-wrapped key used to decrypt the URL templates stored in this file.
key="X0JxSkg4NFVBQVBPODlUM0VzT1liNnloeWtLcndkSldRT2xURzQ4MEM5RT0="
# Raw strings keep the regex escapes (``\[``) out of Python's own
# string-escape processing, which warns about unknown escapes.
# Captures the value assigned to ``var picCount`` in the page script.
page_num_pat=re.compile(r"var picCount.=.(.*?);")
# Captures the value assigned to ``picAy[0]`` in the page script.
page_id_pat=re.compile(r"picAy\[0\].=.(.*?);")

def aes_cbc_decrypt(message):
    """Decrypt *message* with the module-level ``key`` and return it as text.

    Despite the name, the work is done by ``cryptography.fernet.Fernet``:
    ``key`` is base64-decoded once to obtain the Fernet key, and *message*
    is formatted to a string and UTF-8 encoded to obtain the token.

    Args:
        message: The encrypted token (normally a ``str``).

    Returns:
        The decrypted payload decoded as UTF-8.
    """
    fernet_key = base64.b64decode(key).decode("utf8")
    cipher = Fernet(fernet_key)
    token = format(message).encode("utf8")
    plaintext = cipher.decrypt(token)
    return plaintext.decode("utf8")

# XPath of the comic title.
cosmic_name="//head//title/text()"
# XPath used for the comic id (the ``src`` of the current page image).
cosmic_id="//img[@id='curPic']/@src"
# Decrypted template for a comic's index page; formatted with the comic index.
main_url=aes_cbc_decrypt("gAAAAABbNdhqCnxkaJwZ2VL7HUXne_IOic-NsHtE30W-J68oecVmgm0dzO_lLXgTlI7a5_NbUWlkGm7FqLwY81XIBddNWbac4rCgBA9NFAECsNISkhTvdRl4uDSaS6bHY8sbcJJwO13Z")
# One index-page URL per comic index in [strat_num, end_num].
cosmic_urllist=[main_url.format(i) for i in range(strat_num,end_num+1)]
# XPath of the total page count shown on the page.
pagenum_xpath="//font[@id='TotalPage']/text()"
# Decrypted template for a single image; formatted with the comic id, the
# page number and the file extension ("jpg").
full_url=aes_cbc_decrypt("gAAAAABbNdk5FLeX55hOiDAXxgCwwYmGrokYvU3Nd1AOYuOE7OdIEcBdAmSG_Q3kOltealBKMOgUBKDuPUJtzFFPwqoxL-FUip"
                         "VNQU-JmBW_K5qxgzTQ3IOla_F61Rscy0fJOaN-mEXKPqrakctyDRN7OVm1LARTMhylQELLuBnJgIT4WXilchg=")
# Blocking HTTP session used to fetch the index pages (created once).
session=HTMLSession()
# Limits how many image downloads run at the same time.
sema = asyncio.Semaphore(5)

async def getbuff(url,c_name):
    """Fetch the image at *url* and pass its bytes to ``getimg``.

    A fresh aiohttp client session is opened for every call, with TLS
    certificate verification switched off.  If the response body is empty,
    the request is repeated once with ``.jpg`` replaced by ``.png`` in the
    URL, and that second body (and URL) is what gets saved.

    Args:
        url: Address of the image to download.
        c_name: Comic title, forwarded unchanged to ``getimg``.
    """
    connector = aiohttp.TCPConnector(verify_ssl=False)
    async with aiohttp.ClientSession(connector=connector) as http:
        async with http.get(url, timeout=15) as first_resp:
            payload = await first_resp.read()
            if len(payload) == 0:
                # Empty body: try the same image with a .png extension.
                url = url.replace(".jpg", ".png")
                async with http.get(url, timeout=15) as retry_resp:
                    payload = await retry_resp.read()
            print("nowurl:", url)
            getimg(url, payload, c_name)

async def run(url,c_name):
    """Download one image while holding a slot of the shared semaphore.

    Args:
        url: Address of the image to download.
        c_name: Comic title, forwarded to ``getbuff``.
    """
    # ``with (await sema)`` was removed from asyncio in Python 3.9;
    # ``async with`` is the supported way to acquire and release it.
    async with sema:
        await getbuff(url, c_name)
#
def spider(url):
    """Download every page image of the comic whose index page is *url*.

    The page is fetched with the blocking ``session``; the title, the comic
    id and the page count are extracted from it, one image URL per page is
    built from ``full_url``, and all of them are downloaded concurrently on
    the asyncio event loop.  Responses other than HTTP 200 are skipped.

    Any error while fetching or parsing the page is printed as a traceback
    instead of being raised, so one bad comic does not stop the crawl.
    A failure of an individual image is reported and does not affect the
    remaining images.

    Args:
        url: Address of the comic's index page.
    """
    try:
        req = session.get(url, timeout=15)
        if req.status_code != 200:
            return

        name = req.html.xpath(cosmic_name)[0]
        # The id is the second-to-last path segment of the first picture path.
        comic_id = page_id_pat.findall(req.text)[0].split('/')[-2]
        max_page = int(page_num_pat.findall(req.text)[0])
        page_urls = [full_url.format(comic_id, i, "jpg") for i in range(1, max_page + 1)]
        if not page_urls:
            # Nothing to download; waiting on an empty set is an error.
            return

        try:
            event_loop = asyncio.get_event_loop()
        except RuntimeError:
            # No current loop is available here: create and install one.
            event_loop = asyncio.new_event_loop()
            asyncio.set_event_loop(event_loop)

        # Wrap the coroutines in tasks explicitly: newer Python versions no
        # longer accept bare coroutine objects in asyncio.wait().
        tasks = [event_loop.create_task(run(page_url, name)) for page_url in page_urls]
        results = event_loop.run_until_complete(
            asyncio.gather(*tasks, return_exceptions=True)
        )
        for page_url, outcome in zip(page_urls, results):
            if isinstance(outcome, BaseException):
                print("failed:", page_url, repr(outcome))
    except Exception:
        print(format_exc())

def getimg(url,buff,c_name):
    """Write one downloaded image into the directory of its comic.

    Images are stored as ``<cwd>/comics_images/<title>/<file name>``, where
    the file name is the last ``/``-separated segment of *url*.  When the
    title cannot serve as a single directory name (it is empty, contains a
    path separator or NUL, or the OS refuses it, e.g. because it is too
    long), the directory is named after the MD5 hex digest of the title and
    the real title is recorded in ``title.txt`` inside it.  An image file
    that already exists is left untouched.

    Args:
        url: Image URL; only its last path segment is used.
        buff: Raw image bytes to write.
        c_name: Comic title taken from the page.
    """
    # No leading slash on "comics_images": an absolute component would make
    # os.path.join discard the working directory.
    base_dir = os.path.join(os.getcwd(), "comics_images")
    title_dir = os.path.join(base_dir, c_name)
    md5name = hashlib.md5(c_name.encode("utf-8")).hexdigest()
    hashed_dir = os.path.join(base_dir, md5name)
    file_name = url.split('/')[-1]

    # The title comes from a remote page; only accept it as a directory name
    # when it is one plain path component, so it cannot point elsewhere.
    title_usable = (
        c_name not in ("", ".", "..")
        and "\0" not in c_name
        and os.path.basename(c_name) == c_name
    )

    target_dir = None
    # An existing hashed directory means the title was rejected earlier.
    if title_usable and not os.path.isdir(hashed_dir):
        try:
            os.makedirs(title_dir, exist_ok=True)
            target_dir = title_dir
        except (OSError, ValueError):
            target_dir = None

    if target_dir is None:
        os.makedirs(hashed_dir, exist_ok=True)
        title_note = os.path.join(hashed_dir, "title.txt")
        if not os.path.exists(title_note):
            with open(title_note, "w", encoding="utf-8") as fs:
                fs.write(c_name)
        target_dir = hashed_dir

    # Save every image of the comic, not just the one that created the
    # directory; skip files that were already downloaded.
    image_path = os.path.join(target_dir, file_name)
    if not os.path.exists(image_path):
        print("savepath:", image_path)
        with open(image_path, 'wb') as fs:
            fs.write(buff)


if __name__ == '__main__':
    # Crawl all comic index pages with four worker processes; the name
    # ``ThreadPool`` is this file's alias for ``multiprocessing.Pool``.
    worker_pool = ThreadPool(4)
    with worker_pool:
        worker_pool.map(spider, cosmic_urllist)
# 猜你喜欢 ("You may also like" — leftover text from the web page this script was copied from)