多进程对17吉他网整站批量爬取吉他谱

环境:Python 3.6.4
import requests,re
from bs4 import BeautifulSoup
import os
from multiprocessing import Pool

def find_MaxPage():
    """Return the digit groups found in the "last page" link of the tab index.

    Fetches the mobile tab index page, decodes it as gb2312, locates the
    ``<a class="last">`` anchor and extracts every run of digits from its
    text.

    Returns:
        list[str]: Digit strings in the order they appear in the anchor text.
        The list is empty when the anchor text contains no digits.

    Raises:
        ValueError: If the page contains no ``<a class="last">`` element.
        requests.RequestException: If the HTTP request fails or times out.
    """
    url = 'http://m.17jita.com/tab/img/'
    # A timeout keeps an unresponsive server from blocking the script forever.
    html = requests.get(url, timeout=30)
    html.encoding = 'gb2312'
    soup = BeautifulSoup(html.text, "html.parser")
    last_link = soup.find('a', class_='last')
    if last_link is None:
        # Previously this surfaced as an opaque AttributeError on None.text.
        raise ValueError('no <a class="last"> pagination link found at ' + url)
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    return re.findall(r'\d+', last_link.text)

def down_all_image(url, path, max_pages=99):
    """Download the images of one tab into *path* as ``1.jpg``, ``2.jpg``, ...

    The tab id is taken from the ``img/<id>.html`` part of *url*. Page 1 is
    requested as ``<id>.html`` and page n (n > 1) as ``<id>_<n>.html`` on
    www.17jita.com. From each page the first image URL matched by the image
    pattern is downloaded. The loop stops at the first page that yields no
    image, or after *max_pages* pages.

    Args:
        url: Any URL containing ``img/<id>.html``.
        path: Existing directory that receives the ``<n>.jpg`` files.
        max_pages: Upper bound on the number of pages tried (default 99,
            matching the previously hard-coded limit).

    Raises:
        requests.RequestException: If an HTTP request fails or times out.
        OSError: If an image file cannot be written.
    """
    # Raw string with an escaped dot so only a literal ".html" terminates the id.
    match = re.search(r'img/(.*?)\.html', url)
    if match is None:
        # Kept from the original: an unrecognised URL ends the download with
        # the same message instead of raising.
        print('此图谱全部下载完毕')
        return
    tab_id = match.group(1)

    # Compiled once; the pattern does not change between pages.
    image_pattern = re.compile(
        '<a href=".*?" target="_blank.*?<img alt=".*?" src="(.*?)"></a>', re.S)

    for imagepage in range(1, max_pages + 1):
        suffix = '' if imagepage == 1 else '_' + str(imagepage)
        page_url = 'https://www.17jita.com/tab/img/' + tab_id + suffix + '.html'
        html = requests.get(page_url, timeout=30)
        html.encoding = 'gb2312'
        imageurls = image_pattern.findall(html.text)
        if not imageurls:
            # An explicit check replaces the broad try/except IndexError that
            # also wrapped the network and file operations.
            print('此图谱全部下载完毕')
            break
        image = requests.get(imageurls[0], timeout=30)
        # os.path.join instead of a hard-coded "\" (which was also an invalid
        # escape sequence) so the file lands inside *path* on every platform.
        with open(os.path.join(path, '%s.jpg' % imagepage), 'wb') as f:
            f.write(image.content)



def get_one_page_url(url):
    """Lazily yield one ``{'url', 'title'}`` dict per tab entry on a listing page.

    The page at *url* is fetched and decoded as gb2312. Every
    ``<dt class="xs3">`` link matched by the pattern contributes one dict:
    ``'url'`` is the captured href prefixed with ``http://m.17jita.com/`` and
    ``'title'`` is the captured link text.
    """
    response = requests.get(url)
    response.encoding = 'gb2312'
    entry_pattern = '<dt class="xs3"><a href="(.*?)" target="_blank"  style="">(.*?)</a> </dt>'
    # Two capture groups, so findall returns (href, title) tuples.
    for href, title in re.findall(entry_pattern, response.text, re.S):
        entry = {}
        entry['url'] = 'http://m.17jita.com/' + href
        entry['title'] = title
        yield entry



def main(page, save_root='D:/1/'):
    """Download every tab listed on one index page.

    For each entry yielded by ``get_one_page_url`` for listing page *page*,
    a directory named after the entry title is created under *save_root* and
    ``down_all_image`` is called to fill it.

    Args:
        page: Listing page number, appended to the ``index.php?page=`` URL.
        save_root: Directory under which one folder per tab is created
            (default ``'D:/1/'``, the previously hard-coded location).
    """
    url = 'https://www.17jita.com/tab/img/index.php?page=' + str(page)
    for each in get_one_page_url(url):
        title = str(each['title'])
        print('开始下载' + title)
        # Titles come straight from the page markup; replace characters that
        # are not allowed in Windows file names (and path separators) so a
        # single odd title cannot break or redirect the directory creation.
        folder = re.sub(r'[\\/:*?"<>|]', '_', title).strip() or 'untitled'
        path = os.path.join(save_root, folder)
        # exist_ok: re-running the script or meeting a duplicate title must
        # not abort the whole page with FileExistsError.
        os.makedirs(path, exist_ok=True)
        # No os.chdir here: down_all_image receives the target path
        # explicitly, and changing the process-wide working directory made
        # relative save roots nest deeper on every iteration.
        down_all_image(url=each['url'], path=path)


if __name__ == '__main__':
    # Ask the site how many listing pages exist and show it to the user.
    max_page = int(find_MaxPage()[0])
    print('此网站共有'+str(max_page)+'页可以爬取')
    Musicpage = int(input('请问您输入爬取页数:'))
    # Keep the request within what the site reported; negative input becomes 0.
    Musicpage = max(0, min(Musicpage, max_page))
    # The context manager releases the worker processes once map() returns.
    with Pool() as pool:
        # Pages 1..N: the previous range(0, N+1) scheduled N+1 pages when the
        # user asked for N.
        pool.map(main, range(1, Musicpage + 1))
    print('任务已完成,全部下载完毕')




猜你喜欢

转载自blog.csdn.net/qq_41686130/article/details/80011754