# Environment: Python 3.6.4 (环境:Python 3.6.4)
import os
import re
from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup


def find_MaxPage():
    """Scrape the tab index page and return the digits found in the
    'last page' pagination link (a list of digit strings, e.g. ['57']).
    """
    url = 'http://m.17jita.com/tab/img/'
    html = requests.get(url)
    # Site is served in GB2312; force the encoding before reading .text.
    html.encoding = 'gb2312'
    soup = BeautifulSoup(html.text, "html.parser")
    page = soup.find('a', class_='last')
    return re.findall(r'\d+', page.text)


def down_all_image(url, path):
    """Download every image page of one tab (guitar score) into *path*.

    url  -- detail-page URL containing 'img/<id>.html'; the numeric id is
            extracted and used to build the per-image page URLs.
    path -- existing directory to write '<n>.jpg' files into.

    Pages are probed sequentially (1, 2, ...); the first page whose HTML
    yields no image match raises IndexError, which ends the loop.
    """
    # NOTE: dot escaped — the original 'img/(.*?).html' matched any char.
    page = re.findall(r'img/(.*?)\.html', url)
    for imagepage in range(1, 100):
        try:
            if imagepage == 1:
                url = 'https://www.17jita.com/tab/img/' + str(page[0]) + '.html'
            else:
                url = ('https://www.17jita.com/tab/img/' + str(page[0])
                       + '_' + str(imagepage) + '.html')
            html = requests.get(url)
            html.encoding = 'gb2312'
            pattern2 = r'<a href=".*?" target="_blank.*?<img alt=".*?" src="(.*?)"></a>'
            imageurl = re.findall(pattern2, html.text, re.S)
            image = requests.get(imageurl[0])
            # os.path.join is portable; the original "\%s.jpg" only worked
            # because "\%" happens to be a literal backslash on Windows.
            with open(os.path.join(path, '%s.jpg' % imagepage), 'wb') as f:
                f.write(image.content)
        except IndexError:
            # No image link found on this page: the tab is fully downloaded.
            print('此图谱全部下载完毕')
            break


def get_one_page_url(url):
    """Yield {'url': ..., 'title': ...} dicts for every tab listed on one
    index page of the site.
    """
    html = requests.get(url)
    html.encoding = 'gb2312'
    pattern = r'<dt class="xs3"><a href="(.*?)" target="_blank" style="">(.*?)</a> </dt>'
    item = re.findall(pattern, html.text, re.S)
    for i in item:
        yield {
            'url': 'http://m.17jita.com/' + i[0],
            'title': i[1]
        }


def main(page):
    """Worker entry point: download every tab listed on index page *page*
    into its own directory under D:/1/.
    """
    url = 'https://www.17jita.com/tab/img/index.php?page=' + str(page)
    for each in get_one_page_url(url):
        print('开始下载' + str(each['title']))
        # NOTE(review): titles may contain characters invalid in directory
        # names — verify against real data if makedirs ever fails here.
        path = 'D:/1/' + each['title']
        # exist_ok avoids FileExistsError when the script is re-run.
        os.makedirs(path, exist_ok=True)
        # os.chdir removed: it mutated each worker process's CWD and was
        # unused, since all file writes pass the explicit `path`.
        down_all_image(url=each['url'], path=path)


if __name__ == '__main__':
    Musicpage = int(find_MaxPage()[0])
    print('此网站共有' + str(Musicpage) + '页可以爬取')
    Musicpage = int(input('请问您输入爬取页数:'))
    pool = Pool()
    # NOTE(review): range starts at 0 — page 0 may duplicate page 1 on the
    # site; confirm before changing.
    pool.map(main, [page for page in range(0, Musicpage + 1)])
    print('任务已完成,全部下载完毕')