# Needs to get past the firewall: uses a local proxy on port 1080 (SSR works).
# For entertainment only; never use this for illegal purposes.
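#
# A quick sanity check that the local proxy is reachable before launching the
# pool (a sketch, reusing the same proxy settings as main() below; the timeout
# value is an assumption, not part of the original script):
#
#     import requests
#     proxies = {'https': 'https://127.0.0.1:1080'}
#     requests.get('https://wuso.me/', proxies=proxies, timeout=10)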
import requests
from lxml import etree
import multiprocessing
import os
PATH = os.getcwd()

def download(url, proxies, li, n):
    """Fetch one thread page and download every image it links to.

    XPaths of interest:
        //dd/p/a/@href
        //div/ignore_js_op/img/@file
    """
    print('Fetching:', url)
    items3 = list()
    # req = requests.get(url, headers=headers, proxies=proxies)
    req = requests.get(url, proxies=proxies).text
    root = etree.HTML(req)
    items1 = root.xpath('//dd/p[@class="mbn"]/a/@href')
    items2 = root.xpath('//ignore_js_op/img/@file')
    for i in items2:
        url_ = 'https://wuso.me/' + i  # image paths are site-relative
        items3.append(url_)
    items = items1 + items3
    with open(PATH + '\\wuso\\txt\\url_pic_page.txt', 'a+') as f:
        for i in items:
            f.write(i + '\n')
    li += items  # shared Manager list: URLs collected across all processes
    print('Collected {} image URLs'.format(len(li)))
    for i in items:
        r = requests.get(url=i, proxies=proxies).content
        with open(PATH + '\\wuso\\{}.jpg'.format(n.value), 'wb') as f:
            print('Saving image #{}'.format(n.value))
            n.value += 1  # shared Manager counter names the output files
            f.write(r)

def crul(url, proxies, li, n):
    """Scrape one forum listing page for thread links:
    //*[@id="waterfall"]/li/div[@class="c cl"]/a/@href
    """
    req = requests.get(url, proxies=proxies).content
    root = etree.HTML(req)
    items = root.xpath('//*[@id="waterfall"]/li/div[@class="c cl"]/a/@href')
    with open(PATH + '\\wuso\\txt\\page.txt', 'a+') as f:
        for i in items:
            print('Writing: {}'.format(i))
            f.write(i + '\n')
    for temp in items:
        download(temp, proxies, li, n)

def main():
    proxies = {
        # Local proxy from the note at the top. If SSR exposes SOCKS5 on 1080
        # (the usual setup), 'socks5://127.0.0.1:1080' with requests[socks]
        # installed may be the scheme you need instead.
        'https': 'https://127.0.0.1:1080'
    }
    p = multiprocessing.Pool(40)
    li = multiprocessing.Manager().list()        # shared: all collected URLs
    n = multiprocessing.Manager().Value('i', 1)  # shared: integer image counter
    for i in range(1, 226):
        url = 'https://wuso.me/forum-photos-{}.html'.format(i)
        print('Listing page {}'.format(i))
        p.apply_async(crul, (url, proxies, li, n))
    p.close()
    p.join()

if __name__ == '__main__':
    try:
        os.makedirs(PATH + '\\wuso\\txt\\')
    except Exception:
        pass  # output directories already exist
    print(PATH + '\\wuso\\txt\\')
    main()
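
# Note: `n.value += 1` in download() is a read-modify-write shared by up to 40
# processes, so two workers can read the same number and overwrite each other's
# file. A sketch of one fix (the `lock` here is hypothetical: it would be
# created in main() and passed through crul() into download()):
#
#     lock = multiprocessing.Manager().Lock()
#     ...
#     with lock:
#         filename = PATH + '\\wuso\\{}.jpg'.format(n.value)
#         n.value += 1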