工具:python3
本文主要介绍如何用 Python 实现对动漫之家漫画的爬取。
如果对模块的安装有疑问,请自行百度;如果代码格式有问题,也可以在下方留言回复。
与 Java 篇一样,本文分为无框架实现和 Scrapy 框架实现两部分;步骤和实现原理都是一样的,可以参考前篇。
代码不再详细注释
无框架
"""Download every page image of one comic from manhua.dmzj.com.

The script runs in two phases: a headless Chrome session walks the comic's
chapter list and records the image URL of every page, then the images are
fetched with urllib into one folder per chapter.
"""
import os
import time
from urllib.request import build_opener
from urllib.request import install_opener
from urllib.request import urlretrieve

CHROMEDRIVER_PATH = r"D:\chromedriver_win32\chromedriver.exe"
COMIC_URL = "https://manhua.dmzj.com/shiling"
DOWNLOAD_ROOT = "D:/manhua"

# Headers installed on the global urllib opener before any image is fetched.
REQUEST_HEADERS = [
    ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36'),
    ("Referer", "https://manhua.dmzj.com/"),
]


def chapter_dir_name(title):
    """Return the folder name for a chapter: the text before the first '-', stripped."""
    return title.split('-')[0].strip()


def image_file_name(page_label, img_url):
    """Return '<page_label>.<ext>', where ext is the text after the last '.' in img_url."""
    return page_label + "." + img_url.split('.')[-1]


def collect_image_entries(comic_url, driver_path):
    """Scrape (title, image_url, page_label) tuples for every page of every chapter.

    Args:
        comic_url: URL of the comic's chapter-list page.
        driver_path: Path to the chromedriver executable.

    Returns:
        A list of (title, image_url, page_label) tuples in page order.
    """
    # Imported here rather than at module level so the pure helpers above can
    # be imported (and tested) on a machine without Selenium installed.
    # NOTE(review): executable_path / chrome_options / find_elements_by_xpath
    # are the spellings the original script used; confirm they match the
    # Selenium version installed.
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    chrome_options = Options()
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--headless')
    dr = webdriver.Chrome(executable_path=os.path.abspath(driver_path),
                          chrome_options=chrome_options)

    entries = []
    try:
        dr.get(comic_url)
        links = dr.find_elements_by_xpath(
            "//div[@class='cartoon_online_border']/ul/li/a")
        # Copy the hrefs out before navigating away: the link elements belong
        # to the chapter-list page and are not used after the next dr.get().
        chapter_urls = [link.get_attribute("href") for link in links]
        print(dr.title)

        for chapter_url in chapter_urls:
            print(chapter_url)
            dr.get(chapter_url)
            page_options = dr.find_elements_by_xpath(
                "//div[@class='btmBtnBox']/select/option")
            if not page_options:
                continue
            # The title is the same for every page of a chapter, so look it
            # up once per chapter instead of once per page.
            title = (
                dr.find_elements_by_xpath("//div[@class='display_middle']/h1/a")[0].text
                + dr.find_elements_by_xpath("//div[@class='display_middle']/span")[0].text
            )
            for option in page_options:
                entries.append(
                    (title, 'https:' + option.get_attribute("value"), option.text))
    finally:
        # Always shut the browser down, even if scraping failed midway.
        dr.quit()
    return entries


def download_images(entries, dest_root):
    """Download each entry into dest_root/<chapter folder>/<page>.<ext>.

    Files that already exist on disk are skipped, so an interrupted run can
    be resumed.

    Args:
        entries: Iterable of (title, image_url, page_label) tuples.
        dest_root: Directory under which chapter folders are created.

    Returns:
        The number of files actually downloaded during this call.
    """
    entries = list(entries)
    if not os.path.exists(dest_root):
        os.makedirs(dest_root)
    else:
        print(dest_root + "已创建")

    # The opener is global state; install it once rather than per image.
    opener = build_opener()
    opener.addheaders = list(REQUEST_HEADERS)
    install_opener(opener)

    total = len(entries)
    downloaded = 0
    for done, (title, img_url, page_label) in enumerate(entries, start=1):
        chapter_dir = os.path.join(dest_root, chapter_dir_name(title))
        if not os.path.exists(chapter_dir):
            os.makedirs(chapter_dir)
        else:
            print(chapter_dir + "已创建")

        filename = os.path.join(chapter_dir, image_file_name(page_label, img_url))
        if os.path.exists(filename):
            print(filename + "已下载")
        else:
            urlretrieve(url=img_url, filename=filename)
            downloaded += 1
            # Pause between real requests only; skipped files cost nothing.
            time.sleep(1)
        print(str(done) + "/" + str(total))
    return downloaded


def main():
    """Scrape the configured comic and download all of its page images."""
    entries = collect_image_entries(COMIC_URL, CHROMEDRIVER_PATH)
    download_images(entries, DOWNLOAD_ROOT)
    print("下载完成")


if __name__ == "__main__":
    main()
Scrapy框架
下期见