import urllib.request
import re
import os
import socket
from urllib.error import URLError
from urllib.error import HTTPError


class PaChong():
    """Image crawler: walks gallery listing pages of a site and downloads
    every .jpg it finds into per-gallery folders under c:/图片/."""

    def url_open(self, url, encodestr):
        """Fetch *url* and return its HTML decoded with *encodestr*.

        Returns None on any failure (timeout, HTTP error, URL error);
        callers must check for None before parsing.
        """
        try:
            req = urllib.request.Request(url)
            req.add_header('User-Agent',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) '
                           'Gecko/20100101 Firefox/46.0')
            # BUG FIX: the original `except socket:` caught the socket
            # *module* (a TypeError at match time, never the timeout).
            # Pass an explicit timeout so socket.timeout can actually fire.
            response = urllib.request.urlopen(req, timeout=30)
            html = response.read().decode(encodestr)
            return html
        except socket.timeout:
            print("时间超时")
        except HTTPError as e:  # must precede URLError (its superclass)
            print('HTTPError code:', e.code)
        except URLError as e:
            print('URLError Reason:', e.reason)

    def work_dir(self, stringname):
        """Create (if needed) the download directory for *stringname*
        and make it the current working directory."""
        # exist_ok=True replaces the original try/except FileExistsError: pass
        os.makedirs(r'c:/图片/' + stringname, exist_ok=True)
        os.chdir(r'c:/图片/' + stringname)

    def page_bufen(self, html):
        """Return the single-quoted <li><a href='...'> targets found in
        *html* (the per-gallery pagination links)."""
        p = r"<li><a href='(.*?(?:\.html){0,1})'"
        page_bufen_list = re.findall(p, html)
        return page_bufen_list

    def save_photo(self, html):
        """Download every .jpg referenced by an <img src="..."> in *html*.

        The last two path components before the filename name the
        destination subfolder (gallery grouping on the source site).
        """
        print("保存图片")
        p = r'<img.*?src="(.*?\.jpg)"'
        photo_list = re.findall(p, html)
        print(photo_list)
        for each in photo_list:
            strdir = each.split("/")[-3] + "/" + each.split("/")[-2]
            self.work_dir(strdir)
            filename = each.split("/")[-1]
            try:
                urllib.request.urlretrieve(each, filename)
            except HTTPError as e:
                print('HTTPError urlretrieve阶段Error code:', e.code)
            except URLError as e:
                print('URLError urlretrieve阶段Reason:', e.reason)

    def section_del(self, del_str, fenge):
        """Drop the last *fenge*-separated segment of *del_str*, keeping a
        trailing *fenge* (e.g. strips the filename off a URL path)."""
        parts = del_str.split(fenge)
        parts.pop()
        return fenge.join(parts) + fenge

    def photo_groud_page(self, url, encodestr):
        """Visit every page of the photo group starting at *url* and
        download all images on each page."""
        html = self.url_open(url, encodestr)
        if html is None:  # BUG FIX: original crashed on failed fetches
            return
        page_list = self.page_bufen(html)
        for each in page_list:
            if each == page_list[0]:
                # The first pagination link is the page we already have.
                each = url
                html_each = self.url_open(each, encodestr)
            else:
                # Relative link: rebuild an absolute URL from the base.
                strhead = self.section_del(url, "/")
                url_each = strhead + each
                html_each = self.url_open(url_each, encodestr)
            if html_each is not None:
                self.save_photo(html_each)

    def page_full(self, html):
        """Return the double-quoted <li><a href="..."> targets found in
        *html* (the gallery links on a listing page)."""
        p = r'<li><a href="(.*?(?:\.html){0,1})"'
        page_first_list = re.findall(p, html)
        return page_first_list

    def firt_page_yeshu(self, html):
        """Derive the URLs of listing pages 2..N-1 from the last
        single-quoted pagination link (pattern: prefix_index_PAGE.html)."""
        p = r"<li><a href='(.*?(?:\.html){0,1})'"
        firt_page_yeshu_list = re.findall(p, html)
        print(firt_page_yeshu_list)
        part1 = firt_page_yeshu_list[-1]     # last link holds the page count
        part2 = part1.split(".")[0]          # e.g. "list_1_5"
        part3 = part2.split("_")[-1]         # total page count as a string
        prefix = part2.split("_")[0] + "_" + part2.split("_")[1]
        # Pages 0 and 1 are skipped (page 1 is the listing page itself).
        return [prefix + "_" + str(each) + ".html"
                for each in range(2, int(part3))]

    def workstart(self, url, encodestr):
        """Entry point: crawl the listing at *url*, then every further
        listing page, downloading all galleries found."""
        html = self.url_open(url, encodestr)
        if html is None:  # BUG FIX: original crashed on failed fetches
            return
        first_list = self.page_full(html)
        for each in first_list:  # galleries on the first listing page
            self.photo_groud_page(each, encodestr)
        page_yeshu_list = self.firt_page_yeshu(html)  # remaining pages
        for each in page_yeshu_list:
            each_a = url + each
            print(each_a)
            html_a = self.url_open(each_a, encodestr)
            if html_a is None:
                continue
            other_list = self.page_full(html_a)
            print(other_list)
            for each_b in other_list:
                self.photo_groud_page(each_b, encodestr)


if __name__ == '__main__':
    pc = PaChong()
    pc.workstart("http://www.169tp.com/guoneimeinv/", "gbk")
# 网络爬虫之爬取图片 — a web crawler for scraping images.
# Adapted from: www.cnblogs.com/GhostVip/p/8947884.html
# NOTE(review): the surrounding blog-page navigation text ("猜你喜欢",
# "今日推荐", "周排行") was copy-paste residue from the source page, not
# part of the program, and has been folded into this comment header.