第一步,读取单页面:
import urllib.request


def fetch_page(url, timeout=10):
    """Fetch *url* and return its body decoded as UTF-8.

    The response object is used as a context manager so the HTTP
    connection is closed even if read() or decode() raises — the
    original one-liner leaked the open response.

    Raises urllib.error.URLError (a subclass of OSError) on network
    failure and UnicodeDecodeError if the page is not valid UTF-8.
    """
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        return resp.read().decode('UTF-8')


if __name__ == "__main__":
    # Preserve the original script behavior: fetch one page and print it.
    url = "http://www.badtom.cn"
    print(fetch_page(url))
第二步,对前篇单机版伪代码的简单实现:
from collections import deque
import re
import urllib.request

# Compiled once at module load; the original re-built this pattern on
# every iteration of the crawl loop.
_LINK_RE = re.compile(r'href="(.+?)"')


def extract_links(html):
    """Return the list of href="..." targets found in *html*.

    Note: this is a naive regex extraction — relative links are returned
    as-is and are later dropped by the 'http' substring filter, exactly
    as in the original snippet.
    """
    return _LINK_RE.findall(html)


def crawl(init_url="http://www.badtom.cn", timeout=2):
    """Breadth-first crawl starting from *init_url*.

    Simple implementation of the pseudocode from the previous post:
    a FIFO queue of URLs to visit and a set of already-seen URLs.
    Pages that time out, fail to open, or are not valid UTF-8 are
    skipped. The original used a bare ``except:``, which also swallowed
    KeyboardInterrupt/SystemExit; the narrowed tuple below does not.
    """
    queue = deque([init_url])
    visited = {init_url}
    count = 0
    while queue:
        url = queue.popleft()
        print('已经抓取:' + str(count) + '个,正在抓取-->' + url)
        count += 1
        try:
            # Context manager closes the connection (the original leaked it).
            with urllib.request.urlopen(url, timeout=timeout) as urlop:
                data = urlop.read().decode('utf-8')
        except (OSError, UnicodeDecodeError, ValueError):
            # OSError covers urllib.error.URLError and socket timeouts;
            # ValueError covers malformed URLs scraped from pages.
            continue
        for next_url in extract_links(data):
            # Keep only absolute-looking links we have not queued before.
            if 'http' in next_url and next_url not in visited:
                queue.append(next_url)
                visited.add(next_url)


if __name__ == "__main__":
    crawl()
第三步,伪装成火狐浏览器,并将爬取的页面存到磁盘上:
from collections import deque import re import urllib.request #存储爬到的网页 def saveToFile(filePath,data): with open(filePath,'w',encoding='utf-8') as fileop: fileop.write(data) queue = deque() visited = set() init_url = "http://www.badtom.cn" queue.append(init_url) visited.add(init_url) #通过头信息伪装成火狐浏览器 headinfo = { 'Connection': 'Keep-Alive', 'Accept': 'text/html, application/xhtml+xml, */*', 'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko' } filePath = 'E:/spider/' count = 0 while queue: url = queue.popleft() print('已经抓取:' + str(count) + '个,正在抓取-->' + url) count += 1 try: req = urllib.request.Request(url,headers = headinfo) urlop = urllib.request.urlopen(req,timeout = 2) data = urlop.read().decode('utf-8') saveToFile(filePath + str(count) + '.html', data) except: continue linkre = re.compile('href="(.+?)"') linkdata = linkre.findall(data) for next_url in linkdata: if 'http' in next_url and 'github' not in next_url and next_url not in visited: queue.append(next_url) visited.add(next_url)