1. Flexible crawling from a user-supplied start page
# -*- coding: utf-8 -*-
"""
import urllib2
url = "http://www.baidu.com"
# User-Agent of IE 9.0, carried in ua_header
ua_header = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
# Build a Request from the url together with the headers; this request will
# carry the IE 9.0 browser's User-Agent
request = urllib2.Request(url, headers=ua_header)
# Send the request to the server
response = urllib2.urlopen(request)
html = response.read()
print html

import urllib   # handles url encoding
import urllib2
url = "http://www.baidu.com/s"
word = {"wd": "华育兴业"}
word = urllib.urlencode(word)  # convert to url-encoded format (a string)
newurl = url + "?" + word      # "?" is the first separator of a url query
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
request = urllib2.Request(newurl, headers=headers)
response = urllib2.urlopen(request)
print response.read()
"""
# Tieba (Baidu forum) spider example
import sys
import urllib
import urllib2

if sys.getdefaultencoding() != 'utf-8':
    reload(sys)
    sys.setdefaultencoding('utf-8')


def writeFile(html, filename):
    """
    Purpose: save the server's response to a file on the local disk.
    html: the server's response body
    filename: name of the local file
    """
    print "Saving " + filename
    with open("d:/123/" + filename, 'w') as f:
        f.write(html)
    print "-" * 20


def tiebaSpider(url, beginPage, endPage):
    """
    Purpose: build the per-page urls and dispatch a request for each one.
    url: the base url to work from
    beginPage: first page the spider should fetch
    endPage: last page the spider should fetch
    """
    for page in range(beginPage, endPage + 1):
        pn = (page - 1) * 50
        filename = "page_" + str(page) + ".html"
        # Assemble the full url; pn grows by 50 for each page
        fullurl = url + "&pn=" + str(pn)
        # Call loadPage() to send the request and get the HTML page
        html = loadPage(fullurl, filename)
        # Write the fetched HTML page to a local file
        writeFile(html, filename)


def loadPage(url, filename):
    '''
    Purpose: send a request for the url and return the server's response.
    url: the url to fetch
    filename: file name (used only for the progress message)
    '''
    print "Downloading " + filename
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    return response.read()


# A stand-in for a main function
if __name__ == "__main__":
    kw = raw_input("Enter the tieba (forum) to crawl: ")
    # Read the start and end pages, converting str to int
    beginPage = int(raw_input("Enter the start page: "))
    endPage = int(raw_input("Enter the end page: "))

    url = "http://tieba.baidu.com/f?"
    key = urllib.urlencode({"kw": kw})
    # Example of the combined url: http://tieba.baidu.com/f?kw=lol
    url = url + key
    tiebaSpider(url, beginPage, endPage)
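
To see the pagination arithmetic at work — a quick sketch using the hypothetical kw=lol from the example url in the comment above:

# Sketch: the urls the spider would request for pages 1 through 3.
for page in range(1, 4):
    print "http://tieba.baidu.com/f?kw=lol&pn=" + str((page - 1) * 50)
# http://tieba.baidu.com/f?kw=lol&pn=0
# http://tieba.baidu.com/f?kw=lol&pn=50
# http://tieba.baidu.com/f?kw=lol&pn=100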
2. Downloading images with lxml and XPath
# -*- coding: utf-8 -*-
import sys
import urllib2
from lxml import etree

if sys.getdefaultencoding() != 'utf-8':
    reload(sys)
    sys.setdefaultencoding('utf-8')


class Spider:
    def __init__(self):
        self.beginPage = int(raw_input("Enter the start page: "))
        self.endPage = int(raw_input("Enter the end page: "))
        self.url = 'http://duanziwang.com/category/%E6%90%9E%E7%AC%91%E5%9B%BE/'
        self.ua_header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}

    def tiebaSpider(self):
        for page in range(self.beginPage, self.endPage + 1):
            myUrl = self.url + str(page) + '/'
            self.loadImages(myUrl)

    def loadImages(self, link):
        req = urllib2.Request(link, headers=self.ua_header)
        html = urllib2.urlopen(req).read()
        selector = etree.HTML(html)
        imagesLinks = selector.xpath('//div/p/img/@src')
        imagesNames = selector.xpath('//div/p/img/@title')
        # Take each image url in turn, download it and save it
        for (imageslink, imagesname) in zip(imagesLinks, imagesNames):
            self.writeImages(imageslink, imagesname)

    def writeImages(self, imagesLink, imagesName):
        '''
        Write the binary image content to a file named after the image title.
        '''
        print "Saving file %s ..." % imagesName
        # 1. Open the file, which returns a file object
        with open("d:/124/" + imagesName, "wb") as f:
            # 2. Fetch the binary content of the image
            image = urllib2.urlopen(imagesLink).read()
            # 3. Call the file object's write() method to store the content
            f.write(image)


# A stand-in for a main function
if __name__ == "__main__":
    # First create the spider object
    mySpider = Spider()
    # Call the spider's method to start the work
    mySpider.tiebaSpider()
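
The two XPath expressions above can be tried in isolation — a self-contained sketch on a hypothetical snippet shaped like the markup the spider expects (the real page structure may differ):

# Sketch: etree.HTML parses a fragment; xpath returns attribute values as a list.
from lxml import etree
snippet = '<div><p><img src="http://example.com/a.jpg" title="pic-a"/></p></div>'
selector = etree.HTML(snippet)
print selector.xpath('//div/p/img/@src')    # ['http://example.com/a.jpg']
print selector.xpath('//div/p/img/@title')  # ['pic-a']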
3. Scraping the jokes site duanziwang.com
# -*- coding: utf-8 -*-
import urllib2
import re


class Spider:
    """
    Spider for duanziwang.com, a Chinese jokes site.
    """
    def loadPage(self, page):
        """
        @brief request one page of the site
        @param page which page to request
        @returns list of post bodies extracted from the page's html
        """
        url = "http://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/" + str(page) + "/"
        # User-Agent header
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        headers = {'User-Agent': user_agent}
        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req)
        html = response.read()

        # Strip the <p> tags, turn <br> into newlines, then pull out the
        # text of every post-content div
        pattern = re.compile(r'(<p>|</p>)')
        html = pattern.sub("", html)
        pattern = re.compile(r'<br>')
        html = pattern.sub("\n", html)
        pattern = re.compile(r'<div class="post-content">(.*?)</div>', re.S)
        item_list = pattern.findall(html)
        return item_list

    def printOnePage(self, item_list, page):
        """
        @brief handle the list of jokes we got
        @param item_list the list of jokes
        @param page which page it came from
        """
        print item_list
        print "******* page %d scraped *******" % page
        for item in item_list:
            self.writeToFile(item)

    def writeToFile(self, text):
        '''
        @brief append the data to a file
        @param text the content to write
        '''
        with open("d:/124/duanzi.txt", 'a') as myFile:
            myFile.write(text)
            myFile.write("\r\n-----------------------------------------------------")

    def doWork(self):
        '''
        Set the spider to work
        '''
        while self.enable:
            try:
                item_list = self.loadPage(self.page)
            except urllib2.URLError, e:
                print e.reason
                break  # stop rather than retrying the same page forever

            # Process the jokes we got in item_list
            self.printOnePage(item_list, self.page)
            self.page += 1  # this page is done, move on to the next

            print "Press Enter to continue..."
            print "Type quit to exit"
            command = raw_input()
            if command == "quit":
                break


if __name__ == '__main__':
    """
    ======================
    A small duanziwang spider
    ======================
    """
    # Create a Spider object
    mySpider = Spider()
    mySpider.page = 1
    mySpider.enable = True
    mySpider.doWork()
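
The three-step regex pipeline in loadPage can also be checked on its own — a self-contained sketch using a hypothetical snippet shaped like the site's markup:

# Sketch: strip <p> tags, turn <br> into newlines, then capture each
# post-content div with a non-greedy group and re.S (dot matches newline).
import re
html = '<div class="post-content"><p>line one<br>line two</p></div>'
html = re.sub(r'(<p>|</p>)', "", html)
html = re.sub(r'<br>', "\n", html)
print re.findall(r'<div class="post-content">(.*?)</div>', html, re.S)
# ['line one\nline two']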