An example that parses the 163 news list:
Python code
#!/usr/bin/python
# encoding=gbk
# This only walks the news list; the page also contains many blog and topic links that are not handled.
# To handle those, inspect the url to decide what kind of page it is and process it accordingly.
import sys
import requests
import datetime
import time
import MySQLdb
import chardet
import lxml.html.soupparser as soupparser
import lxml.etree as etree

start_datetime = datetime.datetime.now()


def parseFromWin1252(str):
    # Some news text comes back garbled in windows-1252 and needs to be converted to GBK
    try:
        return str.encode("windows-1252").decode("GBK")
    except UnicodeEncodeError:
        return str
    except UnicodeDecodeError:
        return str


def resolveAndSaveNewContentFromLink(link, linkTitle, cursor):
    # Open a link and extract its content.
    # Two cases yield nothing: 1. pages without a title, probably topic pages;
    # 2. pages that raise an exception, not handled yet, so their content cannot be fetched.
    print u"Processing:", link
    request = requests.get(link)
    try:
        dom = soupparser.fromstring(request.content)
        body = dom[0]
        titles = body.xpath("//h1[@id='h1title']")
        if len(titles) > 0:
            # The page has a title
            title = parseFromWin1252(titles[0].text)
            print u"@TITLE:", request.encoding, title, link
            newContents = body.xpath("//div[@id='endText']//p")
            alist = []
            for content in newContents:
                if content.text != None:
                    alist.append(content.text)
            text = parseFromWin1252("<br><br>".join(alist))
            values = [link, title, text, "Success"]
            cursor.execute("insert into texts(url,title,text,statue) value(%s,%s,%s,%s)", values)
        else:
            # No title
            title = parseFromWin1252(linkTitle)
            print u"#NO_TITLE:", request.encoding, title, link
            values = [link, title, "", "NO_TITLE"]
            cursor.execute("insert into texts(url,title,text,statue) value(%s,%s,%s,%s)", values)
    except TypeError:
        # An exception was raised
        title = parseFromWin1252(linkTitle)
        print u"$TypeError:", request.encoding, title, link
        values = [link, title, "", "TypeError"]
        cursor.execute("insert into texts(url,title,text,statue) value(%s,%s,%s,%s)", values)


# Define the parsing methods
def resolveAndSaveLinks(body, cursor):
    print u"Parsing the links in the html"
    links = body.xpath("//ul[@class='mod-list main-list']//a")
    print u"Processing data"
    count = 1
    for item in links:
        # Links that contain an em tag cannot be parsed here
        if item.text != None:
            values = [item.get("href"), item.text]
            cursor.execute("insert into links(url,text) value(%s,%s)", values)
            resolveAndSaveNewContentFromLink(item.get("href"), item.text, cursor)
            #time.sleep(100)  # pause here if needed, to avoid getting banned?
            print u"Done", "<resolveAndSaveLinks>[%s:%s]" % (len(links), count)
            count = count + 1
            print "----------------------------------------------------------"
    print u"Data saved, record count [", len(links), "]"


def resolveAndSaveEmInLinks(body, cursor):
    print u"Parsing the links in the html that contain an em element"
    ems = body.xpath("//ul[@class='mod-list main-list']//em")
    print u"Processing data"
    count = 1
    for item in ems:
        values = [item.getparent().get("href"), item.text]
        cursor.execute("insert into links(url,text) value(%s,%s)", values)
        resolveAndSaveNewContentFromLink(item.getparent().get("href"), item.text, cursor)
        #time.sleep(100)  # pause here if needed, to avoid getting banned?
        print u"Done", "<resolveAndSaveEmInLinks>[%s:%s]" % (len(ems), count)
        count = count + 1
        print "----------------------------------------------------------"
    print u"Data saved, record count [", len(ems), "]"


def resolve():
    print u"Opening the index page"
    req = requests.get("http://news.163.com/")
    content = req.content
    dom = soupparser.fromstring(content)
    body = dom[1]
    print u"Connecting to the database"
    conn = MySQLdb.connect(host="192.168.0.196", user="root", passwd="", db="python", charset="utf8")
    cursor = conn.cursor()
    cursor.execute("delete from links")
    cursor.execute("delete from texts")
    #resolveAndSaveNewContentFromLink("http://auto.163.com/13/0929/02/99TGSGRJ00084TUR.html", u"Test", cursor)
    #if True:
    #    return
    print u"Parsing and saving to the database"
    # Walk the links that do not contain an em tag
    resolveAndSaveLinks(body, cursor)
    # Walk the links that contain an em tag
    resolveAndSaveEmInLinks(body, cursor)
    cursor.close()
    conn.close()
    print u"Finished walking"


# Start
resolve()
end_datetime = datetime.datetime.now()
print u"Elapsed", (end_datetime - start_datetime).seconds, u"seconds"
Walking the Qiushibaike articles, covering only the few categories on the navigation bar: hot, latest, and so on.
Python code
#!/usr/bin/python
# encoding=gbk
import sys
import os
import MySQLdb
import requests
import datetime
import time
import lxml.html.soupparser as soupparser
import lxml.etree as etree

currentPageId = "currentPageId"


def getImageFile(imgUrl):  # Download a file, write it to the local disk, and return the file name
    local_filename = imgUrl.split('/')[-1]
    local_filename = "/home/pandy/tmp/" + local_filename
    print u"Downloading file:", local_filename
    r = requests.get(imgUrl, stream=True)  # here we need to set stream = True parameter
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
                f.flush()
    return local_filename


def scannintArticle(cursor, type, url, article):  # Process one article
    articleStr = etree.tostring(article)
    articleBody = soupparser.fromstring(articleStr)
    details = articleBody.xpath("//div[@class='detail']")
    authors = articleBody.xpath("//div[@class='author']")
    contents = articleBody.xpath("//div[@class='content']")
    thumbs = articleBody.xpath("//div[@class='thumb']")
    values = [type, url]
    if len(details) > 0:
        detailStr = etree.tostring(details[0])
        detail = soupparser.fromstring(detailStr)
        values.append(detail.xpath("//a")[0].text)
        values.append(detail.xpath("//a")[0].get("href"))
    else:
        values.append("")
        values.append("")
    if len(authors) > 0:
        authorStr = etree.tostring(authors[0])
        author = soupparser.fromstring(authorStr)
        values.append(author.xpath("//a")[0].text)
        values.append(author.xpath("//a")[0].get("href"))
    else:
        values.append("")
        values.append("")
    if len(contents) > 0:
        contentStr = etree.tostring(contents[0])
        values.append(contents[0].text)
    else:
        values.append("")
    if len(thumbs) > 0:
        thumbStr = etree.tostring(thumbs[0])
        thumb = soupparser.fromstring(thumbStr)
        imgUrl = thumb.xpath("//img")[0].get("src")
        values.append(imgUrl)
        # Download the image to a temporary file, read it back to store it in the database, then delete it
        local_filename = getImageFile(imgUrl)
        f = open(local_filename, "rb")
        b = f.read()
        f.close()
        os.remove(local_filename)
        values.append(MySQLdb.Binary(b))
    else:
        values.append("")
        values.append(None)
    values.append("Success")
    print values
    cursor.execute(
        "INSERT INTO qs_article ( type, url, detial_link, detail, user_link, user, content,img, img_content,status) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
        values)


def scanning4typeArticle(cursor, type, url):  # Scan one page
    request = requests.get(url)
    #print request.encoding
    print url
    #print len(request.content)
    #print request.content
    try:
        dom = soupparser.fromstring(request.content)
        body = dom[1]
        # Find the article blocks on this page
        articleList = body.xpath("//div[@class='block untagged mb15 bs2']")
        for article in articleList:
            scannintArticle(cursor, type, url, article)
    except:
        print "Error"
        values = [type, url, '', '', '', '', '', '', None, "Error"]
        cursor.execute(
            "INSERT INTO qs_article ( type, url, detial_link, detail, user_link, user, content,img, img_content, status) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            values)


def scanning4type(cursor, type, url, subfix):  # Get the page count, then open the pages one by one
    print u"Start scanning articles"
    request = requests.get(url)
    dom = soupparser.fromstring(request.content)
    body = dom[0]
    # Get the largest page number from the pager at the bottom
    pagebars = body.xpath("//div[@class='pagebar']/a")
    if len(pagebars) > 2:
        maxPageSize = int(pagebars[len(pagebars) - 2].text) + 1
        # Open the pages one by one
        for i in range(1, maxPageSize):
            scanningUrl = "".join([url, subfix]).replace(currentPageId, str(i))
            scanning4typeArticle(cursor, type, scanningUrl)
    print u"Finished scanning articles"


def main():  # Main method
    # Open the database
    conn = MySQLdb.connect(host="192.168.0.196", user="root", passwd="", db="python", charset="utf8")
    cursor = conn.cursor()
    cursor.execute("delete from qs_article")
    # Scan a few types: the first few categories on the navigation bar
    scanning4type(cursor, "8HR", "http://www.qiushibaike.com/8hr", "".join(["/page/", "currentPageId", "?s=4602020"]))
    #scanning4type(cursor, "HOT", "http://www.qiushibaike.com/hot", "".join(["/page/", "currentPageId", "?s=4602057"]))
    #scanning4type(cursor, "IMGRANK", "http://www.qiushibaike.com/imgrank", "".join(["/page/", "currentPageId", "?s=4602057"]))
    #scanning4type(cursor, "LATE", "http://www.qiushibaike.com/late", "".join(["/page/", "currentPageId", "?s=4602057"]))
    #scanning4typeArticle(cursor, type, "http://www.qiushibaike.com/late/page/346?s=4602057")
    # Close the database
    cursor.close()
    conn.close()


# Run the main program
main()
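As in the previous example, the qs_article table is assumed to exist. A minimal sketch of a compatible definition follows; the column names are taken from the INSERT statement (including the original "detial_link" spelling), while the types, in particular LONGBLOB for the image bytes passed through MySQLdb.Binary, are assumptions.
Python code
# A sketch of the qs_article table assumed by the crawler above; column types are guesses.
import MySQLdb

conn = MySQLdb.connect(host="192.168.0.196", user="root", passwd="", db="python", charset="utf8")
cursor = conn.cursor()
cursor.execute("""CREATE TABLE IF NOT EXISTS qs_article (
                      id INT AUTO_INCREMENT PRIMARY KEY,
                      type VARCHAR(32),
                      url VARCHAR(512),
                      detial_link VARCHAR(512),
                      detail VARCHAR(512),
                      user_link VARCHAR(512),
                      user VARCHAR(128),
                      content LONGTEXT,
                      img VARCHAR(512),
                      img_content LONGBLOB,
                      status VARCHAR(32)
                  )""")
conn.commit()
cursor.close()
conn.close()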
Walking images from some Sina blogs, with rate limiting added.
Python code
#!/usr/bin/python
# encoding=gbk
# http://qing.blog.sina.com.cn/blog/api/tagresult.php?tag=%E7%BE%8E%E5%A5%B3&page=3&type=2&blogid=67f899b332002zdw&ch=
import sys
import os
import requests
import MySQLdb
import lxml.html.soupparser as soupparser
import lxml.etree as etree
import json
import time

maxPage = 100  # Maximum number of pages to scan
requests.adapters.DEFAULT_RETRIES = 5

# Rate limiting for page access
DEFAULT_OPEN_PAGE_FREQUENCY = 1   # Seconds to wait between pages
DEFAULT_OPEN_IMAGE_FREQUENCY = 3  # Seconds to wait between batches of images
DEFAULT_IMAGE_COUNT = 0           # Image counter
DEFAULT_IMAGE_SIZE = 20           # After every DEFAULT_IMAGE_SIZE images, sleep for DEFAULT_OPEN_IMAGE_FREQUENCY seconds


def saveImage(title, imageSrc):  # Save an image
    if title == None:
        title = u"无题"
    print u"Title: %s Image: %s" % (title, imageSrc)
    dirStr = u"/mnt/E/新浪图集/" + title + "/"
    if not os.path.exists(dirStr):
        os.makedirs(dirStr)
    fileName = imageSrc.split('/')[-1]
    request = requests.get(imageSrc, stream=True)
    with open(dirStr + fileName, "wb") as file:
        for chunk in request.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                file.write(chunk)
                file.flush()


def listPicPage(pageUrl):  # Open a link from the index and walk the page that shows the images
    global DEFAULT_IMAGE_COUNT
    request = requests.get(pageUrl)
    dom = soupparser.fromstring(request.content)
    body = dom[1]
    title = body.xpath("//h3[@class='title']")
    titleStr = ""
    if len(title) > 0:
        titleStr = title[0].text
    imageList = body.xpath("//div[@class='imgArea']/img[@class='qImg']")
    print u"Walking image page, title: %s, url: %s " % (titleStr, pageUrl)
    imageSrc = None
    for image in imageList:
        # There seem to be two addresses here; prefer real_src, otherwise fall back to src
        if image.get("real_src") != None:
            imageSrc = image.get("real_src")
        else:
            imageSrc = image.get("src")
        # Only continue if an image address exists
        if imageSrc != None:
            saveImage(titleStr, imageSrc)
            # Rate limiting
            DEFAULT_IMAGE_COUNT = DEFAULT_IMAGE_COUNT + 1
            if DEFAULT_IMAGE_COUNT % DEFAULT_IMAGE_SIZE == 0:
                print u"Image count: %s, resting %s seconds before continuing\n" % (DEFAULT_IMAGE_COUNT, DEFAULT_OPEN_IMAGE_FREQUENCY)
                time.sleep(DEFAULT_OPEN_IMAGE_FREQUENCY)


def listPicIndex():  # Walk the index pages
    # Open the url for each page number
    for i in range(1, maxPage + 1):
        url = "http://qing.blog.sina.com.cn/blog/api/tagresult.php?tag=%E7%BE%8E%E5%A5%B3&page=" + str(
            i) + "&type=2&blogid=67f899b332002zdw&ch="
        request = requests.get(url)
        json_obj = json.loads(request.content)
        for item in json_obj["data"]["list"]:
            # Find every image link on this page; opening it leads to the page that actually shows the images
            dom = soupparser.fromstring(item)
            link = dom.xpath("//a[@class='pic']")
            if len(link) > 0:
                # Walk the image page
                listPicPage(link[0].get("href"))
            print u"---------------------------------------------Finished one image link, page:", i
            # Rate limiting
            # time.sleep(DEFAULT_OPEN_PAGE_FREQUENCY)
        print u"---------------------------------------------Finished page", maxPage, ":", i


def main():
    listPicIndex()
    #listPicPage("http://qing.blog.sina.com.cn/tj/a1509eee330044am.html")


if __name__ == "__main__":
    main()
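One caveat about the retry line: assigning requests.adapters.DEFAULT_RETRIES after import may have no effect in newer versions of requests, because HTTPAdapter captures that default when the module is loaded. A minimal, more explicit sketch is to mount an HTTPAdapter with max_retries on a Session and route the downloads through it (assuming a requests version that ships HTTPAdapter, i.e. 1.x or later):
Python code
# A sketch of explicit connection retries via a Session-mounted HTTPAdapter.
import requests
from requests.adapters import HTTPAdapter

session = requests.Session()
session.mount("http://", HTTPAdapter(max_retries=5))   # retry failed connections up to 5 times
session.mount("https://", HTTPAdapter(max_retries=5))

# The downloads in the script above could then go through the session instead of the module-level get, e.g.
# request = session.get(imageSrc, stream=True)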
The Sina image crawler above, rewritten to use multiple threads.
Python code
#!/usr/bin/python
# encoding=gbk
# http://qing.blog.sina.com.cn/blog/api/tagresult.php?tag=%E7%BE%8E%E5%A5%B3&page=3&type=2&blogid=67f899b332002zdw&ch=
import sys
import os
import requests
import MySQLdb
import lxml.html.soupparser as soupparser
import lxml.etree as etree
import json
import time
import threading

MAX_PAGE = 100  # Maximum number of pages to scan
MAX_ERROR = 10  # Maximum number of errors allowed per thread; until it is exceeded, the thread retries automatically
PAGE_SIZE = 5   # Pages per segment; each thread handles one segment
DEFAULT_OPEN_PAGE_FREQUENCY = 2             # Seconds to sleep after finishing a page
DEFAULT_OPEN_PAGE_ERROR_WAIT_FREQUENCY = 5  # Seconds to wait before retrying after an exception
requests.adapters.DEFAULT_RETRIES = 5


def saveImage(thName, title, imageSrc, currentPath):  # Save an image
    if title == None:
        title = u"无题"
    print u"Thread: %s, page: %s, title: %s image: %s" % (thName, currentPath, title, imageSrc)
    dirStr = u"/mnt/E/新浪图集/" + title + "/"
    if not os.path.exists(dirStr):
        os.makedirs(dirStr)
    fileName = imageSrc.split('/')[-1]
    request = requests.get(imageSrc, stream=True)
    with open(dirStr + fileName, "wb") as file:
        for chunk in request.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                file.write(chunk)
                file.flush()


def listPicPage(thName, pageUrl, currentPath):  # Open a link from the index and walk the page that shows the images
    request = requests.get(pageUrl)
    dom = soupparser.fromstring(request.content)
    body = dom[1]
    title = body.xpath("//h3[@class='title']")
    titleStr = ""
    if len(title) > 0:
        titleStr = title[0].text
    imageList = body.xpath("//div[@class='imgArea']/img[@class='qImg']")
    #print u"\n\nPage: %s, walking image page, title: %s, url: %s " % (currentPath, titleStr, pageUrl)
    imageSrc = None
    for image in imageList:
        # There seem to be two addresses here; prefer real_src, otherwise fall back to src
        if image.get("real_src") != None:
            imageSrc = image.get("real_src")
        else:
            imageSrc = image.get("src")
        # Only continue if an image address exists
        if imageSrc != None:
            saveImage(thName, titleStr, imageSrc, currentPath)


def listPicIndex(thName, startPath, endPath):  # Walk the index pages of one segment
    # Open the url for each page number
    for i in range(startPath, endPath + 1):
        url = "http://qing.blog.sina.com.cn/blog/api/tagresult.php?tag=%E7%BE%8E%E5%A5%B3&page=" + str(
            i) + "&type=2&blogid=67f899b332002zdw&ch="
        print url
        request = requests.get(url)
        json_obj = json.loads(request.content)
        error_count = 0
        for item in json_obj["data"]["list"]:
            # Find every image link on this page; opening it leads to the page that actually shows the images
            dom = soupparser.fromstring(item)
            link = dom.xpath("//a[@class='pic']")
            if len(link) > 0:
                # Walk the image page
                try:
                    listPicPage(thName, link[0].get("href"), i)
                except:
                    if error_count < MAX_ERROR:
                        error_count = error_count + 1
                        # On an error, wait for a while and then retry
                        print u"---------------------------------------------Sleeping %s seconds before retrying, page: %s" % (
                            DEFAULT_OPEN_PAGE_ERROR_WAIT_FREQUENCY, i)
                        time.sleep(DEFAULT_OPEN_PAGE_ERROR_WAIT_FREQUENCY)
                        listPicPage(thName, link[0].get("href"), i)
                    else:
                        print u"Error limit exceeded, giving up on this crawl."
            #print u"---------------------------------------------Finished one image link, page:", i
        # Rate limiting
        time.sleep(DEFAULT_OPEN_PAGE_FREQUENCY)
        print u"---------------------------------------------Finished page", MAX_PAGE, ":", i
    return True


class MyThread(threading.Thread):
    def __init__(self, name, startPath, endPage):
        threading.Thread.__init__(self)
        self.name = name
        self.is_stop = False
        self.startPage = startPath
        self.endPage = endPage

    def run(self):
        while not self.is_stop:
            # Stop the thread once its segment has been walked
            self.is_stop = listPicIndex(self.name, self.startPage, self.endPage)

    def stop(self):  # Set the stop flag manually
        self.is_stop = True


if __name__ == "__main__":
    # Split the pages into segments and create one thread per segment
    count = 1
    for i in range(1, MAX_PAGE, PAGE_SIZE):
        startPath = i
        endPath = i + PAGE_SIZE
        if endPath > MAX_PAGE:
            endPath = MAX_PAGE
        print startPath, ",", endPath
        t = MyThread("Thread " + str(count), startPath, endPath)
        count = count + 1
        t.start()
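The main block above starts the workers but keeps no references to them, so there is no way to wait for them or report when the whole crawl is done (the process itself stays alive because the threads are non-daemon). Below is a minimal sketch, reusing MyThread, MAX_PAGE, and PAGE_SIZE from the script above, that keeps the thread objects and joins them. Note also that with endPath = i + PAGE_SIZE and the inclusive range in listPicIndex, neighbouring segments share one page; subtracting 1 from endPath, as in the sketch, avoids fetching that page twice.
Python code
# A sketch of the same segmenting loop that also waits for every worker to finish.
threads = []
count = 1
for i in range(1, MAX_PAGE, PAGE_SIZE):
    startPath = i
    endPath = min(i + PAGE_SIZE - 1, MAX_PAGE)  # -1 so neighbouring segments do not overlap
    t = MyThread("Thread " + str(count), startPath, endPath)
    threads.append(t)
    count = count + 1
    t.start()

for t in threads:
    t.join()  # block until this worker has walked its page range
print u"All threads finished"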