# 1. xpath解析的使用
# 抓取豆瓣图书案例
#豆瓣读书抓取青春分类数据
#coding=utf-8
import requests
from lxml import etree
import time
# Scrape the Douban Books listing for the tag encoded in the URL below and
# write one line per book to a UTF-8 text file.
# The path is a raw string so its backslashes are never read as escapes.
with open(r'F:\pythondoc\douban.txt', 'w', encoding='utf-8') as f:
    for m in range(25):
        # Pages are selected through the `start` offset, stepping by 20.
        url = "https://book.douban.com/tag/%E9%9D%92%E6%98%A5?start={}".format(m * 20)
        # A timeout keeps one stalled connection from hanging the whole crawl.
        data = requests.get(url, timeout=10).text
        time.sleep(1)  # one-second pause after each page request
        a = etree.HTML(data)
        # Walk the <li> entries one at a time and query each field relative to
        # its own entry. This keeps the fields of a book together even when an
        # entry lacks a rating or description, and handles pages with any
        # number of entries instead of assuming a fixed count.
        for item in a.xpath('//*[@id="subject_list"]/ul/li'):
            fields = []
            for path in (
                'div[2]/h2/a/text()',             # title
                'div[2]/div[1]/text()',           # publication info
                'div[2]/div[2]/span[2]/text()',   # rating
                'div[2]/p/text()',                # description
                'div[1]/a/img/@src',              # cover image URL
            ):
                # Keep the first non-blank value; a missing field becomes ''.
                values = [v.strip() for v in item.xpath(path) if v.strip()]
                fields.append(values[0] if values else '')
            # One record per line.
            f.write('{} {} {}分 {} {}'.format(*fields) + '\n')
# 2. re正则表达式的使用
# 抓取猫眼电影案例
import requests
import re
# Scrape the Maoyan board (10 pages, selected through `offset` in steps of 10),
# print each matched entry and write it as one line to a UTF-8 text file.

# Browser-style User-Agent header sent with every page request.
head = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}

# Compiled once, outside the page loop. Each match is a 7-tuple; the groups
# follow, in order, the markers board-index, data-src, data-val, star,
# releasetime, integer and fraction inside one <dd>...</dd> element.
# re.S lets '.' span the line breaks inside an entry.
movie_pattern = re.compile('<dd>.*?board-index.*?">(.*?)</i>.*?data-src="(.*?)".*?data-val.*?>(.*?)</a></p>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i></p>.*?</dd>', re.S)

# The path is a raw string so its backslashes are never read as escapes.
with open(r'F:\pythondoc\maoyan.txt', 'w', encoding='utf-8') as f:
    for m in range(10):
        url = "http://maoyan.com/board/4?offset={}".format(m * 10)
        # A timeout keeps one stalled connection from hanging the whole crawl.
        a = requests.get(url, headers=head, timeout=10).text
        b = movie_pattern.findall(a)
        # Iterate over whatever matched instead of assuming exactly ten
        # entries, so a short or unexpected page cannot raise IndexError.
        for movie in b:
            # The last two groups (integer and fraction) are joined into one
            # value.
            line = '{} {} {} {} {} {}'.format(
                movie[0], movie[1], movie[2], movie[3], movie[4],
                movie[5] + movie[6],
            )
            print(line)
            f.write(line + '\n')
# 3. with open储存图片音频视频案例
import requests
import re
# Download a single video file over HTTP and save it to disk in binary mode.

# Browser-style User-Agent header sent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
url = 'http://video.pearvideo.com/mp4/adshort/20180724/cont-1395816-12521564_adpkg-ad_hd.mp4'

# stream=True defers fetching the body so it can be written chunk by chunk
# instead of being held in memory (or echoed to the console) as one bytes
# object. The response is used as a context manager so the connection is
# released when the download finishes or fails.
with requests.get(url, headers=headers, stream=True, timeout=10) as response:
    # Fail loudly on an HTTP error instead of saving an error page as .mp4.
    response.raise_for_status()
    # The path is a raw string so its backslashes are never read as escapes.
    # The `with` block closes the file; no explicit close() is needed.
    with open(r'F:\pythondoc\liuxin.mp4', 'wb') as f:
        for chunk in response.iter_content(chunk_size=64 * 1024):
            f.write(chunk)
# 4. mongodb数据库的使用方法
import pymongo
# Insert a single document into a MongoDB collection.

# Connect to the MongoDB server on localhost, port 27017.
client = pymongo.MongoClient('localhost', 27017)
# Subscript access selects the database, then the collection, by name.
database_name = client['database']
table_name = database_name['table']
# Named `document` rather than `dict` so the builtin type is not shadowed.
document = {"name": "jack", "sex": "male", "job": "docter"}
# insert_one replaces Collection.insert, which is deprecated in PyMongo 3
# and removed in PyMongo 4.
table_name.insert_one(document)