技术路线:requests-xpath
- 使用requests获取网页内容
使用try...except获取网页内容
- 使用xpath解析网页
对主要信息使用xpath进行提取
- 翻页及反爬处理
猫眼的翻页是通过url中的offset参数实现的,例如:
http://maoyan.com/board/4?offset=0
http://maoyan.com/board/4?offset=10
每翻一页offset增加10,所以可以通过拼接offset参数来构造各页的url
猫眼的反爬主要是根据user-agent判断,所以访问的时候需要指定headers
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
import requests
from lxml import etree
def getHtml(url, headers, timeout=10):
    """Download a page and return its body decoded as UTF-8.

    Args:
        url: Address of the page to download.
        headers: HTTP headers sent with the request (forwarded to
            ``requests.get``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        The response text, or ``None`` when the request failed or the server
        answered with an error status.
    """
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx answers into an exception so they are reported below.
        r.raise_for_status()
    except requests.RequestException as exc:
        # Catch only network/HTTP failures; a bare ``except`` would also
        # swallow KeyboardInterrupt and genuine programming errors.
        print('ERROR', exc)
        return None
    r.encoding = 'utf-8'
    return r.text
def fillList(html):
    """Extract the movie entries from one board page and print each of them.

    Every movie is printed as a dict with the keys ``img``, ``title``,
    ``star`` and ``releasetime``.

    Args:
        html: Page source as a string. A falsy value (``None`` or ``''``) is
            treated as "nothing to parse": the function returns without
            printing anything.
    """
    if not html:
        # getHtml() yields None when the download fails; parsing that would
        # raise instead of simply skipping the page.
        return
    tree = etree.HTML(html)
    # Pitfall: Chrome's element inspector shows the poster URL in ``src``,
    # but the markup actually returned to requests carries it in ``data-src``.
    img = tree.xpath('//*[@id="app"]/div/div/div/dl/dd/a/img[2]/@data-src')
    title = tree.xpath('//*[@id="app"]/div/div/div[1]/dl/dd/div/div/div[1]/p[1]/a/text()')
    star = tree.xpath('//*[@id="app"]/div/div/div[1]/dl/dd/div/div/div[1]/p[2]/text()')
    releasetime = tree.xpath('//*[@id="app"]/div/div/div[1]/dl/dd/div/div/div[1]/p[3]/text()')
    # zip() stops at the shortest list, so a page on which one field is
    # missing cannot raise IndexError the way positional indexing did.
    for poster, name, actors, released in zip(img, title, star, releasetime):
        temp = {
            # The URL ends with image-size settings; the slice removes them.
            'img': poster[:-16],
            'title': name,
            # The cast text is padded with spaces and newlines.
            'star': actors.strip(),
            'releasetime': released,
        }
        print(temp)
def main():
    """Crawl the board page by page and print the movies found on each page.

    Pages that cannot be downloaded are skipped so that one failed request
    does not abort the rest of the crawl.
    """
    # A browser-like User-Agent is sent with every request; according to the
    # notes at the top of this file the site's anti-scraping check is based
    # on this header.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }
    deep = 10  # number of pages to crawl
    for i in range(deep):
        # Paging is done through the ``offset`` query parameter, which
        # advances by 10 for each page.
        url = 'http://maoyan.com/board/4?offset=' + str(i * 10)
        html = getHtml(url, headers=headers)
        if not html:
            # getHtml() has already reported the failure; nothing to parse.
            continue
        print("正在打印第%s页" % (i+1))
        fillList(html)
# Start the crawl only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()