lxml库用于在html字符串中使用xpath语言提取标签属性和内容等。
1 安装
pip install lxml
2 使用
from lxml import etree
ele=etree.HTML("html字符串")
result=ele.xpath("xpath语法串")
对于list形式的元素,有两种提取方式
- 每次提取某个属性的所有值
- 先得到list,再每次获取单个对象的所有属性值。此时需要用 ./ 表明相对路径(即从当前节点开始查找)
3 示例
爬取豆瓣电影排行榜
import requests
import json
from lxml import etree
# 一次性获取所有电影的信息
def xpath_html1():
    """Scrape the Douban movie chart, extracting one attribute at a time.

    Fetches the chart page and runs one document-wide XPath query per
    attribute (names, ratings, release info, poster URLs). Each query yields
    a flat list covering all movies at once; every list is printed.

    Raises:
        requests.RequestException: if the request times out, fails, or the
            server answers with an HTTP error status.
    """
    url = 'https://movie.douban.com/chart'
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
    }
    # Without a timeout, requests may block forever on a stalled connection.
    html = requests.get(url, headers=header, timeout=10)
    # Fail loudly on 4xx/5xx rather than parsing an error page into empty lists.
    html.raise_for_status()
    print(html.text)

    # Build the element tree that the XPath queries run against.
    ele = etree.HTML(html.text)

    # Every query shares the same path down to a movie's table / info cell.
    table_path = "//div[@class='indent']/div/table"
    info_path = table_path + "//div[@class='pl2']"

    # Movie names (spaces removed from each text node).
    movie_name = [i.replace(' ', '') for i in ele.xpath(info_path + "/a/text()")]
    print(movie_name)

    # Ratings.
    movie_rating = ele.xpath(info_path + "/div/span[@class='rating_nums']/text()")
    print(movie_rating)

    # Text of the <p> element under each info cell (labelled release date by the original).
    movie_date = ele.xpath(info_path + "/p/text()")
    print(movie_date)

    # Poster image URLs.
    movie_poster = ele.xpath(table_path + "//a[@class='nbg']/img/@src")
    print(movie_poster)
# 先得到电影list,再每次解析其中的每个电影详情
def xpath_html2():
    """Scrape the Douban movie chart one movie at a time and print it as JSON.

    Fetches the chart page, selects one ``<table>`` node per movie, then runs
    relative (``./``-prefixed) XPath queries against each node to collect the
    name, rating and poster URL. The resulting list of dicts is printed as a
    JSON string; a field whose node is missing is emitted as ``null``.

    Raises:
        requests.RequestException: if the request times out, fails, or the
            server answers with an HTTP error status.
    """
    url = 'https://movie.douban.com/chart'
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
    }
    # Without a timeout, requests may block forever on a stalled connection.
    html = requests.get(url, headers=header, timeout=10)
    # Fail loudly on 4xx/5xx rather than parsing an error page into an empty list.
    html.raise_for_status()
    print(html.text)

    # Build the element tree that the XPath queries run against.
    ele = etree.HTML(html.text)

    # One node per movie; named so it does not shadow the builtin ``list``.
    movie_nodes = ele.xpath("//div[@class='indent']/div/table")

    movie_list = []
    for movie in movie_nodes:
        name = movie.xpath(".//div[@class='pl2']/a/text()")
        rating = movie.xpath(".//div[@class='pl2']/div/span[@class='rating_nums']/text()")
        poster = movie.xpath(".//a[@class='nbg']/img/@src")
        # xpath() returns an empty list when nothing matches, so guard each
        # [0] lookup instead of letting one incomplete row raise IndexError.
        item = {
            'name': name[0].replace("/", "").replace("\n", "").strip() if name else None,
            'rating': rating[0] if rating else None,
            'poster': poster[0] if poster else None,
        }
        movie_list.append(item)

    # ensure_ascii=False keeps non-ASCII titles readable in the JSON output.
    print(json.dumps(movie_list, ensure_ascii=False))
if __name__ == "__main__":
    # Alternative entry point: xpath_html1() extracts one attribute at a time.
    # xpath_html1()
    xpath_html2()
Notice
应博友的要求,创建了一个QQ群,方便大家学习交流,群内也会经常分享一下学习资源。有兴趣的小伙伴可以加群哦!