'''
爬虫原理。
1.模拟浏览器的行为,通过网络请求将目标网页抓取到本地。
2.使用一定的匹配规则,将目标中需要的数据提取出来,把不需要的过滤掉。
3.根据需求,把提取出来的数据存储到磁盘中(json、csv、excel、数据库)。
需要安装的库:
requests(用来做网络请求的。就跟浏览器是一样的。)
bs4(用来将请求下来的数据进行解析的。)
lxml(这个库是用来解析html和xml格式数据的。)BeautifulSoup相当于只是一个壳,底层还是要基于lxml这类解析器来解析(其他可选的解析器还有html5lib、html.parser)。这里只需安装lxml就行。
'''
import requests
from bs4 import BeautifulSoup
import json
def get_page(url="https://movie.douban.com/cinema/nowplaying/maanshan/", timeout=10):
    """Download a Douban "now playing" listing page and return its HTML.

    Args:
        url: Address of the page to fetch. Defaults to the now-playing
            listing for Maanshan.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The decoded response body as a string.

    Raises:
        requests.RequestException: If the connection fails, the request
            times out, or the server answers with a 4xx/5xx status.
    """
    # Send a desktop-browser User-Agent, in line with the script's
    # "simulate a browser" approach.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to
    # the parser, which would quietly produce an empty result.
    response.raise_for_status()
    return response.text
def parse_page(text):
    """Extract the now-playing movies from the listing page HTML.

    Args:
        text: HTML source of the listing page.

    Returns:
        A list with one dict per ``<li data-category="nowplaying">``
        element. Each dict has the keys ``title``, ``score``, ``release``,
        ``duration``, ``region``, ``director`` and ``actors`` (taken from
        the matching ``data-*`` attribute of the ``<li>``) plus
        ``thumbnail`` (the ``src`` of the first ``<img>`` inside it).
        A value is ``None`` when the attribute or the ``<img>`` is absent.
    """
    soup = BeautifulSoup(text, 'lxml')
    # Each field is stored on the <li> as a "data-<field>" attribute.
    fields = ('title', 'score', 'release', 'duration', 'region', 'director', 'actors')
    movies = []
    for li in soup.find_all("li", attrs={"data-category": "nowplaying"}):
        # Tag.get returns None for a missing attribute, so one incomplete
        # entry no longer aborts the whole scrape with a KeyError.
        movie = {field: li.get('data-' + field) for field in fields}
        img = li.find('img')
        # find() returns None when the entry has no poster image.
        movie['thumbnail'] = img.get('src') if img is not None else None
        movies.append(movie)
    return movies
def save_data(data, path='douban.json'):
    """Write *data* to a UTF-8 encoded JSON file.

    Args:
        data: JSON-serialisable object (dicts, lists, strings, numbers...).
        path: Destination file; defaults to ``douban.json`` in the current
            working directory. An existing file is overwritten.
    """
    with open(path, 'w', encoding='utf-8') as fp:
        # ensure_ascii=False keeps non-ASCII text (e.g. Chinese titles)
        # readable in the file instead of emitting \uXXXX escapes.
        json.dump(data, fp, ensure_ascii=False)
if __name__ == '__main__':
    # Script entry point: fetch the page, extract the movies, persist them.
    save_data(parse_page(get_page()))
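# Scrape Douban movie data (requests, lxml-based BeautifulSoup, json).
# Reposted from blog.csdn.net/luobofengl/article/details/104402077
# (Blog page widgets that followed in the paste: "You may also like",
# "Today's recommendations", "Weekly ranking".)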