一、只爬取第一页
# Scrape only the first listing page of ssr1.scrape.center and save each
# movie's link / title / categories / score to work1.csv.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib3

# The site has no valid SSL certificate, so requests are made with
# verify=False; suppress the InsecureRequestWarning that would otherwise
# be printed for every request.
urllib3.disable_warnings()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/87.0.4280.141 Safari/537.36'}

# Use a distinct name for the page URL so it is not clobbered by the
# `url` results list below (the original code reused the name `url`).
page_url = 'https://ssr1.scrape.center/'
html = requests.get(page_url, headers=headers, verify=False, timeout=10)
soup = BeautifulSoup(html.content, 'lxml')

url_list = soup.find_all(class_='name')        # anchors holding each detail-page link
title_list = soup.find_all(class_='m-b-sm')    # movie titles
theme_list = soup.find_all(class_='categories')  # category tags
score_list = soup.find_all(class_='score m-t-md m-b-n-sm')  # ratings

url, title, theme, score = [], [], [], []
for x, y, z, s in zip(url_list, title_list, theme_list, score_list):
    # x['href'] is a relative path such as /detail/1; prepend the site root.
    url.append('https://ssr1.scrape.center' + x['href'])
    title.append(y.text)
    # Category text contains embedded newlines from the page layout.
    theme.append(z.text.replace('\n', '').replace('\r', ''))
    score.append(s.text.strip())

df = {
    '链接': url,
    '标题': title,
    '主题': theme,
    '评分': score
}
work1 = pd.DataFrame(df)
work1.to_csv('work1.csv')
最终爬取csv文件显示效果如下所示:
二、爬取全部电影信息
# Scrape all 10 listing pages of ssr1.scrape.center and save every movie's
# link / title / categories / score to work1.csv.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib3

# The site has no valid SSL certificate, so requests are made with
# verify=False; suppress the InsecureRequestWarning that would otherwise
# be printed for every request.
urllib3.disable_warnings()

url, title, theme, score = [], [], [], []
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/87.0.4280.141 Safari/537.36'}

# NOTE: the original code had a module-level `global` statement here; at
# module scope `global` is a no-op, so it has been removed.
for page in range(1, 11):
    # Each listing page lives at /page/<n>. Use a distinct name for the page
    # URL so it is not clobbered by the `url` results list above (the bug the
    # original author's comment warned about).
    page_url = 'https://ssr1.scrape.center/page/' + str(page)
    html = requests.get(page_url, headers=headers, verify=False, timeout=10)
    soup = BeautifulSoup(html.content, 'lxml')

    url_list = soup.find_all(class_='name')        # anchors holding each detail-page link
    title_list = soup.find_all(class_='m-b-sm')    # movie titles
    theme_list = soup.find_all(class_='categories')  # category tags
    score_list = soup.find_all(class_='score m-t-md m-b-n-sm')  # ratings

    # Loop variable renamed from `i` to `s` so it no longer shadows the
    # outer page counter.
    for x, y, z, s in zip(url_list, title_list, theme_list, score_list):
        url.append('https://ssr1.scrape.center' + x['href'])
        title.append(y.text)
        # Category text contains embedded newlines from the page layout.
        theme.append(z.text.replace('\n', '').replace('\r', ''))
        score.append(s.text.strip())

df = {
    '链接': url,
    '标题': title,
    '主题': theme,
    '评分': score
}
work1 = pd.DataFrame(df)
work1.to_csv('work1.csv')
爬取csv文件展示