版权声明:自学笔记,如有引用请标明博客,感谢 https://blog.csdn.net/feng_jlin/article/details/82221492
爬取电影名称、链接并写入文件
# Crawl Douban Top 250: write each movie's detail-page link and title to a
# local text file (the pasted original had lost all indentation; restored here).
import urllib.request as urlrequest
from bs4 import BeautifulSoup
import time    # pause between requests
import random  # randomize the pause length

# List-page URL template; `start` is the page offset (25 movies per page).
top250_url = "https://movie.douban.com/top250?start={}&filter="

with open('C:/Users/feng_jlin/Desktop/douban_250.txt', 'w') as outputfile:
    for i in range(10):  # 250 movies, 25 per page -> 10 pages
        start = i * 25
        url_visit = top250_url.format(start)
        crawl_content = urlrequest.urlopen(url_visit).read()
        http_content = crawl_content.decode('utf8')  # pages are UTF-8 (Chinese text)
        soup = BeautifulSoup(http_content, 'html.parser')
        all_item_divs = soup.find_all(class_='item')  # one div per movie entry
        for each_item_div in all_item_divs:
            pic_div = each_item_div.find(class_='pic')
            item_href = pic_div.find('a')['href']   # detail-page link
            item_name = pic_div.find('img')['alt']  # movie title
            outputfile.write('{} {}\n'.format(item_href, item_name))
            print('{} {}\n'.format(item_href, item_name))
改进为爬取电影详细信息,报错时输出错误原因
# -*- coding:utf-8 -*-
# Crawl Douban Top 250 with per-movie detail pages: for each movie write
# link, title, genres, rating and main actors to a local text file.
# If a detail page returns an HTTP error, fall back to regex-scraping the
# same fields from the Top-250 list-page snippet instead.
import urllib.request as urlrequest
from bs4 import BeautifulSoup
import time    # pause between requests
import random  # randomize the pause length
import re      # fallback parsing — was missing in the original (NameError in except)
import bs4     # needed for the bs4.element.Tag isinstance check

# item_href        detail-page link
# item_name        movie title
# all_attrs_divs   main actors
# movie_type_join  genres joined with '/'
# score_soup_divs  rating
top250_url = "https://movie.douban.com/top250?start={}&filter="  # list-page template
movie_url = "https://movie.douban.com/subject/{}/"               # detail-page template
# Browser User-Agent: Douban rejects urllib's default UA.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}

with open('C:/Users/feng_jlin/Desktop/douban_250.txt', 'w', encoding='utf8') as outputfile:
    for page in range(10):  # 250 movies, 25 per page -> 10 pages
        start = page * 25
        url_visit = top250_url.format(start)
        req_url_visit = urlrequest.Request(url=url_visit, headers=headers)
        crawl_content = urlrequest.urlopen(req_url_visit).read()
        http_content = crawl_content.decode('utf8')  # pages are UTF-8 (Chinese text)
        soup = BeautifulSoup(http_content, 'html.parser')
        all_item_divs = soup.find_all(class_='item')  # one div per movie entry
        for each_item_div in all_item_divs:
            pic_div = each_item_div.find(class_='pic')
            item_href = pic_div.find('a')['href']   # detail-page link
            item_name = pic_div.find('img')['alt']  # movie title
            # Split the href on '/' to extract the Douban subject ID.
            https, blank, web, subject, doubanID, other = item_href.split('/')
            movie_url_visit = movie_url.format(doubanID)
            try:
                req_movie_url_visit = urlrequest.Request(url=movie_url_visit, headers=headers)
                movie_crawl_content = urlrequest.urlopen(req_movie_url_visit).read()
                movie_http_content = movie_crawl_content.decode('utf8')
                movie_soup = BeautifulSoup(movie_http_content, 'html.parser')

                # Main actors; the 'actor' block can be absent, so check it is
                # a real Tag before calling .find() on it.
                all_actor_divs = movie_soup.find(class_='actor')
                if isinstance(all_actor_divs, bs4.element.Tag):
                    all_attrs_divs = all_actor_divs.find(class_='attrs').get_text()
                else:
                    all_attrs_divs = "空"

                # Genres: collect every property="v:genre" tag, join with '/'.
                # (Original reused loop variable `i`, shadowing the page counter.)
                type_soup_divs = movie_soup.find_all(property="v:genre")
                movie_type = [tag.get_text() for tag in type_soup_divs]
                movie_type_join = '/'.join(movie_type)

                score_soup_divs = movie_soup.find(class_="ll rating_num").get_text()  # rating

                outputfile.write('{} {} {} {} {}\n'.format(item_href, item_name, movie_type_join, score_soup_divs, all_attrs_divs))
                print('{} {} {} {} {}\n'.format(item_href, item_name, movie_type_join, score_soup_divs, all_attrs_divs))

                time.sleep(random.uniform(1, 5))  # random 1-5 s pause to avoid getting blocked
            except urlrequest.HTTPError as err:
                # Detail page refused: regex-scrape actors/genre and rating
                # straight from the list-page snippet.
                other_div = re.findall('主演: (.*?)<br/>.*?/.*?/.(.*?)\n.*?</p>', str(each_item_div), re.S)
                other_score_div = re.findall('<span class="rating_num" property="v:average">(.*?)</span>', str(each_item_div), re.S)
                # findall with two groups returns a list of (actors, genre)
                # tuples; take the FIRST match's groups (the original indexed
                # other_div[1]/other_div[0], i.e. the match list itself ->
                # IndexError / tuple output), with fallbacks for no match.
                actors, genre = other_div[0] if other_div else ("空", "空")
                score = other_score_div[0] if other_score_div else "空"
                # Bug fix: write the fallback values — the original wrote
                # stale variables left over from the previous successful movie.
                outputfile.write('{} {} {} {} {}\n'.format(item_href, item_name, genre, score, actors))
                print('{} {} {} {} {}\n'.format(item_href, item_name, genre, score, actors))
                continue  # keep going on the remaining movies

# The with-statement closes the file; the original's explicit close() removed.
print('OK')
改进为爬取电影详细信息,报错时按顺序回退到从TOP250列表页面爬取该电影详情