今天为了磨练我的爬虫技术,并且顺便复习一下requests和lxml库,写了一个小小的爬虫--------爬取电影天堂中最新电影前10页的内容,说实话,经历了很多坎坷,才爬取成功的,每写一段代码都必须检验爬取的结果是否正确,不过,最终完成了这个小小的任务。
第一步:请求最新电影列表每一页的网页源代码;
第二步:从每页列表中解析出每部电影详情页的URL地址;
第三步:逐个访问详情页,提取每部电影的各项详细信息。
话不多说,直接上代码!
import requests
from lxml import etree
#网页的基本URL
BASE_URL = "https://www.dytt8.net"
#存储电影的各类信息
MOVIES = []
#存储每个电影的URL
MOVIE_URLS = []
#网页的请求头信息
HEADERS = {
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2864.400"
}
#获取网页的源代码
def getHtml(url):
try:
response = requests.get(url,headers = HEADERS)
response.raise_for_status()
response.encoding = response.apparent_encoding
return response.text
except:
"爬取失败"
#获取一页的电影的url地址
def get_movie_urls(page):
html = etree.HTML(page)
urls = html.xpath("//div[@class = 'co_content8']//ul//table//a/@href")
MOVIE_URLS.extend(list(map(lambda url:BASE_URL+url,urls)))
#根据每一个电影的url地址来获得详细的电影信息
def get_movie_detail(url):
movie = {}
page = getHtml(url)
html = etree.HTML(page)
#获取电影名称
title = html.xpath("//h1/font[@color = '#07519a']/text()")[0]
movie["title"] = title
#获取电影的海报
poster = html.xpath("//div[@id = 'Zoom']//img/@src")[0]
movie["poster"] = poster
#获取全部的信息
content = html.xpath("//div[@id = 'Zoom']//p//text()")
#获取各项信息
for index,info in enumerate(content):
if info.startswith("◎译 名"):
translate_name = info.replace("◎译 名","").strip()
movie["translate_name"] = translate_name
elif info.startswith("◎片 名"):
movie_name = info.replace("◎片 名", "").strip()
movie["translate_name"] = movie_name
elif info.startswith("◎年 代"):
year = info.replace("◎年 代", "").strip()
movie["translate_name"] = year
elif info.startswith("◎产 地"):
country = info.replace("◎产 地","").strip()
movie["country"] = country
elif info.startswith("◎类 别"):
typ = info.replace("◎类 别", "").strip()
movie["typ"] = typ
elif info.startswith("◎语 言"):
language = info.replace("◎语 言", "").strip()
movie["language"] = language
elif info.startswith("◎字 幕"):
zimu = info.replace("◎字 幕", "").strip()
movie["zimu"] = zimu
elif info.startswith("◎上映日期"):
data = info.replace("◎上映日期", "").strip()
movie["data"] = data
elif info.startswith("◎豆瓣评分"):
douban = info.replace("◎豆瓣评分", "").strip()
movie["douban"] = douban
elif info.startswith("◎片 长"):
time = info.replace("◎片 长", "").strip()
movie["time"] = time
elif info.startswith("◎导 演"):
diretion = info.replace("◎导 演", "").strip()
movie["diretion"] = diretion
elif info.startswith("◎主 演"):
actors = []
actor = info.replace("◎主 演", "").strip()
actors.append(actor)
for name in range(index+1,len(content)):
if content[name].startswith("◎标 签") or content[name].startswith("◎简 介"):
break
actors.append(content[name].strip())
movie["actors"] = actors
elif info.startswith("◎标 签"):
label = info.replace("◎标 签", "").strip()
movie["label"] = label
elif info.startswith("◎简 介 "):
unuse = info.replace("◎标 签", "").strip()
for introdu in range(index+1,len(content)):
if content[introdu].startswith("【下载地址】"):
break
movie['introdu'] = content[introdu].strip()
break
MOVIES.append(movie)
if __name__ == '__main__':
#这个for循环获取每一页中电影的URL地址
for i in range(1,11):
url = "https://www.dytt8.net/html/gndy/dyzz/list_23_"+str(i)+".html"
page = getHtml(url)
get_movie_urls(page)
#这个for循环获取每一个电影的详细信息并输出
for index,url in enumerate(MOVIE_URLS):
print("---------------正在爬取第{}个电影-----------------".format(index+1))
#获取每部电影的详细信息并输出
get_movie_detail(url)
print(MOVIES[index])
print("--------------------------------------------------")
欢迎各位志同道合的小伙伴评论哦!!