网页api:https://movie.douban.com/top250?start=0&filter=
用到的模块:urllib,re,csv
捣鼓一上午终于好了,有些小问题
(Top 218 有 bug)具体问题:上图中的电影条目没有"主演"信息,用正则表达式匹配时会取出过多的值;下图则是正常取值的情况。
所以取前200名,具体python代码实现如下,望大佬指导
#! /usr/bin/python3 # -*- coding:UTF-8 -*- from urllib import request import re,csv class MovieTopForDouBan(object): def __init__(self): self.start = 0 self.param = '&filter=' self.headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 ' '(KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'} self.file_path = 'D:\\' self.head = ['排名','名称','别名','其他名称','导演','主演','年份','地区','类型','平均分','人数','短评'] self.movie_list=[] def get_page(self): try: url = 'https://movie.douban.com/top250?start=' + str(self.start) req = request.Request(url, headers=self.headers) response = request.urlopen(req) page = response.read().decode('utf-8') page_num = (self.start + 25) // 25 print('正在抓取第' + str(page_num) + '页数据...') self.start += 25 return page except request.URLError as e: if hasattr(e, 'reason'): print('抓取失败,失败原因:', e.reason) def get_movie_info(self): pattern = re.compile(u'<div.*?class="item">.*?<em class="">(.*?)</em>' u'.*?<span.*?class="title">(.*?)</span>' u'.*?<span.*?class="title">(.*?)</span>' u'.*?<span.*?class="other">(.*?)</span>' u'.*?<div.*?class="bd">.*?<p.*?class="">' u'.*?导演:(.*?) .*?主演: (.*?)<br>' u'(.*?) / (.*?) 
/ (.*?)</p>.*?<div.*?class="star">' u'.*?<span.*?class="rating_num".*?property="v:average">(.*?)</span>' u'.*?<span>(.*?)人评价</span>.*?</div>' u'.*?<span.*?class="inq">(.*?)</span>.*?</p>', re.S) while self.start <= 176:#取前俩百 (top:218 电影名:初恋这件小事)有bug page=self.d=self.get_page() movies=re.findall(pattern,page) for movie in movies: data =list(movie) data[2] = data[2].lstrip(' / ') data[3] = data[3].lstrip(' / ') data[6] = data[6].lstrip() data[8] = data[8].rstrip() self.movie_list.append(data) def write_text(self): print('开始向文件写入数据....') with open(self.file_path+'movie_info.txt','w',encoding='utf-8') as file_TopText: try: for movie in self.movie_list: file_TopText.write('电影排名:' + movie[0] + '\r\n') file_TopText.write('电影名称:' + movie[1] + '\r\n') file_TopText.write('外文名称:' + movie[2] + '\r\n') file_TopText.write('电影别名:' + movie[3] + '\r\n') file_TopText.write('导演姓名:' + movie[4] + '\r\n') file_TopText.write('主演姓名:' + movie[5] + '\r\n') file_TopText.write('上映年份:' + movie[6] + '\r\n') file_TopText.write('制作国家/地区:' + movie[7] + '\r\n') file_TopText.write('电影类别:' + movie[8] + '\r\n') file_TopText.write('电影评分:' + movie[9] + '\r\n') file_TopText.write('参评人数:' + movie[10] + '\r\n') file_TopText.write('简短影评:' + movie[11] + '\r\n\r\n') print('抓取结果写入文件成功...') except Exception as e: print(e) print('数据写入完毕....') def write_csv_file(self): path = self.file_path + 'movie_info.csv' common=0 try: with open(path, 'w', newline='',encoding='utf-8') as csv_file: writer = csv.writer(csv_file, dialect='excel') if self.head is not None: writer.writerow(self.head) for row in self.movie_list: writer.writerow(row) common+=1 print("将CSV文件写入路径%s成功。" % path) except Exception as e: print("将CSV文件写入路径: %s, 信息: %s" % (path, e)) print(common) def main(self): print('开始从豆瓣电影抓取数据........') self.get_movie_info() self.write_text() #self.write_csv_file() print('数据抓取完毕...') if __name__ == '__main__': movie = MovieTopForDouBan() movie.main()
运行后会在 D 盘根目录生成一个 movie_info.txt 文件。