"""Print the movies listed on the Maoyan board at maoyan.com/board/4.

Each page is downloaded with ``get_html`` and handed to ``parse_html``,
which prints one line per board entry (rank, title, cast, release time,
score).
"""
import re

BOARD_URL = 'http://maoyan.com/board/4'

_USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/70.0.3538.102 Safari/537.36'
)

# The rank marker doubles as the delimiter between board entries.
_RANK_RE = re.compile(r'<i class="board-index[^"]*">(\d+)</i>')
# No DOTALL: the anchor must open and close on a single line, and the
# title attribute must belong to that same tag.
_MOVIE_RE = re.compile(r'<a href=[^>]* title="([^"]*)"[^>]*>.*?</a>')
# Both the ASCII and the full-width colon are accepted after the labels.
_STAR_RE = re.compile(r'<p class="star">.*?主演[:：](.*?)\s*</p>', re.S)
# Captures YYYY, YYYY-MM or YYYY-MM-DD; anything after it (e.g. a region
# in parentheses) is skipped.
_TIME_RE = re.compile(
    r'<p class="releasetime">上映时间[:：](\d{4}(?:-\d{2}){0,2}).*?</p>'
)
_SCORE_INT_RE = re.compile(r'<i class="integer">(.*?)</i>')
_SCORE_FRAC_RE = re.compile(r'<i class="fraction">(.*?)</i>')


def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script.

    Returns:
        The response text, or ``None`` when the request fails or the
        server answers with an HTTP error status. A short message is
        printed in the failure case.
    """
    # Imported where it is used so the parsing code stays importable on
    # machines where the third-party ``requests`` package is missing.
    import requests

    headers = {'User-Agent': _USER_AGENT}
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
    except requests.HTTPError:
        print('status_code is not 200')
        return None
    except requests.RequestException as exc:
        # Connection problems, timeouts, invalid URLs, ...
        print('request failed:', exc)
        return None
    return r.text


def _split_entries(text):
    """Yield ``(rank, chunk)`` pairs, one per board entry found in *text*.

    ``chunk`` is the markup between this entry's rank marker and the next
    one (or the end of the text), so fields are looked up per entry.
    """
    marks = list(_RANK_RE.finditer(text))
    for pos, mark in enumerate(marks):
        end = marks[pos + 1].start() if pos + 1 < len(marks) else len(text)
        yield mark.group(1), text[mark.end():end]


def _first(pattern, chunk):
    """Return group 1 of the first match of *pattern* in *chunk*, or ''."""
    match = pattern.search(chunk)
    return match.group(1) if match else ''


def parse_html(text):
    """Print one line for every board entry found in *text*.

    Fields are extracted entry by entry, so an entry that lacks a field
    gets an empty value for it instead of shifting the fields of all the
    entries that follow.

    Args:
        text: HTML of one board page.

    Returns:
        None. The result is written to standard output.
    """
    for rank, chunk in _split_entries(text):
        movie = _first(_MOVIE_RE, chunk)
        star = _first(_STAR_RE, chunk)
        timestamp = _first(_TIME_RE, chunk)
        # The score is rendered as two adjacent elements, e.g. "9." + "5".
        score = _first(_SCORE_INT_RE, chunk) + _first(_SCORE_FRAC_RE, chunk)
        print('排名:', rank, '电影名称:', movie, '主演:', star,
              '上映时间:', timestamp, '评分:', score)


def main():
    """Fetch the ten board pages (offsets 0, 10, ..., 90) and print them."""
    for i in range(10):
        path = BOARD_URL + '?offset=' + str(i * 10)
        txt = get_html(path)
        if txt:
            parse_html(txt)


if __name__ == '__main__':
    main()
猫眼电影TOP100榜
猜你喜欢
转载自www.cnblogs.com/jp-mao/p/10005268.html
今日推荐
周排行