python数据分析实战一：IMDB Top 250

Top 250 Movies in IMDB

This article analyzes the IMDb Top 250 movies and covers data scraping, data preparation, data cleaning, data analysis and visualization.

Data scraping

First, we need to scrape the data from this website.

# import package
import pandas as pd
import time
import urllib.request
from lxml.html import fromstring
from bs4 import BeautifulSoup

# download html
def download(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    The request carries a desktop-browser ``User-agent`` header. Errors raised
    by ``urllib`` or by the UTF-8 decoding are not caught here and propagate
    to the caller.
    """
    print('Downloading:', url)
    request = urllib.request.Request(url)
    # Disguise the request as coming from a regular desktop browser.
    request.add_header('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36')
    # The context manager closes the response even if reading or decoding fails.
    with urllib.request.urlopen(request) as resp:
        html = resp.read().decode('utf-8')
    return html


# content to be scraped: one list per output column
Name = []
Year = []
Rate = []
Level = []
Directors = []
Writers = []
Stars = []
Genres = []
Runtime = []
Country = []
Language = []
Budget = []
Box_Office_USA = []
Box_Office_World = []

# NOTE: despite its name, start_url holds the downloaded HTML text of the chart page, not a URL.
start_url = download('https://www.imdb.com/chart/top/?ref_=nv_mv_250')
domain = 'https://www.imdb.com/'
# Name the parser explicitly. Without it BeautifulSoup picks whichever parser
# happens to be installed and emits a warning; lxml is already a dependency of
# this script (see the lxml.html import), so results stay the same.
start_soup = BeautifulSoup(start_url, 'lxml')

# scrape every item
# The chart's link list is the same on every iteration, so look it up once.
chart_links = start_soup.find_all('tbody')[0].find_all('a')

for k in range(250):
    # Link 2*k+1 is the one used for the k-th movie (presumably each chart row
    # contributes two links to the same title page -- verify against the page).
    sub_html = chart_links[2*k+1].get('href')
    # Join so that a site-relative href ('/title/...') does not produce '//'.
    url = download(domain.rstrip('/') + '/' + sub_html.lstrip('/'))
    time.sleep(3)  # throttle: wait 3 seconds after each page download
    # NOTE: `url` holds the page's HTML text; it is parsed twice, once for
    # XPath queries (tree) and once for BeautifulSoup navigation (soup).
    tree = fromstring(url)
    soup = BeautifulSoup(url, 'lxml')

    # The title is the text node right before the year span. Trim surrounding
    # whitespace (including a non-breaking space) instead of deleting every
    # occurrence of whatever the last character happens to be.
    name = soup.find('span', {'id': 'titleYear'}).previous_sibling
    Name.append(name.strip())
    Year.append(tree.xpath('//*[@id="titleYear"]/a')[0].text_content())
    Rate.append(tree.xpath('//*[@id="title-overview-widget"]/div[1]/div[2]/div/div[1]/div[1]/div[1]/strong/span')[0].text_content())
    Level.append(soup.find('div', {'class': 'subtext'}).span.previous_sibling.strip())

    # A missing label makes soup.find(...) return None, so the attribute access
    # raises AttributeError; that is how the singular/plural labels are told apart.
    try:
        Directors.append(soup.find(text='Director:').parent.parent.find('a').get_text())
    except AttributeError:
        directors = [a.get_text() for a in soup.find(text='Directors:').parent.parent.find_all('a')]
        Directors.append('/'.join(directors))
    try:
        writers = [a.get_text() for a in soup.find(text='Writers:').parent.parent.find_all('a')]
        Writers.append('/'.join(writers))
    except AttributeError:
        Writers.append(soup.find(text='Writer:').parent.parent.find('a').get_text())

    # Multi-valued fields are stored as one '/'-joined string per movie.
    stars = [a.get_text() for a in soup.find(text='Stars:').parent.parent.find_all('a')]
    Stars.append('/'.join(stars))
    genres = [a.get_text().strip() for a in soup.find(text='Genres:').parent.parent.find_all('a')]
    Genres.append('/'.join(genres))
    Runtime.append(soup.find(text='Runtime:').parent.parent.time.get_text())
    countries = [a.get_text() for a in soup.find(text='Country:').parent.parent.find_all('a')]
    Country.append('/'.join(countries))
    languages = [a.get_text() for a in soup.find(text='Language:').parent.parent.find_all('a')]
    Language.append('/'.join(languages))

    # The money fields are optional on a title page; record None when absent.
    try:
        Budget.append(soup.find(text='Budget:').parent.next_sibling.strip())
    except AttributeError:
        Budget.append(None)
    try:
        Box_Office_USA.append(soup.find(text='Gross USA:').parent.next_sibling.strip())
    except AttributeError:
        Box_Office_USA.append(None)
    try:
        Box_Office_World.append(soup.find(text='Cumulative Worldwide Gross:').parent.next_sibling.strip())
    except AttributeError:
        Box_Office_World.append(None)

    
# combine each column
# Build the table in one step: the dict keys become the column names, in this
# order. This replaces fourteen temporary one-column DataFrames plus concat.
movie_data = pd.DataFrame({
    'Name': Name,
    'Year': Year,
    'Rate': Rate,
    'Level': Level,
    'Directors': Directors,
    'Writers': Writers,
    'Stars': Stars,
    'Genres': Genres,
    'Runtime': Runtime,
    'Country': Country,
    'Language': Language,
    'Budget': Budget,
    'Box_Office_USA': Box_Office_USA,
    'Box_Office_World': Box_Office_World,
})


# output
# Write next to the script/notebook: the analysis part loads 'movie.csv' from the
# working directory, and a user-specific absolute path does not exist elsewhere.
outputpath = 'movie.csv'
# utf_8_sig writes a byte-order mark at the start of the file.
movie_data.to_csv(outputpath, sep=',', index=False, header=True, encoding='utf_8_sig')

Data preparation

Then, we need to import packages and load the data.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

# load the scraped table from movie.csv (relative to the working directory)
movie_data = pd.read_csv('movie.csv')
# preview the first rows
movie_data.head()

	Name	Year	Rate	Level	Directors	Writers	Stars	Genres	Runtime	Country	Language	Budget	Box_Office_USA	Box_Office_World
0	The Shawshank Redemption	1994	9.3	R	Frank Darabont	Stephen King/Frank Darabont	Tim Robbins/Morgan Freeman/Bob Gunton/See full...	Drama	142 min	USA	English	$25,000,000	$28,699,976	$28,815,291
1	The Godfather	1972	9.2	NaN	Francis Ford Coppola	Mario Puzo/Francis Ford Coppola/1 more credit	Marlon Brando/Al Pacino/James Caan/See full ca...	Crime/Drama	175 min	USA	English/Italian/Latin	$6,000,000	$134,966,411	$246,120,974
2	The Godfather: Part II	1974	9.0	NaN	Francis Ford Coppola	Francis Ford Coppola/Mario Puzo/1 more credit	Al Pacino/Robert De Niro/Robert Duvall/See ful...	Crime/Drama	202 min	USA	English/Italian/Spanish/Latin/Sicilian	$13,000,000	$47,834,595	$48,035,783
3	The Dark Knight	2008	9.0	PG-13	Christopher Nolan	Jonathan Nolan/Christopher Nolan/3 more credits	Christian Bale/Heath Ledger/Aaron Eckhart/See ...	Action/Crime/Drama/Thriller	152 min	USA/UK	English/Mandarin	$185,000,000	$535,234,033	$1,005,456,758
4	12 Angry Men	1957	8.9	NaN	Sidney Lumet	Reginald Rose/Reginald Rose	Henry Fonda/Lee J. Cobb/Martin Balsam/See full...	Crime/Drama	96 min	USA	English	$350,000	NaN	$576

As we can see, the raw data is messy (leftover link text in Writers and Stars, units in Runtime, currency strings, missing values), so we need to clean it before the analysis.

Data cleaning

Missing value

# count the missing values in each column
movie_data.isnull().sum()

Name                  0
Year                  0
Rate                  0
Level               104
Directors             0
Writers               0
Stars                 0
Genres                0
Runtime               0
Country               0
Language              0
Budget               23
Box_Office_USA       37
Box_Office_World     10
dtype: int64

# fill na with 'Not Known'
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the in-place form is chained assignment, which pandas deprecates
# and which does not update movie_data under Copy-on-Write.
movie_data['Level'] = movie_data['Level'].fillna('Not Known')
movie_data['Directors'] = movie_data['Directors'].fillna('Not Known')

Data wrangling

# delete redundant information in 'Writers'
def replace_1(x):
    """Return *x* with any "/N more credit(s)" marker removed.

    The marker is leftover link text from the scraped page, e.g.
    "Mario Puzo/Francis Ford Coppola/1 more credit". The optional plural "s"
    is matched as well; otherwise "3 more credits" would leave a stray "s"
    glued to the last writer's name.

    *x* must be a string; strings without the marker are returned unchanged.
    """
    return re.sub(r'/[0-9]+ more credits?', '', x)
# run replace_1 on every value of the 'Writers' column and store the result back
movie_data['Writers'] = movie_data['Writers'].apply(replace_1)

# delete redundant information in 'Stars'
def replace_2(x):
    """Return *x* with every "/See full cast & crew" marker removed.

    The marker is fixed text, so a plain string replacement is enough; this
    also avoids the invalid ``\\&`` escape the regex version relied on.

    *x* must be a string; strings without the marker are returned unchanged.
    """
    return x.replace('/See full cast & crew', '')
# run replace_2 on every value of the 'Stars' column and store the result back
movie_data['Stars'] = movie_data['Stars'].apply(replace_2)

# extract runtime
def extract_runtime(x):
    """Return the first run of digits in *x* as an int (e.g. '142 min' -> 142).

    Raises AttributeError when *x* contains no digit at all.
    """
    first_digits = re.search('[0-9]+', x)
    return int(first_digits.group())
# replace each 'Runtime' string with the integer extracted from it
movie_data['Runtime'] = movie_data['Runtime'].apply(extract_runtime)

# extract money (for simplicity, amounts in currencies other than US dollars are ignored)
def extract_number(x):
    """Convert a dollar amount such as '$25,000,000' to a float.

    Only the ASCII digits of *x* are kept, so thousands separators and any
    trailing text are ignored (a decimal point would be dropped as well).

    Returns None when *x* is not a string (e.g. a missing value), when it does
    not start with '$', or when it contains no digits.
    """
    if not isinstance(x, str) or not x.startswith('$'):
        return None
    digits = ''.join(re.findall(r'[0-9]', x))
    # Guard against float('') for a '$' string that carries no digits.
    return float(digits) if digits else None
# convert the three money columns from text to numbers with extract_number
for money_column in ('Budget', 'Box_Office_USA', 'Box_Office_World'):
    movie_data[money_column] = movie_data[money_column].apply(extract_number)

# preview the cleaned table
movie_data.head()

	Name	Year	Rate	Level	Directors	Writers	Stars	Genres	Runtime	Country	Language	Budget	Box_Office_USA	Box_Office_World
0	The Shawshank Redemption	1994	9.3	R	Frank Darabont	Stephen King/Frank Darabont	Tim Robbins/Morgan Freeman/Bob Gunton	Drama	142	USA	English	25000000.0	28699976.0	2.881529e+07
1	The Godfather	1972	9.2	Not Known	Francis Ford Coppola	Mario Puzo/Francis Ford Coppola	Marlon Brando/Al Pacino/James Caan	Crime/Drama	175	USA	English/Italian/Latin	6000000.0	134966411.0	2.461210e+08
2	The Godfather: Part II	1974	9.0	Not Known	Francis Ford Coppola	Francis Ford Coppola/Mario Puzo	Al Pacino/Robert De Niro/Robert Duvall	Crime/Drama	202	USA	English/Italian/Spanish/Latin/Sicilian	13000000.0	47834595.0	4.803578e+07
3	The Dark Knight	2008	9.0	PG-13	Christopher Nolan	Jonathan Nolan/Christopher Nolans	Christian Bale/Heath Ledger/Aaron Eckhart	Action/Crime/Drama/Thriller	152	USA/UK	English/Mandarin	185000000.0	535234033.0	1.005457e+09
4	12 Angry Men	1957	8.9	Not Known	Sidney Lumet	Reginald Rose/Reginald Rose	Henry Fonda/Lee J. Cobb/Martin Balsam	Crime/Drama	96	USA	English	350000.0	NaN	5.760000e+02

Now that the data is clean, we can analyze and visualize it.

Data analysis and visualization

Number of top250 movies every year

# number of movies per release year, plotted in chronological order
# (value_counts returns a Series, which has no columns to rename, so the
# former `year_counts.columns = ...` assignment had no effect and was dropped)
year_counts = movie_data['Year'].value_counts()
plt.figure(figsize=(15, 6.5))
year_counts.sort_index().plot.line(title='Number of top250 movies every year')

There are three peaks during the years.

The distribution of Rate

def get_histgram(x,n):
    """Draw a histogram of the ``movie_data`` column *x* using *n* bins."""
    chart_title = 'The histgram of {}'.format(x)
    column = movie_data[x]
    column.plot.hist(bins=n, title=chart_title, figsize=(15, 6.5))

# histogram of 'Rate' with 15 bins
get_histgram('Rate', 15)

As we can see, it is right skewed.

Percent of each level

# share of each 'Level' value, drawn as a pie chart
# (value_counts returns a Series, which has no columns to rename, so the
# former `level_counts.columns = ...` assignment had no effect and was dropped)
level_counts = movie_data['Level'].value_counts()
level_counts.plot.pie(figsize=(8, 8), title='Pie chart of the Level', legend=True)

Top 10 directors

def get_bar_chart(x):
    """Plot a bar chart of the ten most frequent entries of column *x*.

    Each cell of ``movie_data[x]`` is expected to hold several values joined by
    '/'. The cells are split, all spaces are removed from every value (so a
    label reads e.g. 'FrancisFordCoppola'), and the ten most common values are
    plotted.

    NOTE(review): the original called a helper ``f`` that is not defined
    anywhere in this file; splitting on '/' is assumed because that is the
    separator used when the columns were built -- confirm.
    """
    # dropna: a missing cell has nothing to split and is not iterable.
    split_cells = movie_data[x].dropna().str.split('/')
    items = []
    for element in split_cells:
        for item in element:
            items.append(item.replace(" ", ""))
    item_pd = pd.Series(items)
    item_pd.value_counts().head(10).plot.bar(figsize=(15, 6.5), title='Top 10 {}'.format(x))

get_bar_chart('Directors')

Top 10 Writers

# bar chart of the ten most frequent entries in 'Writers'
get_bar_chart('Writers')

Top 10 Stars

# bar chart of the ten most frequent entries in 'Stars'
get_bar_chart('Stars')

The distribution of runtime

# histogram of 'Runtime' with 20 bins
get_histgram('Runtime',20)

Percent of each genre

def get_pie_chart(x):
    """Plot a pie chart of how often each entry of column *x* occurs.

    Each cell of ``movie_data[x]`` is expected to hold several values joined by
    '/'. The cells are split, all spaces are removed from every value (so
    'Hong Kong' is counted as 'HongKong'), and the share of every distinct
    value is drawn.

    NOTE(review): the original called a helper ``f`` that is not defined
    anywhere in this file; splitting on '/' is assumed because that is the
    separator used when the columns were built -- confirm.
    """
    # dropna: a missing cell has nothing to split and is not iterable.
    split_cells = movie_data[x].dropna().str.split('/')
    items = []
    for element in split_cells:
        for item in element:
            items.append(item.replace(" ", ""))
    # value_counts returns a Series; it has no columns to rename, so the
    # former `.columns = [...]` assignment had no effect and was dropped.
    item_counts = pd.Series(items).value_counts()
    item_counts.plot.pie(figsize=(10, 10), title='Pie chart of the {}'.format(x), legend=True)

get_pie_chart('Genres')

Percent of each country

# pie chart of how often each entry occurs in 'Country'
get_pie_chart('Country')

Percent of each language

# pie chart of how often each entry occurs in 'Language'
get_pie_chart('Language')

The distribution of budget

# histogram of 'Budget' with 20 bins
get_histgram('Budget',20)

The distribution of Box_Office_USA

# histogram of 'Box_Office_USA' with 20 bins
get_histgram('Box_Office_USA',20)

The distribution of Box_Office_World

# histogram of 'Box_Office_World' with 20 bins
get_histgram('Box_Office_World',20)

Correlation among quantitative features

# Correlate the numeric columns only. The frame also holds text columns
# (Name, Genres, ...), and DataFrame.corr() raises on those in pandas >= 2.0
# instead of silently skipping them; select_dtypes works on old and new versions.
movie_data.select_dtypes(include='number').corr()

	Year	Rate	Runtime	Budget	Box_Office_USA	Box_Office_World
Year	1.000000	0.021213	0.160196	0.478785	0.312610	0.391026
Rate	0.021213	1.000000	0.244455	0.114680	0.215052	0.208516
Runtime	0.160196	0.244455	1.000000	0.178348	0.138281	0.131039
Budget	0.478785	0.114680	0.178348	1.000000	0.790742	0.841518
Box_Office_USA	0.312610	0.215052	0.138281	0.790742	1.000000	0.949837
Box_Office_World	0.391026	0.208516	0.131039	0.841518	0.949837	1.000000

Top 5 movies by budget

# rank the Name/Budget pairs by budget, largest first, and show the first five
movie_data.loc[:, ['Name', 'Budget']].sort_values('Budget', ascending=False).head(5)

	Name	Budget
73	Avengers: Endgame	356000000.0
63	Avengers: Infinity War	321000000.0
70	The Dark Knight Rises	250000000.0
110	Toy Story 3	200000000.0
3	The Dark Knight	185000000.0

Top 5 movies by USA box office

# rank the Name/Box_Office_USA pairs by USA gross, largest first, and show the first five
movie_data.loc[:, ['Name', 'Box_Office_USA']].sort_values('Box_Office_USA', ascending=False).head(5)

	Name	Box_Office_USA
73	Avengers: Endgame	858373000.0
63	Avengers: Infinity War	678815482.0
3	The Dark Knight	535234033.0
24	Star Wars	460998507.0
70	The Dark Knight Rises	448139099.0

Top 5 movies by worldwide box office

# rank the Name/Box_Office_World pairs by worldwide gross, largest first, and show the first five
movie_data.loc[:, ['Name', 'Box_Office_World']].sort_values('Box_Office_World', ascending=False).head(5)

	Name	Box_Office_World
73	Avengers: Endgame	2.797801e+09
63	Avengers: Infinity War	2.048360e+09
212	Harry Potter and the Deathly Hallows: Part 2	1.342193e+09
6	The Lord of the Rings: The Return of the King	1.142271e+09
70	The Dark Knight Rises	1.081141e+09

USA Box office Percent

# USA gross divided by worldwide gross (stored as a ratio, not multiplied by 100)
movie_data['USA_percent'] = movie_data['Box_Office_USA'].div(movie_data['Box_Office_World'])
# histogram of that ratio with 20 bins
get_histgram('USA_percent', 20)

Most commercially successful movies

# earning rate = worldwide gross / budget - 1
movie_data['Earning_rate'] = movie_data['Box_Office_World'].div(movie_data['Budget']).sub(1)
# the five movies with the highest earning rate
movie_data.loc[:, ['Name', 'Earning_rate']].sort_values('Earning_rate', ascending=False).head(5)

	Name	Earning_rate
230	Rocky	121.134309
166	Gone with the Wind	100.169872
24	Star Wars	69.524447
113	Jodaeiye Nader az Simin	44.852152
1	The Godfather	40.020162

Movies related to China

# Flag the movies whose Country field mentions China or Hong Kong.
# The Country values spell it 'Hong Kong' (with a space), so the former
# 'HongKong' test could never match a Hong Kong-only production.
logic_list = [('China' in movie or 'Hong Kong' in movie) for movie in movie_data['Country']]
# one boolean per row: keep the rows flagged True
Chinese_movie_list = movie_data.loc[logic_list]
Chinese_movie_list

	Name	Year	Rate	Level	Directors	Writers	Stars	Genres	Runtime	Country	Language	Budget	Box_Office_USA	Box_Office_World	USA_percent	Earning_rate
96	1917	2019	8.3	R	Sam Mendes	Sam Mendes/Krysty Wilson-Cairns	Dean-Charles Chapman/George MacKay/Daniel Mays	Drama/War	119	USA/UK/India/Spain/Canada/China	English/French/German	95000000.0	159227644.0	384792488.0	0.413801	3.050447
129	Green Book	2018	8.2	PG-13	Peter Farrelly	Nick Vallelonga/Brian Hayes Currie	Viggo Mortensen/Mahershala Ali/Linda Cardellini	Biography/Comedy/Drama/Music	130	USA/China	English/Italian/Russian/German	23000000.0	85080171.0	321752656.0	0.264427	12.989246
236	Fa yeung nin wah	2000	8.1	PG	Kar-Wai Wong	Kar-Wai Wong	Tony Chiu-Wai Leung/Maggie Cheung/Ping Lam Siu	Drama/Romance	98	Hong Kong/China	Cantonese/Shanghainese/French/Spanish	NaN	2738980.0	12854953.0	0.213068	NaN