This article analyzes the IMDb Top 250 movies, covering data scraping, data preparation, data cleaning, and data analysis and visualization.
Data scraping
First, we need to scrape the data: the Top 250 chart page gives us the link to each movie, and each movie's own page gives us its details.
import pandas as pd
import time
import urllib. request
from lxml. html import fromstring
from bs4 import BeautifulSoup
def download(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    A desktop-browser ``User-agent`` header is sent with the request.
    Errors raised by ``urllib`` while opening the URL, and
    ``UnicodeDecodeError`` from the UTF-8 decode, propagate to the caller.
    """
    print('Downloading:', url)
    request = urllib.request.Request(url)
    request.add_header('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36')
    # The context manager closes the response even if read()/decode() raises,
    # so repeated calls do not leave connections open.
    with urllib.request.urlopen(request) as resp:
        return resp.read().decode('utf-8')
def _linked_texts(soup, label):
    """Return the text of every <a> inside the grandparent of the *label* text node.

    Raises AttributeError when *label* does not occur in *soup*.
    """
    return [link.get_text() for link in soup.find(text=label).parent.parent.find_all('a')]


def _text_after(soup, label):
    """Return the stripped text node that follows *label*'s tag, or None if unavailable."""
    try:
        return soup.find(text=label).parent.next_sibling.strip()
    except AttributeError:
        # Either the label is missing or nothing follows it.
        return None


# One list per output column; entry i of every list belongs to the same movie.
Name = []
Year = []
Rate = []
Level = []
Directors = []
Writers = []
Stars = []
Genres = []
Runtime = []
Country = []
Language = []
Budget = []
Box_Office_USA = []
Box_Office_World = []

start_url = download('https://www.imdb.com/chart/top/?ref_=nv_mv_250')
domain = 'https://www.imdb.com/'
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed and emits a warning. lxml is already imported above.
start_soup = BeautifulSoup(start_url, 'lxml')
# The anchors of the chart table do not change between iterations, so they are
# looked up once rather than re-scanned for every movie.
chart_links = start_soup.find_all('tbody')[0].find_all('a')

for k in range(250):
    # The anchors at odd positions (2*k + 1) are used as the per-movie links.
    sub_html = chart_links[2 * k + 1].get('href')
    url = download(domain + sub_html)
    time.sleep(3)  # pause three seconds between page downloads
    tree = fromstring(url)
    soup = BeautifulSoup(url, 'lxml')

    name = soup.find('span', {'id': 'titleYear'}).previous_sibling
    # Drop only the final character. str.replace(name[-1], '') would also
    # delete every other occurrence of that character inside the title.
    Name.append(name[:-1])
    Year.append(tree.xpath('//*[@id="titleYear"]/a')[0].text_content())
    Rate.append(tree.xpath('//*[@id="title-overview-widget"]/div[1]/div[2]/div/div[1]/div[1]/div[1]/strong/span')[0].text_content())
    Level.append(soup.find('div', {'class': 'subtext'}).span.previous_sibling.strip())

    # Pages use either the singular or the plural credit label; try one and
    # fall back to the other.
    try:
        Directors.append(soup.find(text='Director:').parent.parent.find('a').get_text())
    except AttributeError:
        Directors.append('/'.join(_linked_texts(soup, 'Directors:')))
    try:
        Writers.append('/'.join(_linked_texts(soup, 'Writers:')))
    except AttributeError:
        Writers.append(soup.find(text='Writer:').parent.parent.find('a').get_text())

    Stars.append('/'.join(_linked_texts(soup, 'Stars:')))
    Genres.append('/'.join(text.strip() for text in _linked_texts(soup, 'Genres:')))
    Runtime.append(soup.find(text='Runtime:').parent.parent.time.get_text())
    Country.append('/'.join(_linked_texts(soup, 'Country:')))
    Language.append('/'.join(_linked_texts(soup, 'Language:')))

    # Money fields are optional; a missing one is stored as None.
    Budget.append(_text_after(soup, 'Budget:'))
    Box_Office_USA.append(_text_after(soup, 'Gross USA:'))
    Box_Office_World.append(_text_after(soup, 'Cumulative Worldwide Gross:'))

# Build the table in one step; dict order fixes the column order.
movie_data = pd.DataFrame({
    'Name': Name,
    'Year': Year,
    'Rate': Rate,
    'Level': Level,
    'Directors': Directors,
    'Writers': Writers,
    'Stars': Stars,
    'Genres': Genres,
    'Runtime': Runtime,
    'Country': Country,
    'Language': Language,
    'Budget': Budget,
    'Box_Office_USA': Box_Office_USA,
    'Box_Office_World': Box_Office_World,
})
# NOTE(review): this is a machine-specific absolute path, while the loading
# step later reads 'movie.csv' from the working directory - confirm they match.
outputpath = 'c:/Users/zxw/Desktop/修身/与自己/数据分析/数据分析/爬虫/movie.csv'
movie_data.to_csv(outputpath, sep=',', index=False, header=True, encoding='utf_8_sig')
Data preparation
Then, we need to import packages and load the data.
import pandas as pd
import seaborn as sns
import matplotlib. pyplot as plt
import re
# Load the scraped table from 'movie.csv' (relative to the working directory)
# and preview its first rows.
movie_data = pd. read_csv( 'movie.csv' )
movie_data. head( )
Name
Year
Rate
Level
Directors
Writers
Stars
Genres
Runtime
Country
Language
Budget
Box_Office_USA
Box_Office_World
0
The Shawshank Redemption
1994
9.3
R
Frank Darabont
Stephen King/Frank Darabont
Tim Robbins/Morgan Freeman/Bob Gunton/See full...
Drama
142 min
USA
English
$25,000,000
$28,699,976
$28,815,291
1
The Godfather
1972
9.2
NaN
Francis Ford Coppola
Mario Puzo/Francis Ford Coppola/1 more credit
Marlon Brando/Al Pacino/James Caan/See full ca...
Crime/Drama
175 min
USA
English/Italian/Latin
$6,000,000
$134,966,411
$246,120,974
2
The Godfather: Part II
1974
9.0
NaN
Francis Ford Coppola
Francis Ford Coppola/Mario Puzo/1 more credit
Al Pacino/Robert De Niro/Robert Duvall/See ful...
Crime/Drama
202 min
USA
English/Italian/Spanish/Latin/Sicilian
$13,000,000
$47,834,595
$48,035,783
3
The Dark Knight
2008
9.0
PG-13
Christopher Nolan
Jonathan Nolan/Christopher Nolan/3 more credits
Christian Bale/Heath Ledger/Aaron Eckhart/See ...
Action/Crime/Drama/Thriller
152 min
USA/UK
English/Mandarin
$185,000,000
$535,234,033
$1,005,456,758
4
12 Angry Men
1957
8.9
NaN
Sidney Lumet
Reginald Rose/Reginald Rose
Henry Fonda/Lee J. Cobb/Martin Balsam/See full...
Crime/Drama
96 min
USA
English
$350,000
NaN
$576
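As we can see, the raw data is messy: some values are missing, the Writers and Stars columns contain leftover link text such as "1 more credit" and "See full cast & crew", and the runtime and money columns are stored as text. We need to clean it before analysis.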
Data cleaning
Missing value
movie_data. isnull( ) . sum ( )
Name 0
Year 0
Rate 0
Level 104
Directors 0
Writers 0
Stars 0
Genres 0
Runtime 0
Country 0
Language 0
Budget 23
Box_Office_USA 37
Box_Office_World 10
dtype: int64
# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# movie_data[...] acts on an intermediate object: pandas warns about this
# pattern, and under copy-on-write the frame itself is left unchanged.
movie_data['Level'] = movie_data['Level'].fillna('Not Known')
movie_data['Directors'] = movie_data['Directors'].fillna('Not Known')
Data wrangling
def replace_1(x):
    """Remove the "/N more credit(s)" marker from a '/'-joined writers string.

    Args:
        x: Writers string, e.g. "Mario Puzo/Francis Ford Coppola/1 more credit".

    Returns:
        *x* without any "/<digits> more credit" or "/<digits> more credits"
        fragment; *x* unchanged when no such fragment is present.
    """
    # "s?" also consumes the plural form. Without it "/3 more credits" left a
    # stray "s" glued to the last writer's name.
    return re.sub(r'/[0-9]+ more credits?', '', x)
# Clean every entry of the Writers column with replace_1.
movie_data[ 'Writers' ] = movie_data[ 'Writers' ] . apply ( replace_1)
def replace_2(x):
    """Remove the "/See full cast & crew" link text from a '/'-joined stars string.

    Args:
        x: Stars string, e.g. "Tim Robbins/Morgan Freeman/See full cast & crew".

    Returns:
        *x* with every "/See full cast & crew" occurrence removed; *x*
        unchanged when the text is not present.
    """
    # The text to remove is fixed, so a plain str.replace is enough. The old
    # regex literal '\&' was an invalid escape sequence in a normal string.
    return x.replace('/See full cast & crew', '')
# Clean every entry of the Stars column with replace_2.
movie_data[ 'Stars' ] = movie_data[ 'Stars' ] . apply ( replace_2)
def extract_runtime(x):
    """Return the first run of decimal digits in *x* as an int.

    Raises AttributeError when *x* contains no digits.
    """
    first_number = re.search('[0-9]+', x)
    digits = first_number.group()
    return int(digits)
# Convert every Runtime entry to an int with extract_runtime.
movie_data[ 'Runtime' ] = movie_data[ 'Runtime' ] . apply ( extract_runtime)
def extract_number(x):
    """Convert a dollar amount such as "$25,000,000" to a float.

    Args:
        x: Raw cell value; a string, or a non-string such as NaN/None for a
            missing entry.

    Returns:
        The digits of *x* joined together and converted to float, or None when
        *x* is not a string, does not start with "$", or contains no digits.
    """
    # Missing cells arrive as non-strings; test for them up front instead of
    # relying on a TypeError from the regex engine.
    if not isinstance(x, str) or not x.startswith('$'):
        return None
    digits = ''.join(re.findall('[0-9]', x))
    # float('') raises ValueError, so a bare "$" is treated as missing.
    return float(digits) if digits else None
# Convert the three money columns from text to numbers, then preview the result.
for money_column in ('Budget', 'Box_Office_USA', 'Box_Office_World'):
    movie_data[money_column] = movie_data[money_column].apply(extract_number)
movie_data.head()
Name
Year
Rate
Level
Directors
Writers
Stars
Genres
Runtime
Country
Language
Budget
Box_Office_USA
Box_Office_World
0
The Shawshank Redemption
1994
9.3
R
Frank Darabont
Stephen King/Frank Darabont
Tim Robbins/Morgan Freeman/Bob Gunton
Drama
142
USA
English
25000000.0
28699976.0
2.881529e+07
1
The Godfather
1972
9.2
Not Known
Francis Ford Coppola
Mario Puzo/Francis Ford Coppola
Marlon Brando/Al Pacino/James Caan
Crime/Drama
175
USA
English/Italian/Latin
6000000.0
134966411.0
2.461210e+08
2
The Godfather: Part II
1974
9.0
Not Known
Francis Ford Coppola
Francis Ford Coppola/Mario Puzo
Al Pacino/Robert De Niro/Robert Duvall
Crime/Drama
202
USA
English/Italian/Spanish/Latin/Sicilian
13000000.0
47834595.0
4.803578e+07
3
The Dark Knight
2008
9.0
PG-13
Christopher Nolan
Jonathan Nolan/Christopher Nolans
Christian Bale/Heath Ledger/Aaron Eckhart
Action/Crime/Drama/Thriller
152
USA/UK
English/Mandarin
185000000.0
535234033.0
1.005457e+09
4
12 Angry Men
1957
8.9
Not Known
Sidney Lumet
Reginald Rose/Reginald Rose
Henry Fonda/Lee J. Cobb/Martin Balsam
Crime/Drama
96
USA
English
350000.0
NaN
5.760000e+02
Now that the data is clean, we can analyze and visualize it.
Data analysis and visualization
Number of top250 movies every year
# Count the movies per release year; the result is a Series indexed by year.
year_counts = movie_data['Year'].value_counts()
# A Series has no columns, so assigning ".columns" did nothing. Label the
# index and the values explicitly instead.
year_counts = year_counts.rename_axis('Year').rename('Counts')
plt.figure(figsize=(15, 6.5))
# Sort by year so the line runs chronologically rather than by frequency.
year_counts.sort_index().plot.line(title='Number of top250 movies every year')
There are three peaks during the years.
The distribution of Rate
def get_histgram(x, n):
    """Plot a histogram of column *x* of ``movie_data`` using *n* bins."""
    column = movie_data[x]
    chart_title = 'The histgram of {}'.format(x)
    column.plot.hist(bins=n, title=chart_title, figsize=(15, 6.5))
get_histgram( 'Rate' , 15 )
As we can see, the distribution of ratings is right-skewed.
Percent of each level
# Count the movies per Level value; the result is a Series indexed by level.
level_counts = movie_data['Level'].value_counts()
# A Series has no columns, so assigning ".columns" did nothing. Label the
# index and the values explicitly instead.
level_counts = level_counts.rename_axis('Level').rename('Counts')
level_counts.plot.pie(figsize=(8, 8), title='Pie chart of the Level', legend=True)
Top 10 directors
def get_bar_chart(x):
    """Draw a bar chart of the 10 most frequent entries in column *x*.

    Every cell of ``movie_data[x]`` is split on '/', spaces are removed from
    each piece, and the pieces are counted across the whole column.

    Args:
        x: Name of a ``movie_data`` column holding '/'-joined strings.
    """
    # The original mapped the column through a helper ``f`` that is not
    # defined anywhere in the visible source (NameError). The cells are
    # '/'-joined strings, so split them directly. Missing cells are skipped.
    items = [
        piece.replace(" ", "")
        for cell in movie_data[x].dropna()
        for piece in cell.split('/')
    ]
    counts = pd.Series(items).value_counts()
    counts.head(10).plot.bar(figsize=(15, 6.5), title='Top 10 {}'.format(x))
get_bar_chart( 'Directors' )
Top 10 Writers
get_bar_chart( 'Writers' )
Top 10 Stars
get_bar_chart( 'Stars' )
The distribution of runtime
get_histgram( 'Runtime' , 20 )
Percent of each genre
def get_pie_chart(x):
    """Draw a pie chart of how often each entry occurs in column *x*.

    Every cell of ``movie_data[x]`` is split on '/', spaces are removed from
    each piece, and the pieces are counted across the whole column.

    Args:
        x: Name of a ``movie_data`` column holding '/'-joined strings.
    """
    # The original mapped the column through a helper ``f`` that is not
    # defined anywhere in the visible source (NameError). The cells are
    # '/'-joined strings, so split them directly. Missing cells are skipped.
    items = [
        piece.replace(" ", "")
        for cell in movie_data[x].dropna()
        for piece in cell.split('/')
    ]
    item_counts = pd.Series(items).value_counts()
    # A Series has no columns, so assigning ".columns" did nothing. Label the
    # index and the values explicitly instead.
    item_counts = item_counts.rename_axis(x).rename('Counts')
    item_counts.plot.pie(figsize=(10, 10), title='Pie chart of the {}'.format(x), legend=True)
get_pie_chart( 'Genres' )
Percent of each country
get_pie_chart( 'Country' )
Percent of each language
get_pie_chart( 'Language' )
The distribution of budget
get_histgram( 'Budget' , 20 )
The distribution of Box_Office_USA
get_histgram( 'Box_Office_USA' , 20 )
The distribution of Box_Office_World
get_histgram( 'Box_Office_World' , 20 )
Correlation among quantitative features
# Correlate the numeric columns only. The frame also holds text columns, and
# recent pandas versions raise on them instead of silently dropping them.
movie_data.select_dtypes(include='number').corr()
Year
Rate
Runtime
Budget
Box_Office_USA
Box_Office_World
Year
1.000000
0.021213
0.160196
0.478785
0.312610
0.391026
Rate
0.021213
1.000000
0.244455
0.114680
0.215052
0.208516
Runtime
0.160196
0.244455
1.000000
0.178348
0.138281
0.131039
Budget
0.478785
0.114680
0.178348
1.000000
0.790742
0.841518
Box_Office_USA
0.312610
0.215052
0.138281
0.790742
1.000000
0.949837
Box_Office_World
0.391026
0.208516
0.131039
0.841518
0.949837
1.000000
Top 5 movies by budget
movie_data[ [ 'Name' , 'Budget' ] ] . sort_values( by = 'Budget' , ascending = False ) . head( )
Name
Budget
73
Avengers: Endgame
356000000.0
63
Avengers: Infinity War
321000000.0
70
The Dark Knight Rises
250000000.0
110
Toy Story 3
200000000.0
3
The Dark Knight
185000000.0
Top 5 movies by USA box office
movie_data[ [ 'Name' , 'Box_Office_USA' ] ] . sort_values( by = 'Box_Office_USA' , ascending = False ) . head( )
Name
Box_Office_USA
73
Avengers: Endgame
858373000.0
63
Avengers: Infinity War
678815482.0
3
The Dark Knight
535234033.0
24
Star Wars
460998507.0
70
The Dark Knight Rises
448139099.0
Top 5 movies by worldwide box office
movie_data[ [ 'Name' , 'Box_Office_World' ] ] . sort_values( by = 'Box_Office_World' , ascending = False ) . head( )
Name
Box_Office_World
73
Avengers: Endgame
2.797801e+09
63
Avengers: Infinity War
2.048360e+09
212
Harry Potter and the Deathly Hallows: Part 2
1.342193e+09
6
The Lord of the Rings: The Return of the King
1.142271e+09
70
The Dark Knight Rises
1.081141e+09
USA Box office Percent
movie_data[ 'USA_percent' ] = movie_data[ 'Box_Office_USA' ] / movie_data[ 'Box_Office_World' ]
get_histgram( 'USA_percent' , 20 )
Most commercially successful movies
movie_data[ 'Earning_rate' ] = movie_data[ 'Box_Office_World' ] / movie_data[ 'Budget' ] - 1
movie_data[ [ 'Name' , 'Earning_rate' ] ] . sort_values( by = 'Earning_rate' , ascending = False ) . head( )
Name
Earning_rate
230
Rocky
121.134309
166
Gone with the Wind
100.169872
24
Star Wars
69.524447
113
Jodaeiye Nader az Simin
44.852152
1
The Godfather
40.020162
Movies related to China
# Flag the movies whose Country field mentions China or Hong Kong.
# Country values keep their spaces ("Hong Kong/China"), so the earlier test
# for 'HongKong' could never match.
logic_list = [
    'China' in country or 'Hong Kong' in country
    for country in movie_data['Country']
]
Chinese_movie_list = movie_data.loc[logic_list]
Chinese_movie_list
Name
Year
Rate
Level
Directors
Writers
Stars
Genres
Runtime
Country
Language
Budget
Box_Office_USA
Box_Office_World
USA_percent
Earning_rate
96
1917
2019
8.3
R
Sam Mendes
Sam Mendes/Krysty Wilson-Cairns
Dean-Charles Chapman/George MacKay/Daniel Mays
Drama/War
119
USA/UK/India/Spain/Canada/China
English/French/German
95000000.0
159227644.0
384792488.0
0.413801
3.050447
129
Green Book
2018
8.2
PG-13
Peter Farrelly
Nick Vallelonga/Brian Hayes Currie
Viggo Mortensen/Mahershala Ali/Linda Cardellini
Biography/Comedy/Drama/Music
130
USA/China
English/Italian/Russian/German
23000000.0
85080171.0
321752656.0
0.264427
12.989246
236
Fa yeung nin wah
2000
8.1
PG
Kar-Wai Wong
Kar-Wai Wong
Tony Chiu-Wai Leung/Maggie Cheung/Ping Lam Siu
Drama/Romance
98
Hong Kong/China
Cantonese/Shanghainese/French/Spanish
NaN
2738980.0
12854953.0
0.213068
NaN