# Load the movie_altered dataset and preview its first rows.
movie = pd.read_csv(filepath_or_buffer='data/movie_altered.csv')
movie.head()
title |
rating |
year |
duration |
director_1 |
director_fb_likes_1 |
actor_1 |
actor_2 |
actor_3 |
actor_fb_likes_1 |
actor_fb_likes_2 |
actor_fb_likes_3 |
0 |
Avatar |
PG-13 |
2009.0 |
178.0 |
James Cameron |
0.0 |
CCH Pounder |
Joel David Moore |
Wes Studi |
1000.0 |
936.0 |
855.0 |
1 |
Pirates of the Caribbean: At World's End |
PG-13 |
2007.0 |
169.0 |
Gore Verbinski |
563.0 |
Johnny Depp |
Orlando Bloom |
Jack Davenport |
40000.0 |
5000.0 |
1000.0 |
2 |
Spectre |
PG-13 |
2015.0 |
148.0 |
Sam Mendes |
0.0 |
Christoph Waltz |
Rory Kinnear |
Stephanie Sigman |
11000.0 |
393.0 |
161.0 |
3 |
The Dark Knight Rises |
PG-13 |
2012.0 |
164.0 |
Christopher Nolan |
22000.0 |
Tom Hardy |
Christian Bale |
Joseph Gordon-Levitt |
27000.0 |
23000.0 |
23000.0 |
4 |
Star Wars: Episode VII - The Force Awakens |
NaN |
NaN |
NaN |
Doug Walker |
131.0 |
Doug Walker |
Rob Walker |
NaN |
131.0 |
12.0 |
NaN |
# Insert a new leading 'id' column (0, 1, 2, ...) that identifies each movie.
movie.insert(loc=0, column='id', value=np.arange(movie.shape[0]))
movie.head()
|
id |
title |
rating |
year |
duration |
director_1 |
director_fb_likes_1 |
actor_1 |
actor_2 |
actor_3 |
actor_fb_likes_1 |
actor_fb_likes_2 |
actor_fb_likes_3 |
0 |
0 |
Avatar |
PG-13 |
2009.0 |
178.0 |
James Cameron |
0.0 |
CCH Pounder |
Joel David Moore |
Wes Studi |
1000.0 |
936.0 |
855.0 |
1 |
1 |
Pirates of the Caribbean: At World's End |
PG-13 |
2007.0 |
169.0 |
Gore Verbinski |
563.0 |
Johnny Depp |
Orlando Bloom |
Jack Davenport |
40000.0 |
5000.0 |
1000.0 |
2 |
2 |
Spectre |
PG-13 |
2015.0 |
148.0 |
Sam Mendes |
0.0 |
Christoph Waltz |
Rory Kinnear |
Stephanie Sigman |
11000.0 |
393.0 |
161.0 |
3 |
3 |
The Dark Knight Rises |
PG-13 |
2012.0 |
164.0 |
Christopher Nolan |
22000.0 |
Tom Hardy |
Christian Bale |
Joseph Gordon-Levitt |
27000.0 |
23000.0 |
23000.0 |
4 |
4 |
Star Wars: Episode VII - The Force Awakens |
NaN |
NaN |
NaN |
Doug Walker |
131.0 |
Doug Walker |
Rob Walker |
NaN |
131.0 |
12.0 |
NaN |
# Use wide_to_long to stack the numbered columns: every column named
# '<stub>_<n>' is folded into a single '<stub>' column (all actors in one
# column, all Facebook likes in another), with <n> stored in 'num'.
stubnames = ['director', 'director_fb_likes', 'actor', 'actor_fb_likes']
movie_long = (
    pd.wide_to_long(movie, stubnames, i='id', j='num', sep='_')
    .reset_index()
    .astype({'num': int})
)
movie_long.head(9)
|
id |
num |
title |
duration |
year |
rating |
director |
director_fb_likes |
actor |
actor_fb_likes |
0 |
0 |
1 |
Avatar |
178.0 |
2009.0 |
PG-13 |
James Cameron |
0.0 |
CCH Pounder |
1000.0 |
1 |
0 |
2 |
Avatar |
178.0 |
2009.0 |
PG-13 |
NaN |
NaN |
Joel David Moore |
936.0 |
2 |
0 |
3 |
Avatar |
178.0 |
2009.0 |
PG-13 |
NaN |
NaN |
Wes Studi |
855.0 |
3 |
1 |
1 |
Pirates of the Caribbean: At World's End |
169.0 |
2007.0 |
PG-13 |
Gore Verbinski |
563.0 |
Johnny Depp |
40000.0 |
4 |
1 |
2 |
Pirates of the Caribbean: At World's End |
169.0 |
2007.0 |
PG-13 |
NaN |
NaN |
Orlando Bloom |
5000.0 |
5 |
1 |
3 |
Pirates of the Caribbean: At World's End |
169.0 |
2007.0 |
PG-13 |
NaN |
NaN |
Jack Davenport |
1000.0 |
6 |
2 |
1 |
Spectre |
148.0 |
2015.0 |
PG-13 |
Sam Mendes |
0.0 |
Christoph Waltz |
11000.0 |
7 |
2 |
2 |
Spectre |
148.0 |
2015.0 |
PG-13 |
NaN |
NaN |
Rory Kinnear |
393.0 |
8 |
2 |
3 |
Spectre |
148.0 |
2015.0 |
PG-13 |
NaN |
NaN |
Stephanie Sigman |
161.0 |
# Break the long table apart into several smaller tables: one for movie
# attributes, one for directors and one for actors.
movie_table = movie_long.loc[:, ['id', 'title', 'year', 'duration', 'rating']]
director_table = movie_long.loc[:, ['id', 'director', 'num', 'director_fb_likes']]
actor_table = movie_long.loc[:, ['id', 'actor', 'num', 'actor_fb_likes']]
movie_table.head(9)
|
id |
title |
year |
duration |
rating |
0 |
0 |
Avatar |
2009.0 |
178.0 |
PG-13 |
1 |
0 |
Avatar |
2009.0 |
178.0 |
PG-13 |
2 |
0 |
Avatar |
2009.0 |
178.0 |
PG-13 |
3 |
1 |
Pirates of the Caribbean: At World's End |
2007.0 |
169.0 |
PG-13 |
4 |
1 |
Pirates of the Caribbean: At World's End |
2007.0 |
169.0 |
PG-13 |
5 |
1 |
Pirates of the Caribbean: At World's End |
2007.0 |
169.0 |
PG-13 |
6 |
2 |
Spectre |
2015.0 |
148.0 |
PG-13 |
7 |
2 |
Spectre |
2015.0 |
148.0 |
PG-13 |
8 |
2 |
Spectre |
2015.0 |
148.0 |
PG-13 |
# Preview the first nine rows of director_table.
director_table.head(9)
|
id |
director |
num |
director_fb_likes |
0 |
0 |
James Cameron |
1 |
0.0 |
1 |
0 |
NaN |
2 |
NaN |
2 |
0 |
NaN |
3 |
NaN |
3 |
1 |
Gore Verbinski |
1 |
563.0 |
4 |
1 |
NaN |
2 |
NaN |
5 |
1 |
NaN |
3 |
NaN |
6 |
2 |
Sam Mendes |
1 |
0.0 |
7 |
2 |
NaN |
2 |
NaN |
8 |
2 |
NaN |
3 |
NaN |
# Preview the first nine rows of actor_table.
actor_table.head(9)
|
id |
actor |
num |
actor_fb_likes |
0 |
0 |
CCH Pounder |
1 |
1000.0 |
1 |
0 |
Joel David Moore |
2 |
936.0 |
2 |
0 |
Wes Studi |
3 |
855.0 |
3 |
1 |
Johnny Depp |
1 |
40000.0 |
4 |
1 |
Orlando Bloom |
2 |
5000.0 |
5 |
1 |
Jack Davenport |
3 |
1000.0 |
6 |
2 |
Christoph Waltz |
1 |
11000.0 |
7 |
2 |
Rory Kinnear |
2 |
393.0 |
8 |
2 |
Stephanie Sigman |
3 |
161.0 |
# Clean up: remove duplicated movie rows, and remove director/actor rows
# that contain missing values; each table gets a fresh 0..n-1 index.
# NOTE(review): dropna() with its default how='any' discards a row when ANY
# column is missing, so a person with a name but no likes value is dropped
# as well -- confirm this is intended.
movie_table = movie_table.drop_duplicates()
movie_table = movie_table.reset_index(drop=True)
director_table = director_table.dropna(how='any')
director_table = director_table.reset_index(drop=True)
actor_table = actor_table.dropna(how='any')
actor_table = actor_table.reset_index(drop=True)
movie_table.head()
|
id |
title |
year |
duration |
rating |
0 |
0 |
Avatar |
2009.0 |
178.0 |
PG-13 |
1 |
1 |
Pirates of the Caribbean: At World's End |
2007.0 |
169.0 |
PG-13 |
2 |
2 |
Spectre |
2015.0 |
148.0 |
PG-13 |
3 |
3 |
The Dark Knight Rises |
2012.0 |
164.0 |
PG-13 |
4 |
4 |
Star Wars: Episode VII - The Force Awakens |
NaN |
NaN |
NaN |
# Preview director_table after the rows with missing values were removed.
director_table.head()
|
id |
director |
num |
director_fb_likes |
0 |
0 |
James Cameron |
1 |
0.0 |
1 |
1 |
Gore Verbinski |
1 |
563.0 |
2 |
2 |
Sam Mendes |
1 |
0.0 |
3 |
3 |
Christopher Nolan |
1 |
22000.0 |
4 |
4 |
Doug Walker |
1 |
131.0 |
# Compare memory usage. With deep=True pandas inspects object-dtype values
# to report their actual memory consumption instead of only pointer sizes.
movie.memory_usage(deep=True).sum()
#2289818
sum(
    table.memory_usage(deep=True).sum()
    for table in (movie_table, director_table, actor_table)
)
#2538166
# Create id columns for directors and actors: the integer category code of
# each name is inserted as the second column of the respective table.
director_cat = pd.Categorical(director_table['director'])
director_table.insert(loc=1, column='director_id', value=director_cat.codes)
actor_cat = pd.Categorical(actor_table['actor'])
actor_table.insert(loc=1, column='actor_id', value=actor_cat.codes)
director_table.head()
|
id |
director_id |
director |
num |
director_fb_likes |
0 |
0 |
922 |
James Cameron |
1 |
0.0 |
1 |
1 |
794 |
Gore Verbinski |
1 |
563.0 |
2 |
2 |
2020 |
Sam Mendes |
1 |
0.0 |
3 |
3 |
373 |
Christopher Nolan |
1 |
22000.0 |
4 |
4 |
600 |
Doug Walker |
1 |
131.0 |
# Preview actor_table with its new actor_id column.
actor_table.head()
|
id |
actor_id |
actor |
num |
actor_fb_likes |
0 |
0 |
824 |
CCH Pounder |
1 |
1000.0 |
1 |
0 |
2867 |
Joel David Moore |
2 |
936.0 |
2 |
0 |
6099 |
Wes Studi |
3 |
855.0 |
3 |
1 |
2971 |
Johnny Depp |
1 |
40000.0 |
4 |
1 |
4536 |
Orlando Bloom |
2 |
5000.0 |
# These two tables can be used to build the intermediate tables needed.
# Start with the directors: an associative (movie id <-> director id) table
# and a table with one row per distinct director record.
dcols = ['director_id', 'director', 'director_fb_likes']
director_associative = director_table.loc[:, ['id', 'director_id', 'num']]
director_unique = (
    director_table.loc[:, dcols]
    .drop_duplicates()
    .reset_index(drop=True)
)
director_associative.head()
|
id |
director_id |
num |
0 |
0 |
922 |
1 |
1 |
1 |
794 |
1 |
2 |
2 |
2020 |
1 |
3 |
3 |
373 |
1 |
4 |
4 |
600 |
1 |
# Preview the de-duplicated director lookup table.
director_unique.head()
|
director_id |
director |
director_fb_likes |
0 |
922 |
James Cameron |
0.0 |
1 |
794 |
Gore Verbinski |
563.0 |
2 |
2020 |
Sam Mendes |
0.0 |
3 |
373 |
Christopher Nolan |
22000.0 |
4 |
600 |
Doug Walker |
131.0 |
# Now do the same for the actors: an associative (movie id <-> actor id)
# table and a table with one row per distinct actor record.
acols = ['actor_id', 'actor', 'actor_fb_likes']
actor_associative = actor_table.loc[:, ['id', 'actor_id', 'num']]
actor_unique = (
    actor_table.loc[:, acols]
    .drop_duplicates()
    .reset_index(drop=True)
)
actor_associative.head()
|
id |
actor_id |
num |
0 |
0 |
824 |
1 |
1 |
0 |
2867 |
2 |
2 |
0 |
6099 |
3 |
3 |
1 |
2971 |
1 |
4 |
1 |
4536 |
2 |
# Preview the de-duplicated actor lookup table.
actor_unique.head()
|
actor_id |
actor |
actor_fb_likes |
0 |
824 |
CCH Pounder |
1000.0 |
1 |
2867 |
Joel David Moore |
936.0 |
2 |
6099 |
Wes Studi |
855.0 |
3 |
2971 |
Johnny Depp |
40000.0 |
4 |
4536 |
Orlando Bloom |
5000.0 |
# Check how much memory the new set of tables uses in total.
sum(
    table.memory_usage(deep=True).sum()
    for table in (
        movie_table,
        director_associative,
        director_unique,
        actor_associative,
        actor_unique,
    )
)
#1746766
movie_table.head()
|
id |
title |
year |
duration |
rating |
0 |
0 |
Avatar |
2009.0 |
178.0 |
PG-13 |
1 |
1 |
Pirates of the Caribbean: At World's End |
2007.0 |
169.0 |
PG-13 |
2 |
2 |
Spectre |
2015.0 |
148.0 |
PG-13 |
3 |
3 |
The Dark Knight Rises |
2012.0 |
164.0 |
PG-13 |
4 |
4 |
Star Wars: Episode VII - The Force Awakens |
NaN |
NaN |
NaN |
# The original movie table can be rebuilt by joining the tables back
# together. First merge each associative table with its actor/director
# lookup table, then pivot on the 'num' column, and finally flatten the
# two-level column index into '<name>_<num>' labels.
#
# `axis=1` is passed by keyword: the positional form `.drop(label, 1)` is
# deprecated since pandas 1.3 and raises TypeError on pandas 2.0+.
actors = (
    actor_associative.merge(actor_unique, on='actor_id')
    .drop('actor_id', axis=1)
    .pivot_table(index='id', columns='num', aggfunc='first')
)
actors.columns = (
    actors.columns.get_level_values(0)
    + '_'
    + actors.columns.get_level_values(1).astype(str)
)
directors = (
    director_associative.merge(director_unique, on='director_id')
    .drop('director_id', axis=1)
    .pivot_table(index='id', columns='num', aggfunc='first')
)
directors.columns = (
    directors.columns.get_level_values(0)
    + '_'
    + directors.columns.get_level_values(1).astype(str)
)
actors.head()
|
actor_1 |
actor_2 |
actor_3 |
actor_fb_likes_1 |
actor_fb_likes_2 |
actor_fb_likes_3 |
id |
|
|
|
|
|
|
0 |
CCH Pounder |
Joel David Moore |
Wes Studi |
1000.0 |
936.0 |
855.0 |
1 |
Johnny Depp |
Orlando Bloom |
Jack Davenport |
40000.0 |
5000.0 |
1000.0 |
2 |
Christoph Waltz |
Rory Kinnear |
Stephanie Sigman |
11000.0 |
393.0 |
161.0 |
3 |
Tom Hardy |
Christian Bale |
Joseph Gordon-Levitt |
27000.0 |
23000.0 |
23000.0 |
4 |
Doug Walker |
Rob Walker |
NaN |
131.0 |
12.0 |
NaN |
# Preview the pivoted director columns (indexed by movie id).
directors.head()
|
director_1 |
director_fb_likes_1 |
id |
|
|
0 |
James Cameron |
0.0 |
1 |
Gore Verbinski |
563.0 |
2 |
Sam Mendes |
0.0 |
3 |
Christopher Nolan |
22000.0 |
4 |
Doug Walker |
131.0 |
# Left-join the pivoted director and actor columns back onto movie_table
# by 'id', so every movie row is kept even if it has no matching people.
movie2 = (
    movie_table
    .merge(directors.reset_index(), how='left', on='id')
    .merge(actors.reset_index(), how='left', on='id')
)
movie2.head()
|
id |
title |
year |
duration |
rating |
director_1 |
director_fb_likes_1 |
actor_1 |
actor_2 |
actor_3 |
actor_fb_likes_1 |
actor_fb_likes_2 |
actor_fb_likes_3 |
0 |
0 |
Avatar |
2009.0 |
178.0 |
PG-13 |
James Cameron |
0.0 |
CCH Pounder |
Joel David Moore |
Wes Studi |
1000.0 |
936.0 |
855.0 |
1 |
1 |
Pirates of the Caribbean: At World's End |
2007.0 |
169.0 |
PG-13 |
Gore Verbinski |
563.0 |
Johnny Depp |
Orlando Bloom |
Jack Davenport |
40000.0 |
5000.0 |
1000.0 |
2 |
2 |
Spectre |
2015.0 |
148.0 |
PG-13 |
Sam Mendes |
0.0 |
Christoph Waltz |
Rory Kinnear |
Stephanie Sigman |
11000.0 |
393.0 |
161.0 |
3 |
3 |
The Dark Knight Rises |
2012.0 |
164.0 |
PG-13 |
Christopher Nolan |
22000.0 |
Tom Hardy |
Christian Bale |
Joseph Gordon-Levitt |
27000.0 |
23000.0 |
23000.0 |
4 |
4 |
Star Wars: Episode VII - The Force Awakens |
NaN |
NaN |
NaN |
Doug Walker |
131.0 |
Doug Walker |
Rob Walker |
NaN |
131.0 |
12.0 |
NaN |