运行环境:Jupyter Notebook
import matplotlib.pyplot as plt
from pandas import DataFrame,Series
import pandas as pd
import numpy as np
1. 导入数据并查看
# Load the baby-names CSV into a DataFrame and preview the first rows
# (head() returns 5 rows by default).
data01 = pd.read_csv(
    "datasets/US_Baby_Names_right.csv",
)
data01.head()
|   | Unnamed: 0 | Id | Name | Year | Gender | State | Count |
|---|---|---|---|---|---|---|---|
| 0 | 11349 | 11350 | Emma | 2004 | F | AK | 62 |
| 1 | 11350 | 11351 | Madison | 2004 | F | AK | 48 |
| 2 | 11351 | 11352 | Hannah | 2004 | F | AK | 46 |
| 3 | 11352 | 11353 | Grace | 2004 | F | AK | 44 |
| 4 | 11353 | 11354 | Emily | 2004 | F | AK | 41 |
2. 删除 Unnamed: 0 列和 Id 列(两种方法)
# Method 1: DataFrame.drop returns a new frame without the 'Unnamed: 0' and
# 'Id' columns; data01 itself is not modified.
data02 = data01.drop(columns=['Unnamed: 0', 'Id'])
data02.head()
# Method 2: the del statement removes the same two columns from data01 in place.
del data01['Id']
del data01['Unnamed: 0']
|   | Name | Year | Gender | State | Count |
|---|---|---|---|---|---|
| 0 | Emma | 2004 | F | AK | 62 |
| 1 | Madison | 2004 | F | AK | 48 |
| 2 | Hannah | 2004 | F | AK | 46 |
| 3 | Grace | 2004 | F | AK | 44 |
| 4 | Emily | 2004 | F | AK | 41 |
3 判断出数据集中 男孩名字、女孩名字谁多
# Count how many rows carry each value of the Gender column.
Gender_data = data01.loc[:, "Gender"].value_counts()
Gender_data
输出:
F 558846
M 457549
Name: Gender, dtype: int64
# Compare the per-gender row counts held in Gender_data and report which
# gender has more records.
a = Gender_data['F']  # number of rows labelled female
b = Gender_data['M']  # number of rows labelled male
if a > b:
    print("女孩数量大于男孩")
elif a < b:
    print("男孩数量大于女孩")
else:
    # Equal counts previously fell into the "boys greater" branch.
    print("男孩数量等于女孩")
输出:
女孩数量大于男孩
4 按照 Name 字段将数据集进行分组 并求和赋值给变量 names
# Group the records by Name and total the numeric columns for each name.
# numeric_only=True restricts the sum to numeric columns; without it, newer
# pandas versions also try to "sum" text columns such as Gender and State.
# NOTE(review): the summed Year column is kept only because it is numeric;
# the total that matters for the following steps is Count.
names = data01.groupby('Name').sum(numeric_only=True)
names.head()
输出:
| Name | Year | Count |
|---|---|---|
| Aaban | 4027 | 12 |
| Aadan | 8039 | 23 |
| Aadarsh | 2009 | 5 |
| Aaden | 393963 | 3426 |
| Aadhav | 2014 | 6 |
5 按照每个名字被使用的次数(Count)对上步中结果进行降序排序
names.sort_values(by='Count', ascending=False).head()
6 在数据集中,共出现了多少个名字?(不包含重复项,至少使用两种方法)
# Keep only the first row seen for each distinct Name, then preview the result
# (head() returns 5 rows by default).
data03 = data01.drop_duplicates(subset=['Name'])
data03.head(5)
|   | Unnamed: 0 | Id | Name | Year | Gender | State | Count | gender_M |
|---|---|---|---|---|---|---|---|---|
| 0 | 11349 | 11350 | Emma | 2004 | F | AK | 62 | 0 |
| 1 | 11350 | 11351 | Madison | 2004 | F | AK | 48 | 0 |
| 2 | 11351 | 11352 | Hannah | 2004 | F | AK | 46 | 0 |
| 3 | 11352 | 11353 | Grace | 2004 | F | AK | 44 | 0 |
| 4 | 11353 | 11354 | Emily | 2004 | F | AK | 41 | 0 |