Pandas库基础
底层是基于numpy实现的
1.1读取csv文件
import pandas as pd
# Read the food nutrition CSV into a DataFrame and confirm the resulting type.
food_info = pd.read_csv(
    r"F:\唐宇迪机器学习资料\机器学习\Python库代码(4个)\2-数据分析处理库pandas\food_info.csv"
)
print(type(food_info))
# Optional inspection helpers, left disabled:
# print(food_info.dtypes)
# print(help(pd.read_csv))
<class 'pandas.core.frame.DataFrame'>
其中Shrt_Desc是object类型,可以看作string类型
其中对应的数据类型如下
object–string
int–int
float–float
datetime–time value
bool–bool
food_info.head()  # preview the first rows; with no argument, 5 rows are shown
NDB_No | Shrt_Desc | Water_(g) | Energ_Kcal | Protein_(g) | Lipid_Tot_(g) | Ash_(g) | Carbohydrt_(g) | Fiber_TD_(g) | Sugar_Tot_(g) | ... | Vit_A_IU | Vit_A_RAE | Vit_E_(mg) | Vit_D_mcg | Vit_D_IU | Vit_K_(mcg) | FA_Sat_(g) | FA_Mono_(g) | FA_Poly_(g) | Cholestrl_(mg) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1001 | BUTTER WITH SALT | 15.87 | 717 | 0.85 | 81.11 | 2.11 | 0.06 | 0.0 | 0.06 | ... | 2499.0 | 684.0 | 2.32 | 1.5 | 60.0 | 7.0 | 51.368 | 21.021 | 3.043 | 215.0 |
1 | 1002 | BUTTER WHIPPED WITH SALT | 15.87 | 717 | 0.85 | 81.11 | 2.11 | 0.06 | 0.0 | 0.06 | ... | 2499.0 | 684.0 | 2.32 | 1.5 | 60.0 | 7.0 | 50.489 | 23.426 | 3.012 | 219.0 |
2 | 1003 | BUTTER OIL ANHYDROUS | 0.24 | 876 | 0.28 | 99.48 | 0.00 | 0.00 | 0.0 | 0.00 | ... | 3069.0 | 840.0 | 2.80 | 1.8 | 73.0 | 8.6 | 61.924 | 28.732 | 3.694 | 256.0 |
3 | 1004 | CHEESE BLUE | 42.41 | 353 | 21.40 | 28.74 | 5.11 | 2.34 | 0.0 | 0.50 | ... | 721.0 | 198.0 | 0.25 | 0.5 | 21.0 | 2.4 | 18.669 | 7.778 | 0.800 | 75.0 |
4 | 1005 | CHEESE BRICK | 41.11 | 371 | 23.24 | 29.68 | 3.18 | 2.79 | 0.0 | 0.51 | ... | 1080.0 | 292.0 | 0.26 | 0.5 | 22.0 | 2.5 | 18.764 | 8.598 | 0.784 | 94.0 |
5 rows × 36 columns
把刚刚读取的数据部分显示一下:head()默认显示前5条数据,如果想显示前三条,即在括号里写3
food_info.head(3)  # preview only the first 3 rows
NDB_No | Shrt_Desc | Water_(g) | Energ_Kcal | Protein_(g) | Lipid_Tot_(g) | Ash_(g) | Carbohydrt_(g) | Fiber_TD_(g) | Sugar_Tot_(g) | ... | Vit_A_IU | Vit_A_RAE | Vit_E_(mg) | Vit_D_mcg | Vit_D_IU | Vit_K_(mcg) | FA_Sat_(g) | FA_Mono_(g) | FA_Poly_(g) | Cholestrl_(mg) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1001 | BUTTER WITH SALT | 15.87 | 717 | 0.85 | 81.11 | 2.11 | 0.06 | 0.0 | 0.06 | ... | 2499.0 | 684.0 | 2.32 | 1.5 | 60.0 | 7.0 | 51.368 | 21.021 | 3.043 | 215.0 |
1 | 1002 | BUTTER WHIPPED WITH SALT | 15.87 | 717 | 0.85 | 81.11 | 2.11 | 0.06 | 0.0 | 0.06 | ... | 2499.0 | 684.0 | 2.32 | 1.5 | 60.0 | 7.0 | 50.489 | 23.426 | 3.012 | 219.0 |
2 | 1003 | BUTTER OIL ANHYDROUS | 0.24 | 876 | 0.28 | 99.48 | 0.00 | 0.00 | 0.0 | 0.00 | ... | 3069.0 | 840.0 | 2.80 | 1.8 | 73.0 | 8.6 | 61.924 | 28.732 | 3.694 | 256.0 |
3 rows × 36 columns
想要输出末尾几行则用food_info.tail()
food_info.tail(4)  # show the last 4 rows
NDB_No | Shrt_Desc | Water_(g) | Energ_Kcal | Protein_(g) | Lipid_Tot_(g) | Ash_(g) | Carbohydrt_(g) | Fiber_TD_(g) | Sugar_Tot_(g) | ... | Vit_A_IU | Vit_A_RAE | Vit_E_(mg) | Vit_D_mcg | Vit_D_IU | Vit_K_(mcg) | FA_Sat_(g) | FA_Mono_(g) | FA_Poly_(g) | Cholestrl_(mg) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
8614 | 90240 | SCALLOP (BAY&SEA) CKD STMD | 70.25 | 111 | 20.54 | 0.84 | 2.97 | 5.41 | 0.0 | 0.0 | ... | 5.0 | 2.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.218 | 0.082 | 0.222 | 41.0 |
8615 | 90480 | SYRUP CANE | 26.00 | 269 | 0.00 | 0.00 | 0.86 | 73.14 | 0.0 | 73.2 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000 | 0.000 | 0.000 | 0.0 |
8616 | 90560 | SNAIL RAW | 79.20 | 90 | 16.10 | 1.40 | 1.30 | 2.00 | 0.0 | 0.0 | ... | 100.0 | 30.0 | 5.0 | 0.0 | 0.0 | 0.1 | 0.361 | 0.259 | 0.252 | 50.0 |
8617 | 93600 | TURTLE GREEN RAW | 78.50 | 89 | 19.80 | 0.50 | 1.20 | 0.00 | 0.0 | 0.0 | ... | 100.0 | 30.0 | 0.5 | 0.0 | 0.0 | 0.1 | 0.127 | 0.088 | 0.170 | 50.0 |
4 rows × 36 columns
food_info.columns  # column labels, returned as an Index
Index(['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)',
'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)',
'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)',
'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)',
'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)',
'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)',
'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg',
'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)',
'Cholestrl_(mg)'],
dtype='object')
food_info.shape  # dimensions as (number of rows, number of columns)
(8618, 36)
1.2运用切片取数据
food_info.loc[3:6]  # label-based slice: unlike list slicing, BOTH endpoints are included (rows 3, 4, 5 and 6)
NDB_No | Shrt_Desc | Water_(g) | Energ_Kcal | Protein_(g) | Lipid_Tot_(g) | Ash_(g) | Carbohydrt_(g) | Fiber_TD_(g) | Sugar_Tot_(g) | ... | Vit_A_IU | Vit_A_RAE | Vit_E_(mg) | Vit_D_mcg | Vit_D_IU | Vit_K_(mcg) | FA_Sat_(g) | FA_Mono_(g) | FA_Poly_(g) | Cholestrl_(mg) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3 | 1004 | CHEESE BLUE | 42.41 | 353 | 21.40 | 28.74 | 5.11 | 2.34 | 0.0 | 0.50 | ... | 721.0 | 198.0 | 0.25 | 0.5 | 21.0 | 2.4 | 18.669 | 7.778 | 0.800 | 75.0 |
4 | 1005 | CHEESE BRICK | 41.11 | 371 | 23.24 | 29.68 | 3.18 | 2.79 | 0.0 | 0.51 | ... | 1080.0 | 292.0 | 0.26 | 0.5 | 22.0 | 2.5 | 18.764 | 8.598 | 0.784 | 94.0 |
5 | 1006 | CHEESE BRIE | 48.42 | 334 | 20.75 | 27.68 | 2.70 | 0.45 | 0.0 | 0.45 | ... | 592.0 | 174.0 | 0.24 | 0.5 | 20.0 | 2.3 | 17.410 | 8.013 | 0.826 | 100.0 |
6 | 1007 | CHEESE CAMEMBERT | 51.80 | 300 | 19.80 | 24.26 | 3.68 | 0.46 | 0.0 | 0.46 | ... | 820.0 | 241.0 | 0.21 | 0.4 | 18.0 | 2.0 | 15.259 | 7.023 | 0.724 | 72.0 |
4 rows × 36 columns
现在要用列名来取数据,第一行为列名
# Index the DataFrame with a single column label to pull out that column.
ndb = food_info["NDB_No"]
print(ndb)
0 1001
1 1002
2 1003
3 1004
4 1005
...
8613 83110
8614 90240
8615 90480
8616 90560
8617 93600
Name: NDB_No, Length: 8618, dtype: int64
如果想要取多个列,将这些列名组成一个list传入即可,例如 food_info[["Zinc_(mg)","Copper_(mg)"]]
1.3进行数学运算
print(food_info["Iron_(mg)"])
# Dividing a column by a scalar is applied to every element of the column.
div_1000 = food_info["Iron_(mg)"] / 1000
print(div_1000)
0 0.02
1 0.16
2 0.00
3 0.31
4 0.43
...
8613 1.40
8614 0.58
8615 3.60
8616 3.50
8617 1.40
Name: Iron_(mg), Length: 8618, dtype: float64
0 0.00002
1 0.00016
2 0.00000
3 0.00031
4 0.00043
...
8613 0.00140
8614 0.00058
8615 0.00360
8616 0.00350
8617 0.00140
Name: Iron_(mg), Length: 8618, dtype: float64
# Arithmetic between two columns is carried out row by row.
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]

# Convert iron from milligrams to grams and attach it as a new column;
# the two shape printouts show the column count before and after.
iron_grams = food_info["Iron_(mg)"] / 1000
print(food_info.shape)
food_info["Iron_(g)"] = iron_grams
print(food_info.shape)
(8618, 36)
(8618, 37)
对特定的列求最值/均值:.max(),.mean(),.min()
import pandas as pd
# Load the Titanic training set and pull out its "Age" column.
titanic_train = pd.read_csv(
    r'F:\唐宇迪机器学习资料\机器学习\Python库代码(4个)\2-数据分析处理库pandas\titanic_train.csv'
)
age = titanic_train["Age"]
print(age)
0 22.0
1 38.0
2 26.0
3 35.0
4 35.0
...
886 27.0
887 19.0
888 NaN
889 26.0
890 32.0
Name: Age, Length: 891, dtype: float64
# pd.isnull returns a boolean Series: True where the value is missing (NaN).
print(pd.isnull(age))
# NOTE: this is the length of the whole mask (one entry per row), not the
# number of missing values; pd.isnull(age).sum() would count the True entries.
len(pd.isnull(age))
0 False
1 False
2 False
3 False
4 False
...
886 False
887 False
888 True
889 False
890 False
Name: Age, Length: 891, dtype: bool
891
# The True/False mask printed above can be used as a boolean index,
# e.g. age[pd.isnull(age)] selects only the missing entries.
titanic_train['Age'].mean()  # mean() skips NaN values by default
29.69911764705882
分组求和/分类别求和
import numpy as np
titanic_survival = titanic_train["Survived"]
# Survival rate per passenger class: the mean of "Survived" within each
# Pclass (assuming "Survived" is a 0/1 flag, the mean is the fraction of
# passengers who survived).
# The aggregation is passed by name because pandas >= 2.1 emits a
# FutureWarning when given the NumPy callable np.mean.
passenger_survival = titanic_train.pivot_table(
    index="Pclass", values="Survived", aggfunc="mean"
)
print(passenger_survival)
# When aggfunc is omitted, pivot_table computes the mean by default.
Survived
Pclass
1 0.629630
2 0.472826
3 0.242363
每一个Pclass都有对应的获救率
# Total fare and total "Survived" per embarkation port. Other aggregations
# (e.g. "mean") can be grouped the same way.
# The aggregation is passed by name because pandas >= 2.1 emits a
# FutureWarning when given the NumPy callable np.sum; wrapping it in a list
# keeps the extra top-level "sum" header in the printed result.
port_stats = titanic_train.pivot_table(
    index="Embarked", values=["Fare", "Survived"], aggfunc=["sum"]
)
print(port_stats)
sum
Fare Survived
Embarked
C 10072.2962 93
Q 1022.2543 30
S 17439.3988 217
# dropna removes rows or columns that contain missing values; selecting the
# column axis drops every column with at least one NaN and keeps all rows.
drop_na_colnums = titanic_train.dropna(axis="columns")
print(drop_na_colnums.shape)
(891, 9)