pandas
start
pandas_Series
import numpy as np
from pandas import Series,DataFrame
声明一个Series类型的数据
sr = Series(np.random.normal(175,size=10),index=list('abcdefghij'))
sr
- a 177.955666
b 175.659703
c 174.452552
d 175.730355
e 173.443535
f 175.653806
g 174.199460
h 174.011293
i 175.902427
j 176.296591
dtype: float64
sr['a']
sr[1]
sr.loc['a']
sr.iloc[1]
sr[['a','b']]
sr[[1,2]]
sr.loc[['a','b']]
sr.iloc[[1,2]]
pandas_DataFrame
声明与定义
df = DataFrame(data = np.random.randint(0,150,size=(10,3)),index=list('ABCDEFGHIJ'),columns=['Python','Math','Chinese'])
# 列是属性(特征),行是样本
df2 = DataFrame(data={
'python':np.random.randint(0,150,size = 10),
'xiaoming':np.random.randint(0,150,size = 10),
'xiaohong':np.random.randint(0,150,size = 10),
})
df3 = DataFrame(data={
'python':np.random.randint(0,150,size = 10),
'xiaoming':np.random.randint(0,150,size = 10),
'xiaohong':np.random.randint(0,150,size = 10),
},
index = list('xiaobingab')
)
df4 = DataFrame(data={
'python':np.random.randint(0,150,size = 10),
'xiaoming':np.random.randint(0,150,size = 10),
'xiaohong':np.random.randint(0,150,size = 10),
},
index = list('xiaobingab'),
columns = ['A', 'xiaoming', 'C', 'D']
)
display(df2,df3,df4) # 行索引不指定,则用自然数填充
查询
display(type(df2['python']))
df2['python'] # pandas.core.series.Series
df2[['python']] # 仍然是DataFrame
df2[['python','xiaoming']] # 相当于定向的切片
df2.python
df2.loc[1] # 这里的1是index索引名
df2.iloc[1] # 这里的1是索引顺序为1(从0开始)
df4 = df2.transpose() # 交换index 与 column (转置)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
----- | ---- | ---- | ---- | ---- | ---- | --- | ---- | ---- | ---- | ----
python | 17 | 110 | 115| 79| 52| 121| 137| 41| 29 |37
xiaoming |41| 104 |48 |96| 108| 85| 148| 42| 40 |1
xiaohong |128 |48| 0| 96 |5 |115| 23| 64| 51| 66
df4[2]['python'] # 115
df4.loc['python'][1] # 110
df4.loc['python',1] # 110
df4.iloc[2,1] # 48
data = df4.values
data
- array([[ 17, 110, 115, 79, 52, 121, 137, 41, 29, 37],
[ 41, 104, 48, 96, 108, 85, 148, 42, 40, 1],
[128, 48, 0, 96, 5, 115, 23, 64, 51, 66]])
切片
df5 = df4.iloc[:,2:]
df5 = df4.loc[:,0:5]
df4['python':'xiaoming'] # *** 行切片
df4[1:2]
df1 = DataFrame(np.random.normal(100,scale=20,size=(6,3)),index = list('zxcvnm'),columns=['qq','wechat','email'])
df1
- qq wechat email
z 92.386231 88.978414 115.526063
x 110.847067 105.462850 131.447248
c 88.559184 75.191029 113.287049
v 82.812931 105.350996 118.224760
n 106.242152 114.609146 124.179372
m 100.107778 85.918584 74.234945
df2 = df.add(df1,fill_value=0)
df2
- Chinese Math Python email qq wechat
A 84.0 130.0 95.0 NaN NaN NaN
B 39.0 69.0 149.0 NaN NaN NaN
C 71.0 16.0 146.0 NaN NaN NaN
D 89.0 131.0 7.0 NaN NaN NaN
E 57.0 30.0 1.0 NaN NaN NaN
F 89.0 85.0 55.0 NaN NaN NaN
G 22.0 134.0 99.0 NaN NaN NaN
H 20.0 40.0 28.0 NaN NaN NaN
I 90.0 95.0 106.0 NaN NaN NaN
J 29.0 54.0 106.0 NaN NaN NaN
c NaN NaN NaN 113.287049 88.559184 75.191029
m NaN NaN NaN 74.234945 100.107778 85.918584
n NaN NaN NaN 124.179372 106.242152 114.609146
v NaN NaN NaN 118.224760 82.812931 105.350996
x NaN NaN NaN 131.447248 110.847067 105.462850
z NaN NaN NaN 115.526063 92.386231 88.978414
df2['A':'E'] = df2.loc['A':'E']/2 +25
Python Operator | Pandas Method(s) |
---|---|
+ | add() |
- | sub(), subtract() |
* | mul(), multiply() |
/ | truediv(), div(), divide() |
// | floordiv() |
% | mod() |
** | pow() |
NaN
type(np.NaN) # float
a = np.array([1,3,np.NaN,np.NaN,6])
a
- array([ 1., 3., nan, nan, 6.])
np.sum(a) # nan
np.nansum(a) # 10 此方法会忽略nan 值
DataFrame中的NaN
df = DataFrame(np.random.normal(100,scale=20,size=(10,3)),index=list('ABCDEFGHIJ'),columns=['Python','Math','Eng'])
df
- Python Math Eng
A 107.949245 113.838169 83.597526
B 87.672478 110.768434 70.705457
C 115.130985 117.322252 96.552291
D 124.733534 104.409723 67.797502
E 69.930626 113.678532 107.663599
F 96.381046 92.938791 98.805056
G 95.943738 115.415718 131.666832
H 83.262271 112.149921 98.384134
I 129.727339 102.010093 107.216974
J 101.591134 106.435605 88.042077
df['Python']['F'] = np.NaN
df.iloc[8,2] = np.NaN
df.loc['C','Math'] = np.NaN
df.isnull().any()
- Python True
Math True
Eng True
dtype: bool
df.isnull().any(axis = 1) # 判断哪一行有空数据
- A False
B False
C True
D False
E False
F True
G False
H False
I True
J False
dtype: bool
cond = df.notnull().all(axis=1)
df[cond]
- 所有不含空值(NaN)的行
- Python Math Eng
A 107.949245 113.838169 83.597526
B 87.672478 110.768434 70.705457
D 124.733534 104.409723 67.797502
E 69.930626 113.678532 107.663599
G 95.943738 115.415718 131.666832
H 83.262271 112.149921 98.384134
J 101.591134 106.435605 88.042077
pandas中None与np.nan都被视作缺失值(isnull()对两者都返回True;在数值列中None会被自动转换为np.nan)
fillna() 与 dropna()
df.dropna() # 过滤丢失数据的样本
- Python Math Eng
A 107.949245 113.838169 83.597526
B 87.672478 110.768434 70.705457
D 124.733534 104.409723 67.797502
E 69.930626 113.678532 107.663599
G 95.943738 115.415718 131.666832
H 83.262271 112.149921 98.384134
J 101.591134 106.435605 88.042077
df.fillna(value=df.mean()) # 填充平均值
- Python | Math | Eng
A 107.949245 113.838169 83.597526
B 87.672478 110.768434 70.705457
C 115.130985 107.960554 96.552291
D 124.733534 104.409723 67.797502
E 69.930626 113.678532 107.663599
F 101.771261 92.938791 98.805056
G 95.943738 115.415718 131.666832
H 83.262271 112.149921 98.384134
I 129.727339 102.010093 93.690497
J 101.591134 106.435605 88.042077
当然也可以填充中位数等其他统计量
df.median() # 中位数
Python 101.6
Math 110.8
Eng 96.6
dtype: float64