import numpy as np
import pandas as pd
In [36]:
# Build a 3x4 frame of uniform random values scaled by 100, with labelled rows and columns.
row_labels = ['one', 'two', 'three']
col_labels = ['a', 'b', 'c', 'd']
df = pd.DataFrame(np.random.rand(3, 4) * 100, index=row_labels, columns=col_labels)

# Column indexing: a single label yields a Series, a list of labels yields a DataFrame.
for col in ('a', 'b', 'c'):
    print(df[col])
print(df[['a', 'c']])

# Row indexing with .loc[]: a single label yields a Series, a list of labels yields a DataFrame.
print(df.loc['one'])
print(df.loc[['one', 'two']])
one 74.508548
two 33.914635
three 89.037458
Name: a, dtype: float64
one 34.484117
two 36.413632
three 15.947303
Name: b, dtype: float64
one 18.774667
two 93.554980
three 6.831067
Name: c, dtype: float64
a c
one 74.508548 18.774667
two 33.914635 93.554980
three 89.037458 6.831067
a 74.508548
b 34.484117
c 18.774667
d 31.343584
Name: one, dtype: float64
a b c d
one 74.508548 34.484117 18.774667 31.343584
two 33.914635 36.413632 93.554980 31.626610
In [27]:
# Slicing.
# The note in the string below is kept verbatim. In short: iloc selects by integer
# position, while loc selects by index label.
# NOTE(review): the note says loc only takes string labels; loc accepts whatever
# label type the index holds (strings here), so read it as "label-based".
'''
iloc主要使用数字来索引数据,而不能使用字符型的标签来索引数据。而loc则刚好相反,只能使用字符型标签来索引数据,不能使用数字来索引数据,不过有特殊情况,当数据框dataframe的行标签或者列标签为数字,loc就可以来其来索引。
'''
row_labels = ['one', 'two', 'three']
col_labels = ['a', 'b', 'c', 'd']
df = pd.DataFrame(np.random.rand(3, 4) * 100, index=row_labels, columns=col_labels)
# A label slice through .loc includes both endpoints, so all three rows are returned.
df.loc['one':'three']
Out[27]:
a b c d
one 95.553289 88.567488 5.964061 46.211595
two 34.463585 75.759024 59.007925 36.239487
three 39.914204 30.086838 20.285195 50.854264
In [30]:
# Boolean-mask selection: keep cells below 30, every other cell becomes NaN.
# `df` is the frame built in an earlier cell.
below_thirty = df < 30
df.where(below_thirty)
Out[30]:
a b c d
one NaN NaN 5.964061 NaN
two NaN NaN NaN NaN
three NaN NaN 20.285195 NaN
In [35]:
# Mask built from columns 'a' and 'b' only: cells above 40 in those columns are kept,
# and columns absent from the mask come back entirely NaN.
df.where(df[['a', 'b']] > 40)
Out[35]:
a b c d
one 95.553289 88.567488 NaN NaN
two NaN 75.759024 NaN NaN
three NaN NaN NaN NaN
In [37]:
# Pick rows 'one' and 'three' of column 'a' in a single .loc[rows, column] lookup.
wanted_rows = ['one', 'three']
print(df.loc[wanted_rows, 'a'])
one 74.508548
three 89.037458
Name: a, dtype: float64
import numpy as np
import pandas as pd
In [6]:
# Topics for the cells that follow: transpose, add, modify, delete, align, sort.
df = pd.DataFrame(np.random.rand(8, 2) * 100, columns=['a', 'b'])

# First five rows, then last five rows (five is the default for both calls).
print(df.head(5))
print(df.tail(5))

# Transpose: rows become columns and columns become rows.
print(df.transpose())
a b
0 87.848371 52.160364
1 56.494315 40.510311
2 4.676018 87.834623
3 50.521130 48.060117
4 79.608305 0.083539
a b
3 50.521130 48.060117
4 79.608305 0.083539
5 27.151838 77.630977
6 31.472300 76.632293
7 6.982855 56.655203
0 1 2 3 4 5 \
a 87.848371 56.494315 4.676018 50.521130 79.608305 27.151838
b 52.160364 40.510311 87.834623 48.060117 0.083539 77.630977
6 7
a 31.472300 6.982855
b 76.632293 56.655203
In [20]:
# Add / modify
df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,columns=['a','b','c','d'])
df['e'] = 10          # new column 'e'; the scalar is broadcast to every row
df.loc[4] = 20        # new row labelled 4; every column in that row is set to 20
df[['a','c']] = 30    # overwrite columns 'a' and 'c' for all rows, including row 4

# Delete
del df['a']           # removes column 'a' from df itself
# drop() returns a new frame and leaves df untouched, so 'b' is only missing from
# the displayed result. The target axis is named by keyword: passing it
# positionally (df.drop('b', 1)) raises TypeError on pandas 2.x.
df.drop(columns='b')
Out[20]:
c d e
0 30 13.046410 10
1 30 61.349300 10
2 30 19.733048 10
3 30 6.011787 10
4 30 20.000000 20
In [29]:
# Alignment: arithmetic between two frames matches cells by row and column label.
df = pd.DataFrame(np.random.rand(4, 4), columns=list('abcd'))
df1 = pd.DataFrame(np.random.rand(4, 5), columns=list('abcde'))

# Column 'e' exists only in df1, so that column is NaN in both results.
difference = df.sub(df1)
print(difference)
total = df.add(df1)
print(total)
a b c d e
0 -0.305116 0.454676 0.026236 -0.059819 NaN
1 -0.381073 -0.171703 -0.024976 -0.082708 NaN
2 0.143118 -0.043733 0.013950 0.828276 NaN
3 0.359673 -0.584039 -0.179544 -0.793191 NaN
a b c d e
0 1.630681 1.343964 1.505061 0.488630 NaN
1 0.800414 1.428047 1.450315 0.714401 NaN
2 1.111159 0.115849 0.243141 0.830235 NaN
3 0.797655 1.219097 1.003082 0.949165 NaN
In [58]:
# Sorting by value: `ascending` chooses ascending (True) or descending (False) order.
# `df1` is the frame built in an earlier cell.
smallest_first = df1.sort_values(by=['a'], ascending=True)
print(smallest_first)
largest_first = df1.sort_values(by='a', ascending=False)
print(largest_first)
a b c d e
3 0.218991 0.901568 0.591313 0.871178 0.587248
2 0.484020 0.079791 0.114596 0.000979 0.875343
1 0.590744 0.799875 0.737645 0.398555 0.740345
0 0.967899 0.444644 0.739412 0.274224 0.782209
a b c d e
0 0.967899 0.444644 0.739412 0.274224 0.782209
1 0.590744 0.799875 0.737645 0.398555 0.740345
2 0.484020 0.079791 0.114596 0.000979 0.875343
3 0.218991 0.901568 0.591313 0.871178 0.587248
In [62]:
# Multi-column sort: rows are ordered by 'a' first, and rows sharing the same 'a'
# are then ordered by 'c'.
column_values = {
    'a': [1] * 4 + [2] * 4,
    'b': list(range(8)),
    'c': list(range(8, 0, -1)),
}
df2 = pd.DataFrame(column_values)
df2.sort_values(by=['a', 'c'])
Out[62]:
a b c
3 1 3 5
2 1 2 6
1 1 1 7
0 1 0 8
7 2 7 1
6 2 6 2
5 2 5 3
4 2 4 4
In [38]:
# Sort by the row index; ascending=True gives increasing order, False gives decreasing.
# `df1` is the frame built in an earlier cell.
df1.sort_index(axis=0, ascending=True)
Out[38]:
a b c d e
0 0.967899 0.444644 0.739412 0.274224 0.782209
1 0.590744 0.799875 0.737645 0.398555 0.740345
2 0.484020 0.079791 0.114596 0.000979 0.875343
3 0.218991 0.901568 0.591313 0.871178 0.587248
In [64]:
# inplace=True reorders df1 itself instead of returning a sorted copy.
# The flag has to be a real bool: pandas validates it and raises ValueError for
# an int such as 1.
df1.sort_index(inplace=True)
df1
Out[64]:
a b c d e
0 0.967899 0.444644 0.739412 0.274224 0.782209
1 0.590744 0.799875 0.737645 0.398555 0.740345
2 0.484020 0.079791 0.114596 0.000979 0.875343
3 0.218991 0.901568 0.591313 0.871178 0.587248
In [68]:
# Exercise: a 3x3 frame of normally distributed values scaled by 100,
# displayed with rows ordered by ascending 'v2'.
df = pd.DataFrame(
    np.random.randn(3, 3) * 100,
    index=list('abc'),
    columns=['v1', 'v2', 'v3'],
)
df.sort_values('v2', ascending=True)
Out[68]:
v1 v2 v3
c 105.913696 -179.710097 120.055654
b -36.379390 -156.621727 -25.782565
a -67.891968 -41.681334 -30.058927
In [76]:
# A 5x2 frame of normally distributed values scaled by 100, shown transposed
# so that the row labels a..e become the columns.
df = pd.DataFrame(
    np.random.randn(5, 2) * 100,
    index=list('abcde'),
    columns=['v1', 'v2'],
)
df.transpose()
Out[76]:
a b c d e
v1 -56.708755 147.197783 91.484936 -41.872528 -17.752307
v2 -4.279827 33.372191 -27.462034 -103.270048 -132.076251
In [81]:
# Note: despite its name, `df` is a Series here (values 0..9 labelled 'a'..'j').
df = pd.Series(np.arange(10),index=['a','b','c','d','e','f','g','h','i','j'])
# Assign to several labels at once by passing them as a list. Writing
# df.loc['a','b','c'] forms a tuple, which .loc treats as one key per axis and
# rejects for a one-dimensional Series ("Too many indexers").
df.loc[['a','b','c']] = 100
df
Out[81]:
a 100
b 100
c 100
d 3
e 4
f 5
g 6
h 7
i 8
j 9
dtype: int32
In [84]:
# Series arithmetic aligns on index labels: labels found in both operands are
# summed, labels found in only one of them give NaN.
df = pd.Series(np.arange(5), index=list('abcde'))
df1 = pd.Series(np.arange(5), index=list('cdefg'))
df.add(df1)
Out[84]:
a NaN
b NaN
c 2.0
d 4.0
e 6.0
f NaN
g NaN
dtype: float64