Python之Pandas（2）

import numpy as np
import pandas as pd
In [36]:

# Build a 3x4 frame of random values scaled by 100, with string row labels.
df = pd.DataFrame(
    np.random.rand(3, 4) * 100,
    index=['one', 'two', 'three'],
    columns=['a', 'b', 'c', 'd'],
)

# Column selection: a single label gives a Series, a list of labels gives a DataFrame.
print(df['a'], df['b'], df['c'], sep='\n')
print(df[['a', 'c']])

# Row selection by label with .loc[]: one label gives a Series, a list gives a DataFrame.
print(df.loc['one'])
print(df.loc[['one', 'two']])
one      74.508548
two      33.914635
three    89.037458
Name: a, dtype: float64
one      34.484117
two      36.413632
three    15.947303
Name: b, dtype: float64
one      18.774667
two      93.554980
three     6.831067
Name: c, dtype: float64
               a          c
one    74.508548  18.774667
two    33.914635  93.554980
three  89.037458   6.831067
a    74.508548
b    34.484117
c    18.774667
d    31.343584
Name: one, dtype: float64
             a          b          c          d
one  74.508548  34.484117  18.774667  31.343584
two  33.914635  36.413632  93.554980  31.626610
In [27]:

# Slicing
# .loc selects by *label*: it matches whatever values the index actually holds
# (strings in this frame, but integer labels work the same way) and also accepts
# boolean masks. .iloc selects by integer *position* only, whatever the labels are.
# A label slice with .loc includes BOTH endpoints, so 'one':'three' returns all
# three rows, whereas a positional .iloc slice excludes its stop position.
df = pd.DataFrame(
    np.random.rand(12).reshape(3, 4) * 100,
    index=['one', 'two', 'three'],
    columns=['a', 'b', 'c', 'd'],
)

df.loc['one':'three']


Out[27]:
a	b	c	d
one	95.553289	88.567488	5.964061	46.211595
two	34.463585	75.759024	59.007925	36.239487
three	39.914204	30.086838	20.285195	50.854264
In [30]:

# Boolean masking: cells below 30 are kept, every other cell is shown as NaN.
df.where(df < 30)
Out[30]:
a	b	c	d
one	NaN	NaN	5.964061	NaN
two	NaN	NaN	NaN	NaN
three	NaN	NaN	20.285195	NaN
In [35]:

# The mask covers only columns 'a' and 'b'; columns missing from the mask come back as NaN.
df.where(df[['a', 'b']] > 40)
Out[35]:
a	b	c	d
one	95.553289	88.567488	NaN	NaN
two	NaN	75.759024	NaN	NaN
three	NaN	NaN	NaN	NaN
In [37]:

# Pick the rows and the column in a single .loc[rows, column] call rather than
# chaining df['a'].loc[...]; the result is the same Series named 'a'.
print(df.loc[['one', 'three'], 'a'])
one      74.508548
three    89.037458
Name: a, dtype: float64

import numpy as np
import pandas as pd
In [6]:

# Topics in this part: transpose, add, modify, delete, align, sort
df = pd.DataFrame(np.random.rand(8, 2) * 100, columns=['a', 'b'])

# First five rows
print(df.head(n=5))
# Last five rows
print(df.tail(n=5))

# Transpose: rows become columns and columns become rows
print(df.transpose())
           a          b
0  87.848371  52.160364
1  56.494315  40.510311
2   4.676018  87.834623
3  50.521130  48.060117
4  79.608305   0.083539
           a          b
3  50.521130  48.060117
4  79.608305   0.083539
5  27.151838  77.630977
6  31.472300  76.632293
7   6.982855  56.655203
           0          1          2          3          4          5  \
a  87.848371  56.494315   4.676018  50.521130  79.608305  27.151838   
b  52.160364  40.510311  87.834623  48.060117   0.083539  77.630977   

           6          7  
a  31.472300   6.982855  
b  76.632293  56.655203  
In [20]:

# Add / modify
df = pd.DataFrame(np.random.rand(16).reshape(4, 4) * 100, columns=['a', 'b', 'c', 'd'])
df['e'] = 10          # new column 'e': the scalar is broadcast to every row
df.loc[4] = 20        # label 4 does not exist yet, so this appends a row filled with 20
df[['a', 'c']] = 30   # overwrite every value in columns 'a' and 'c' (new row included)

# Delete
del df['a']           # removes column 'a' from df itself
# drop() returns a new frame and leaves df unchanged. The axis has to be named by
# keyword: the positional form df.drop('b', 1) was removed in pandas 2.0.
df.drop(columns='b')
Out[20]:
c	d	e
0	30	13.046410	10
1	30	61.349300	10
2	30	19.733048	10
3	30	6.011787	10
4	30	20.000000	20
In [29]:

# Alignment: arithmetic pairs values up by row and column label; a label that
# exists in only one operand (column 'e' here) produces NaN.
df = pd.DataFrame(np.random.rand(4, 4), columns=list('abcd'))
df1 = pd.DataFrame(np.random.rand(4, 5), columns=list('abcde'))

print(df.sub(df1))
print(df.add(df1))
          a         b         c         d   e
0 -0.305116  0.454676  0.026236 -0.059819 NaN
1 -0.381073 -0.171703 -0.024976 -0.082708 NaN
2  0.143118 -0.043733  0.013950  0.828276 NaN
3  0.359673 -0.584039 -0.179544 -0.793191 NaN
          a         b         c         d   e
0  1.630681  1.343964  1.505061  0.488630 NaN
1  0.800414  1.428047  1.450315  0.714401 NaN
2  1.111159  0.115849  0.243141  0.830235 NaN
3  0.797655  1.219097  1.003082  0.949165 NaN
In [58]:

# Sorting
# By value: ascending=True orders low-to-high, ascending=False high-to-low
print(df1.sort_values(by='a', ascending=True))
print(df1.sort_values(by=['a'], ascending=False))
          a         b         c         d         e
3  0.218991  0.901568  0.591313  0.871178  0.587248
2  0.484020  0.079791  0.114596  0.000979  0.875343
1  0.590744  0.799875  0.737645  0.398555  0.740345
0  0.967899  0.444644  0.739412  0.274224  0.782209
          a         b         c         d         e
0  0.967899  0.444644  0.739412  0.274224  0.782209
1  0.590744  0.799875  0.737645  0.398555  0.740345
2  0.484020  0.079791  0.114596  0.000979  0.875343
3  0.218991  0.901568  0.591313  0.871178  0.587248
In [62]:

# Multi-column sort: rows are ordered by 'a' first; rows sharing the same 'a' are then ordered by 'c'
df2 = pd.DataFrame({
    'a': [1] * 4 + [2] * 4,
    'b': list(range(8)),
    'c': list(range(8, 0, -1)),
})
df2.sort_values(by=['a', 'c'])
Out[62]:
a	b	c
3	1	3	5
2	1	2	6
1	1	1	7
0	1	0	8
7	2	7	1
6	2	6	2
5	2	5	3
4	2	4	4
In [38]:

# Sort rows by their index labels; pass ascending=False for descending order
df1.sort_index(axis=0, ascending=True)
Out[38]:
a	b	c	d	e
0	0.967899	0.444644	0.739412	0.274224	0.782209
1	0.590744	0.799875	0.737645	0.398555	0.740345
2	0.484020	0.079791	0.114596	0.000979	0.875343
3	0.218991	0.901568	0.591313	0.871178	0.587248
In [64]:

# inplace=True reorders df1 itself and returns None instead of a new frame.
# The flag must be a real bool: pandas validates it and rejects inplace=1
# with a ValueError.
df1.sort_index(inplace=True)
df1
Out[64]:
a	b	c	d	e
0	0.967899	0.444644	0.739412	0.274224	0.782209
1	0.590744	0.799875	0.737645	0.398555	0.740345
2	0.484020	0.079791	0.114596	0.000979	0.875343
3	0.218991	0.901568	0.591313	0.871178	0.587248
In [68]:

# Exercise: sort a 3x3 frame of random normal values (scaled by 100) by column 'v2', ascending
df = pd.DataFrame(np.random.randn(3, 3) * 100, index=list('abc'), columns=['v1', 'v2', 'v3'])
df.sort_values('v2', ascending=True)
Out[68]:
v1	v2	v3
c	105.913696	-179.710097	120.055654
b	-36.379390	-156.621727	-25.782565
a	-67.891968	-41.681334	-30.058927
In [76]:

# Build a 5x2 frame, then show its transpose (2 rows, 5 columns)
df = pd.DataFrame(np.random.randn(5, 2) * 100, index=list('abcde'), columns=['v1', 'v2'])
df.transpose()
Out[76]:
a	b	c	d	e
v1	-56.708755	147.197783	91.484936	-41.872528	-17.752307
v2	-4.279827	33.372191	-27.462034	-103.270048	-132.076251
In [81]:

# Note: df is a Series in this cell (the variable name is reused).
df = pd.Series(np.arange(10), index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'])
# Several labels must be passed as ONE list. df.loc['a','b','c'] is a 3-tuple key,
# which pandas treats as one indexer per axis and rejects on a 1-D Series
# ("Too many indexers").
df.loc[['a', 'b', 'c']] = 100
df
Out[81]:
a    100
b    100
c    100
d      3
e      4
f      5
g      6
h      7
i      8
j      9
dtype: int32
In [84]:

# Series arithmetic aligns on index labels: only 'c', 'd' and 'e' exist on both
# sides, so every other label in the result is NaN and the result dtype is float.
df = pd.Series(np.arange(5), index=list('abcde'))
df1 = pd.Series(np.arange(5), index=list('cdefg'))
df.add(df1)
Out[84]:
a    NaN
b    NaN
c    2.0
d    4.0
e    6.0
f    NaN
g    NaN
dtype: float64

猜你喜欢