1.merge用来连接两个DataFrame对象,参数on选择基于哪个列进行连接,how选择内连接,左连接还是全连接的方式。merge操作类似于SQL中的join操作。
- import pandas as pd
- df1 = pd.DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})
- df2 = pd.DataFrame({'key':['a','b','a','b','d'],'data2':range(5)})
- print df1
- print df2
- # data1 key
- # 0 0 b
- # 1 1 b
- # 2 2 a
- # 3 3 c
- # 4 4 a
- # 5 5 b
- # data2 key
- # 0 0 a
- # 1 1 b
- # 2 2 a
- # 3 3 b
- # 4 4 d
- df3 = pd.merge(df1,df2,on = 'key',how = 'left')
- print df3
- # data1 key data2
- # 0 0 b 1.0
- # 1 0 b 3.0
- # 2 1 b 1.0
- # 3 1 b 3.0
- # 4 2 a 0.0
- # 5 2 a 2.0
- # 6 3 c NaN
- # 7 4 a 0.0
- # 8 4 a 2.0
- # 9 5 b 1.0
- # 10 5 b 3.0
2.对于大数据,很可能要对于其中的部分列进行聚合,这里使用groupby
- import numpy as np
- import pandas as pd
- df = pd.DataFrame({'key1':['a','a','b','b','a'],
- 'key2':['one','two','one','two','one'],
- 'data1':np.random.randn(5),
- 'data2':np.random.randn(5)})
- print df
- # data1 data2 key1 key2
- # 0 0.336673 0.540336 a one
- # 1 -0.821839 -1.348654 a two
- # 2 1.066305 0.230884 b one
- # 3 0.788950 -0.540482 b two
- # 4 -0.872019 -0.813607 a one
- df1 = df[['data1','data2']].groupby(df['key1']).sum()
- print df1
- # data1 data2
- # key1
- # a 0.827444 1.512656
- # b -0.060730 -0.461793
- print df.groupby('key1')[['data1','data2']].sum() #简便写法
- # data1 data2
- # key1
- # a 0.827444 1.512656
- # b -0.060730 -0.461793
- import numpy as np
- import pandas as pd
- df = pd.DataFrame({'key1':['a','a','b','b','a'],
- 'key2':['one','two','one','two','one'],
- 'data1':np.random.randn(5),
- 'data2':np.random.randn(5)})
- print df
- # data1 data2 key1 key2
- # 0 0.336673 0.540336 a one
- # 1 -0.821839 -1.348654 a two
- # 2 1.066305 0.230884 b one
- # 3 0.788950 -0.540482 b two
- # 4 -0.872019 -0.813607 a one
- mapping = {0:'one',1:'one',2:'two',3:'two',4:'two'}
- print df.groupby(mapping,axis = 0).sum()
- # data1 data2
- # one 2.044990 0.916197
- # two 2.310946 -2.240196