split->apply->combine
1 2
# Extraction defect: two import statements were fused onto one line,
# producing invalid syntax.  Split back into the conventional form.
import numpy as np
import pandas as pd
1 2 3 4 5
# Build the example DataFrame: two string key columns and two random
# numeric columns (values differ per run -- no seed is set).
# The trailing bare `df` is the notebook display expression.
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})
df
key1
key2
data1
data2
0
a
one
1.587125
-0.517650
1
a
two
0.206854
1.503013
2
b
one
1.074688
-1.310088
3
b
two
0.306591
2.236456
4
a
one
0.462624
0.643336
1 2 3
# Extraction defect: assignment and display expression were fused.
# Grouping 'data1' by 'key1' yields a SeriesGroupBy object -- no
# computation happens until an aggregation such as .mean() is called.
grouped = df['data1'].groupby(df['key1'])
grouped
<pandas.core.groupby.groupby.SeriesGroupBy object at 0x10637ba20>
key1
a 0.752201
b 0.690639
Name: data1, dtype: float64
count
mean
std
min
25%
50%
75%
max
key1
a
3.0
0.752201
0.734288
0.206854
0.334739
0.462624
1.024874
1.587125
b
2.0
0.690639
0.543127
0.306591
0.498615
0.690639
0.882664
1.074688
1 2
# Extraction defect: assignment and display expression were fused.
# Grouping by two keys produces a Series with a two-level
# (key1, key2) MultiIndex.
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means
key1 key2
a one 1.024874
two 0.206854
b one 1.074688
two 0.306591
Name: data1, dtype: float64
key2
one
two
key1
a
1.024874
0.206854
b
1.074688
0.306591
1 2 3
# Extraction defect: three statements were fused onto one line.
# Grouping keys need not come from the frame itself -- any
# equal-length arrays work.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()
California 2005 0.206854
2006 1.074688
Ohio 2005 0.946858
2006 0.462624
Name: data1, dtype: float64
1
# Mean of each numeric column per 'key1' group.  numeric_only=True is
# required on modern pandas (>=2.0): the non-grouped 'key2' column
# holds strings, and .mean() no longer silently drops it -- it raises
# a TypeError instead.  Older pandas produced the same result without
# the flag.
df.groupby('key1').mean(numeric_only=True)
data1
data2
key1
a
0.752201
0.542900
b
0.690639
0.463184
1
# Group by both key columns; only the numeric data1/data2 columns
# remain to aggregate, so the result has a two-level row index.
df.groupby(['key1' ,'key2' ]).mean()
data1
data2
key1
key2
a
one
1.024874
0.062843
two
0.206854
1.503013
b
one
1.074688
-1.310088
two
0.306591
2.236456
对分组进行迭代
1 2 3 4 5
# Extraction defect: loop header and body were fused onto one line.
# GroupBy objects are iterable, yielding (group name, sub-frame)
# pairs -- one per distinct key value.
for name, group in df.groupby('key1'):
    print(name)
    print(group)
a
key1 key2 data1 data2
0 a one 1.587125 -0.517650
1 a two 0.206854 1.503013
4 a one 0.462624 0.643336
b
key1 key2 data1 data2
2 b one 1.074688 -1.310088
3 b two 0.306591 2.236456
1 2 3 4 5
# Extraction defect: loop header and body were fused onto one line.
# With multiple grouping keys, the group name is a tuple of key values.
for name, group in df.groupby(['key1', 'key2']):
    print(name)
    print(group)
('a', 'one')
key1 key2 data1 data2
0 a one 1.587125 -0.517650
4 a one 0.462624 0.643336
('a', 'two')
key1 key2 data1 data2
1 a two 0.206854 1.503013
('b', 'one')
key1 key2 data1 data2
2 b one 1.074688 -1.310088
('b', 'two')
key1 key2 data1 data2
3 b two 0.306591 2.236456
1 2 3
# Extraction defect: two statements were fused onto one line.
# Materialise the groups into a dict mapping key value -> sub-frame.
pieces = dict(list(df.groupby('key1')))
pieces['b']
key1
key2
data1
data2
2
b
one
1.074688
-1.310088
3
b
two
0.306591
2.236456
选取一个或者一列
1
# Summary statistics of data1 within each key1 group.
df['data1' ].groupby(df['key1' ]).describe()
count
mean
std
min
25%
50%
75%
max
key1
a
3.0
0.752201
0.734288
0.206854
0.334739
0.462624
1.024874
1.587125
b
2.0
0.690639
0.543127
0.306591
0.498615
0.690639
0.882664
1.074688
1
# Same result via column selection on a DataFrame groupby: grouping
# first and then picking 'data1' is syntactic sugar for the
# Series-groupby form shown above.
df.groupby(['key1' ])['data1' ].describe()
count
mean
std
min
25%
50%
75%
max
key1
a
3.0
0.752201
0.734288
0.206854
0.334739
0.462624
1.024874
1.587125
b
2.0
0.690639
0.543127
0.306591
0.498615
0.690639
0.882664
1.074688
通过字典或Series进行分组
1 2 3 4 5
# Extraction defect: three statements were fused onto one line.
# 5x5 frame of random values with named rows; two NaNs are poked into
# Wes's row to show that aggregations skip missing data.
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.iloc[2:3, [1, 2]] = np.nan
people
a
b
c
d
e
Joe
-0.323368
-0.768278
-0.169238
0.308583
-0.459496
Steve
-0.597247
-0.301618
-0.439395
-1.482591
-0.019049
Wes
0.100330
NaN
NaN
0.457292
1.192268
Jim
-0.223226
-0.196377
2.264969
-0.982435
-1.851675
Travis
-1.245037
-0.080968
-0.509985
0.928633
-0.474299
1 2
# Column -> group-label mapping.  Note 'f' matches no column of
# `people`; unused mapping keys are simply ignored by groupby.
mapping = dict(a='red', b='red', c='blue', d='blue', e='red', f='orange')
1 2
# Extraction defect: two statements were fused onto one line.
# Group the *columns* (axis=1) by the mapping and sum within each
# colour group.  NOTE(review): axis=1 groupby is deprecated in recent
# pandas; people.T.groupby(mapping).sum().T is the forward-compatible
# equivalent -- confirm the target pandas version before switching.
by_columns = people.groupby(mapping, axis=1)
by_columns.sum()
blue
red
Joe
0.139345
-1.551143
Steve
-1.921986
-0.917914
Wes
0.457292
1.292598
Jim
1.282534
-2.271279
Travis
0.418648
-1.800303
通过函数进行分组
1
# A function passed as a grouping key is called once per index value;
# here len() groups rows by name length (3, 5, 6).
people.groupby(len).sum()
a
b
c
d
e
3
-0.446265
-0.964655
2.095731
-0.216560
-1.118904
5
-0.597247
-0.301618
-0.439395
-1.482591
-0.019049
6
-1.245037
-0.080968
-0.509985
0.928633
-0.474299
根据索引级别分组
1 2 3 4 5
# Extraction defect: three statements were fused onto one line.
# Hierarchical (two-level) columns: country on the outer level,
# tenor on the inner level.
columns = pd.MultiIndex.from_arrays(
    [['US', 'US', 'US', 'JP', 'JP'], [1, 3, 5, 1, 3]],
    names=['cty', 'tenor'])
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df
cty
US
JP
tenor
1
3
5
1
3
0
-0.297194
0.028656
1.206242
1.330768
-0.267134
1
0.104323
-1.416412
-0.602244
-0.442782
-0.654808
2
1.199846
1.050484
0.077949
-0.214932
1.186233
3
0.214656
0.337322
-0.324943
1.581862
0.821636
数据聚合
主要讲sum(),count()等函数计算时到底发生了什么,我们也可以使用自己定义的聚合函数。
1 2
# Extraction defect: two statements were fused onto one line.
# quantile() is a Series method, dispatched to each group in turn.
grouped = df.groupby('key1')
grouped['data1'].quantile(0.9)
key1
a 1.362225
b 0.997878
Name: data1, dtype: float64
1
# describe() per group, then stack the statistic names into the row
# index; head() shows the first five rows of the result.
grouped.describe().stack().head()
data1
data2
key1
a
count
3.000000
3.000000
mean
0.752201
0.542900
std
0.734288
1.014069
min
0.206854
-0.517650
25%
0.334739
0.062843
分组运算和转换
apply:一般性的‘拆分-应用-合并’
最一般化的GroupBy方法
1 2 3
# Extraction defect: three statements were fused onto one line.
# Load the tipping dataset and add the tip as a fraction of the
# total bill.
tips = pd.read_csv('examples/tips.csv')
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips[:6]
total_bill
tip
smoker
day
time
size
tip_pct
0
16.99
1.01
No
Sun
Dinner
2
0.059447
1
10.34
1.66
No
Sun
Dinner
3
0.160542
2
21.01
3.50
No
Sun
Dinner
3
0.166587
3
23.68
3.31
No
Sun
Dinner
2
0.139780
4
24.59
3.61
No
Sun
Dinner
4
0.146808
5
25.29
4.71
No
Sun
Dinner
4
0.186240
1
# Group the tips data by day of week and smoker/non-smoker status.
grouped = tips.groupby(['day' , 'smoker' ])
1 2
# Extraction defect: two statements were fused onto one line.
# agg() accepts a function *name* as a string.
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')
day smoker
Fri No 0.151650
Yes 0.174783
Sat No 0.158048
Yes 0.147906
Sun No 0.160113
Yes 0.187250
Thur No 0.160298
Yes 0.163863
Name: tip_pct, dtype: float64
1
# Passing a list of functions yields one result column per function.
grouped_pct.agg(['mean' , 'std' ])
mean
std
day
smoker
Fri
No
0.151650
0.028123
Yes
0.174783
0.051293
Sat
No
0.158048
0.039767
Yes
0.147906
0.061375
Sun
No
0.160113
0.042347
Yes
0.187250
0.154134
Thur
No
0.160298
0.038774
Yes
0.163863
0.039389
1
# A (name, function) tuple lets you choose the output column name.
# The string 'std' replaces the original np.std: passing the NumPy
# callable to agg is deprecated in modern pandas, and it was mapped
# to the same ddof=1 sample standard deviation anyway (the 'bar'
# column matches the 'std' column of the previous table).
grouped_pct.agg([('foo', 'mean'), ('bar', 'std')])
foo
bar
day
smoker
Fri
No
0.151650
0.028123
Yes
0.174783
0.051293
Sat
No
0.158048
0.039767
Yes
0.147906
0.061375
Sun
No
0.160113
0.042347
Yes
0.187250
0.154134
Thur
No
0.160298
0.038774
Yes
0.163863
0.039389
1 2 3
# Extraction defect: three statements were fused onto one line.
# Apply the same list of aggregations to several columns at once.
# The original used grouped['tip_pct', 'total_bill'] (tuple
# indexing), which is removed in modern pandas -- a list of column
# labels is required.
functions = ['count', 'mean', 'max']
result = grouped[['tip_pct', 'total_bill']].agg(functions)
result
tip_pct
total_bill
count
mean
max
count
mean
max
day
smoker
Fri
No
4
0.151650
0.187735
4
18.420000
22.75
Yes
15
0.174783
0.263480
15
16.813333
40.17
Sat
No
45
0.158048
0.291990
45
19.661778
48.33
Yes
42
0.147906
0.325733
42
21.276667
50.81
Sun
No
57
0.160113
0.252672
57
20.506667
48.17
Yes
19
0.187250
0.710345
19
24.120000
45.35
Thur
No
45
0.160298
0.266312
45
17.113111
41.19
Yes
17
0.163863
0.241255
17
19.190588
43.11
分位数和桶分析
1 2 3 4 5 6
# Extraction defect: three statements were fused onto one line.
# Cut data1 into four equal-width buckets; pd.cut returns a
# categorical whose categories are the bucket intervals.
frame = pd.DataFrame({'data1': np.random.randn(1000),
                      'data2': np.random.randn(1000)})
quartiles = pd.cut(frame.data1, 4)
quartiles[:10]
0 (-2.845, -1.307]
1 (0.225, 1.757]
2 (0.225, 1.757]
3 (0.225, 1.757]
4 (-1.307, 0.225]
5 (0.225, 1.757]
6 (0.225, 1.757]
7 (-1.307, 0.225]
8 (1.757, 3.29]
9 (-1.307, 0.225]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-2.845, -1.307] < (-1.307, 0.225] < (0.225, 1.757] < (1.757, 3.29]]
1 2 3 4 5
# Extraction defect: the function name was lost (`def (group):` is
# invalid) and the statements were fused.  The apply() call below
# refers to get_stats, so that name is restored.
def get_stats(group):
    """Return a dict of summary statistics for one group (a Series)."""
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}

# Group data2 by the data1 bucket each row falls into, then compute
# the stats per bucket; unstack() spreads the dict keys into columns.
grouped = frame.data2.groupby(quartiles)
grouped.apply(get_stats).unstack()
count
max
mean
min
data1
(-2.845, -1.307]
106.0
1.826602
0.117032
-2.382809
(-1.307, 0.225]
478.0
3.553988
0.079556
-2.690484
(0.225, 1.757]
379.0
2.647662
0.005168
-2.656434
(1.757, 3.29]
37.0
1.995249
0.024612
-1.838962
示例
后续补上