ch09-GroupBy

split->apply->combine

1
2
import numpy as np
import pandas as pd
1
2
3
4
5
# Build a demo frame: two string key columns plus two random numeric columns.
data = {
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
}
df = pd.DataFrame(data)
df
key1 key2 data1 data2
0 a one 1.587125 -0.517650
1 a two 0.206854 1.503013
2 b one 1.074688 -1.310088
3 b two 0.306591 2.236456
4 a one 0.462624 0.643336
1
2
3

# Group the data1 column by the values of key1. The result is a
# SeriesGroupBy object; nothing is computed until an aggregation is called.
key_values = df['key1']
grouped = df['data1'].groupby(key_values)
grouped
<pandas.core.groupby.groupby.SeriesGroupBy object at 0x10637ba20>
1
# Aggregate: mean of data1 within each key1 group.
grouped.mean()
key1
a    0.752201
b    0.690639
Name: data1, dtype: float64
1
# Per-group summary statistics (count/mean/std/quantiles).
grouped.describe()
count mean std min 25% 50% 75% max
key1
a 3.0 0.752201 0.734288 0.206854 0.334739 0.462624 1.024874 1.587125
b 2.0 0.690639 0.543127 0.306591 0.498615 0.690639 0.882664 1.074688
1
2
# Grouping by two keys at once produces a Series with a hierarchical index.
group_keys = [df['key1'], df['key2']]
means = df['data1'].groupby(group_keys).mean()
means
key1  key2
a     one     1.024874
      two     0.206854
b     one     1.074688
      two     0.306591
Name: data1, dtype: float64
1
# Pivot the inner index level (key2) into columns.
means.unstack()
key2 one two
key1
a 1.024874 0.206854
b 1.074688 0.306591
1
2
3
# Group keys can be any arrays of the right length, not just columns of df.
state_labels = ['Ohio', 'California', 'California', 'Ohio', 'Ohio']
year_labels = [2005, 2005, 2006, 2005, 2006]
states = np.array(state_labels)
years = np.array(year_labels)
df['data1'].groupby([states, years]).mean()
California  2005    0.206854
            2006    1.074688
Ohio        2005    0.946858
            2006    0.462624
Name: data1, dtype: float64
1
# Passing a column name groups on that column; numeric columns are averaged.
df.groupby('key1').mean()
data1 data2
key1
a 0.752201 0.542900
b 0.690639 0.463184
1
# Grouping on both keys yields a hierarchically indexed result frame.
df.groupby(['key1','key2']).mean()
data1 data2
key1 key2
a one 1.024874 0.062843
two 0.206854 1.503013
b one 1.074688 -1.310088
two 0.306591 2.236456

对分组进行迭代

1
2
3
4
5
# A GroupBy object supports iteration, yielding (group_name, sub_frame)
# 2-tuples, one per group.

for key, chunk in df.groupby('key1'):
    print(key)
    print(chunk)
a
  key1 key2     data1     data2
0    a  one  1.587125 -0.517650
1    a  two  0.206854  1.503013
4    a  one  0.462624  0.643336
b
  key1 key2     data1     data2
2    b  one  1.074688 -1.310088
3    b  two  0.306591  2.236456
1
2
3
4
5
# With multiple grouping keys, the first tuple element is itself a tuple
# of the key values.

for key_tuple, chunk in df.groupby(['key1','key2']):
    print(key_tuple)
    print(chunk)
('a', 'one')
  key1 key2     data1     data2
0    a  one  1.587125 -0.517650
4    a  one  0.462624  0.643336
('a', 'two')
  key1 key2     data1     data2
1    a  two  0.206854  1.503013
('b', 'one')
  key1 key2     data1     data2
2    b  one  1.074688 -1.310088
('b', 'two')
  key1 key2     data1     data2
3    b  two  0.306591  2.236456
1
2
3
# Materialize the groups as a dict mapping each key to its sub-frame.
pieces = {key: piece for key, piece in df.groupby('key1')}
pieces['b']
key1 key2 data1 data2
2 b one 1.074688 -1.310088
3 b two 0.306591 2.236456

选取一列或列的子集

1
# Same result as df.groupby('key1')['data1'].describe() below.
df['data1'].groupby(df['key1']).describe()
count mean std min 25% 50% 75% max
key1
a 3.0 0.752201 0.734288 0.206854 0.334739 0.462624 1.024874 1.587125
b 2.0 0.690639 0.543127 0.306591 0.498615 0.690639 0.882664 1.074688
1
df.groupby(['key1'])['data1'].describe()  # select only the 'data1' column from the groupby
count mean std min 25% 50% 75% max
key1
a 3.0 0.752201 0.734288 0.206854 0.334739 0.462624 1.024874 1.587125
b 2.0 0.690639 0.543127 0.306591 0.498615 0.690639 0.882664 1.074688

通过字典或Series进行分组

1
2
3
4
5
# Random 5x5 frame indexed by names; punch a couple of NA holes in row 'Wes'.
row_names = ['Joe', 'Steve', 'Wes', 'Jim', 'Travis']
people = pd.DataFrame(
    np.random.randn(5, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
    index=row_names,
)
people.iloc[2:3, [1, 2]] = np.nan
people
a b c d e
Joe -0.323368 -0.768278 -0.169238 0.308583 -0.459496
Steve -0.597247 -0.301618 -0.439395 -1.482591 -0.019049
Wes 0.100330 NaN NaN 0.457292 1.192268
Jim -0.223226 -0.196377 2.264969 -0.982435 -1.851675
Travis -1.245037 -0.080968 -0.509985 0.928633 -0.474299
1
2
# Column -> group-name mapping used to group columns by color.
mapping = {
    'a': 'red',
    'b': 'red',
    'c': 'blue',
    'd': 'blue',
    'e': 'red',
    'f': 'orange',  # unused key: only labels present on the axis are grouped
}
1
2
# Group the COLUMNS of people using the mapping dict.
# NOTE(review): groupby(..., axis=1) is deprecated in pandas 2.x — confirm
# target pandas version before reuse.
by_columns = people.groupby(mapping,axis=1) # mapping must be compatible with the axis length
by_columns.sum()
blue red
Joe 0.139345 -1.551143
Steve -1.921986 -0.917914
Wes 0.457292 1.292598
Jim 1.282534 -2.271279
Travis 0.418648 -1.800303

通过函数进行分组

1
# A function passed as the key is called once per index label; here len
# groups the rows by the length of each person's name.
people.groupby(len).sum()
a b c d e
3 -0.446265 -0.964655 2.095731 -0.216560 -1.118904
5 -0.597247 -0.301618 -0.439395 -1.482591 -0.019049
6 -1.245037 -0.080968 -0.509985 0.928633 -0.474299

根据索引级别分组

1
2
3
4
5
# Frame with hierarchical (MultiIndex) columns: country / tenor levels.
label_arrays = [['US', 'US', 'US', 'JP', 'JP'],
                [1, 3, 5, 1, 3]]
columns = pd.MultiIndex.from_arrays(label_arrays, names=['cty', 'tenor'])
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df
cty US JP
tenor 1 3 5 1 3
0 -0.297194 0.028656 1.206242 1.330768 -0.267134
1 0.104323 -1.416412 -0.602244 -0.442782 -0.654808
2 1.199846 1.050484 0.077949 -0.214932 1.186233
3 0.214656 0.337322 -0.324943 1.581862 0.821636

数据聚合

主要讲sum(),count()等函数计算时到底发生了什么,我们也可以使用自己定义的聚合函数。

1
2
# quantile is a Series method, but it works through the groupby as well.
grouped = df.groupby('key1')
q90 = grouped['data1'].quantile(0.9)
q90
key1
a    1.362225
b    0.997878
Name: data1, dtype: float64
1
# describe() on a GroupBy, stacked into long format for a short preview.
grouped.describe().stack().head()
data1 data2
key1
a count 3.000000 3.000000
mean 0.752201 0.542900
std 0.734288 1.014069
min 0.206854 -0.517650
25% 0.334739 0.062843

分组运算和转换

1
#pass transform()

apply:一般性的‘拆分-应用-合并’

最一般化的GroupBy方法

1
2
3
# Load the tipping dataset and add the tip as a fraction of the total bill.
tips = pd.read_csv('examples/tips.csv')
tip_ratio = tips['tip'] / tips['total_bill']
tips['tip_pct'] = tip_ratio
tips[:6]
total_bill tip smoker day time size tip_pct
0 16.99 1.01 No Sun Dinner 2 0.059447
1 10.34 1.66 No Sun Dinner 3 0.160542
2 21.01 3.50 No Sun Dinner 3 0.166587
3 23.68 3.31 No Sun Dinner 2 0.139780
4 24.59 3.61 No Sun Dinner 4 0.146808
5 25.29 4.71 No Sun Dinner 4 0.186240
1
# Group by day and smoker status for the aggregation examples below.
grouped = tips.groupby(['day', 'smoker'])
1
2
# agg with a function-name string is equivalent to calling that method.
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')
day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64
1
# A list of functions produces one output column per function.
grouped_pct.agg(['mean', 'std'])
mean std
day smoker
Fri No 0.151650 0.028123
Yes 0.174783 0.051293
Sat No 0.158048 0.039767
Yes 0.147906 0.061375
Sun No 0.160113 0.042347
Yes 0.187250 0.154134
Thur No 0.160298 0.038774
Yes 0.163863 0.039389
1
grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])  # (name, function) tuples set the output column names
foo bar
day smoker
Fri No 0.151650 0.028123
Yes 0.174783 0.051293
Sat No 0.158048 0.039767
Yes 0.147906 0.061375
Sun No 0.160113 0.042347
Yes 0.187250 0.154134
Thur No 0.160298 0.038774
Yes 0.163863 0.039389
1
2
3
# Apply several aggregations to multiple selected columns at once.
functions = ['count', 'mean', 'max']
# Select the columns with a LIST: grouped['tip_pct', 'total_bill'] (tuple
# indexing) is deprecated and removed in modern pandas.
result = grouped[['tip_pct', 'total_bill']].agg(functions)
result
tip_pct total_bill
count mean max count mean max
day smoker
Fri No 4 0.151650 0.187735 4 18.420000 22.75
Yes 15 0.174783 0.263480 15 16.813333 40.17
Sat No 45 0.158048 0.291990 45 19.661778 48.33
Yes 42 0.147906 0.325733 42 21.276667 50.81
Sun No 57 0.160113 0.252672 57 20.506667 48.17
Yes 19 0.187250 0.710345 19 24.120000 45.35
Thur No 45 0.160298 0.266312 45 17.113111 41.19
Yes 17 0.163863 0.241255 17 19.190588 43.11

分位数和桶分析

1
2
3
4
5
6
# Bucket analysis: combine groupby with pandas' cut/qcut binning helpers.

raw = {'data1': np.random.randn(1000),
       'data2': np.random.randn(1000)}
frame = pd.DataFrame(raw)
quartiles = pd.cut(frame.data1, 4)
quartiles[:10]
0    (-2.845, -1.307]
1      (0.225, 1.757]
2      (0.225, 1.757]
3      (0.225, 1.757]
4     (-1.307, 0.225]
5      (0.225, 1.757]
6      (0.225, 1.757]
7     (-1.307, 0.225]
8       (1.757, 3.29]
9     (-1.307, 0.225]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-2.845, -1.307] < (-1.307, 0.225] < (0.225, 1.757] < (1.757, 3.29]]
1
2
3
4
5
def get_stats(group):
    """Return min/max/count/mean of a Series as a dict.

    Used with GroupBy.apply below; the dict keys become an index level
    that unstack() pivots into columns.
    """
    # Original source was `def (group):` — the function name was missing
    # (a SyntaxError); it is called as get_stats() later in this file.
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}
# Bin data2 by each row's data1 quartile and summarize every bucket.
binned = frame.data2.groupby(quartiles)
binned.apply(get_stats).unstack()
count max mean min
data1
(-2.845, -1.307] 106.0 1.826602 0.117032 -2.382809
(-1.307, 0.225] 478.0 3.553988 0.079556 -2.690484
(0.225, 1.757] 379.0 2.647662 0.005168 -2.656434
(1.757, 3.29] 37.0 1.995249 0.024612 -1.838962

示例

后续补上

猜你喜欢

转载自www.cnblogs.com/wangziqiang123/p/11712032.html
09