Python科学计算工具Pandas(九)—— 数据分组
分组统计 - groupby功能
- 根据某些条件将数据拆分成组
- 对每个组独立应用函数
- 将结果合并到一个数据结构中
Dataframe在行(axis=0)或列(axis=1)上进行分组,将一个函数应用到各个分组并产生一个新值,然后函数执行结果被合并到最终的结果对象中。
df.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, **kwargs)
分组的基本操作
分组
# Grouping basics: split the rows of a DataFrame by the values of one or more columns.
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
print(df)
print('-----')
dfa = df.groupby('A')
print(dfa.size(), type(dfa))
# groupby() does not return a DataFrame but a dedicated GroupBy object;
# size() reports the number of rows that fall into each group.
print('========')
# numeric_only=True restricts the sum to the numeric columns C and D.
# Recent pandas releases no longer drop the string column 'B' automatically.
a = df.groupby('A').sum(numeric_only=True)
print(a, type(a))
b = df.groupby(['A', 'B']).mean()
print(b, type(b))
c = df.groupby('A')['D'].mean()
print(c, type(c))
# Aggregating a grouped DataFrame yields a new DataFrame (a Series when a
# single column is selected first).
# Grouping is done over rows (axis=0) by default.
# Pass one column name, or a list of names to group by several columns.
A B C D
0 foo one 2.479737 -2.368789
1 bar one 1.028346 0.950277
2 foo two 1.001758 -1.278156
3 bar three -0.205714 -0.330909
4 foo two 0.337572 1.256110
5 bar two 0.244171 -0.820276
6 foo one 0.554198 0.683419
7 foo three -0.534419 -0.319840
-----
A
bar 3
foo 5
dtype: int64 <class 'pandas.core.groupby.DataFrameGroupBy'>
========
C D
A
bar 1.066804 -0.200907
foo 3.838847 -2.027256 <class 'pandas.core.frame.DataFrame'>
C D
A B
bar one 1.028346 0.950277
three -0.205714 -0.330909
two 0.244171 -0.820276
foo one 1.516967 -0.842685
three -0.534419 -0.319840
two 0.669665 -0.011023 <class 'pandas.core.frame.DataFrame'>
A
bar -0.066969
foo -0.405451
Name: D, dtype: float64 <class 'pandas.core.series.Series'>
分组是一个可迭代的对象
# Grouping - a GroupBy object is iterable
df = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})
print(df)
print(df.groupby('X'), type(df.groupby('X')))
print('-----')
print(list(df.groupby('X')), '→ 可迭代对象,直接生成list\n')
print(list(df.groupby('X'))[0], '→ 以元祖形式显示\n')
# Each iteration step yields a (group name, sub-DataFrame) pair.
for n, g in df.groupby('X'):
    print(n)
    print(g, type(g))
    print('======')
# n is the group name, g is the DataFrame holding that group's rows
X Y
0 A 1
1 B 4
2 A 3
3 B 2
<pandas.core.groupby.DataFrameGroupBy object at 0x000002AF2EE7C080> <class 'pandas.core.groupby.DataFrameGroupBy'>
-----
[('A', X Y
0 A 1
2 A 3), ('B', X Y
1 B 4
3 B 2)] → 可迭代对象,直接生成list
('A', X Y
0 A 1
2 A 3) → 以元祖形式显示
A
X Y
0 A 1
2 A 3 <class 'pandas.core.frame.DataFrame'>
======
B
X Y
1 B 4
3 B 2 <class 'pandas.core.frame.DataFrame'>
======
选择分组 .get_group()
# Pull one named group out of the grouped data with get_group()
df = pd.DataFrame({'X': list('ABAB'), 'Y': [1, 4, 3, 2]})
print(df)
print('-------')
grouped_by_x = df.groupby('X')
print(grouped_by_x.get_group('A'))
print('-------')
X Y
0 A 1
1 B 4
2 A 3
3 B 2
-------
X Y
0 A 1
2 A 3
将分组转化为字典 .groups
# Expose the grouping as a dict through the .groups attribute
df = pd.DataFrame({'X': list('ABAB'), 'Y': [1, 4, 3, 2]})
print(df)
print('---------')
a = df.groupby('X')
group_index = a.groups
print(group_index, '\n')
labels_of_a = group_index['A']
print(labels_of_a, '\n')
print(labels_of_a[0])
# Each dict value is the index of the rows that belong to that group
X Y
0 A 1
1 B 4
2 A 3
3 B 2
---------
{'A': Int64Index([0, 2], dtype='int64'), 'B': Int64Index([1, 3], dtype='int64')}
Int64Index([0, 2], dtype='int64')
0
查看分组里的记录数 .size()
# .size() counts the rows contained in each group
df = pd.DataFrame({'X': list('ABAB'), 'Y': [1, 4, 3, 2]})
print(df)
print('====')
a = df.groupby('X')
rows_per_group = a.size()
print(rows_per_group)
X Y
0 A 1
1 B 4
2 A 3
3 B 2
====
X
A 2
B 2
dtype: int64
多个列分组
# Group by two columns at once; the dict keys become (A, B) tuples
group_keys = ['A', 'B']
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
grouped = df.groupby(group_keys).groups
print(df)
print('---------')
print(grouped)
print('=====')
wanted = ('foo', 'three')
print(grouped[wanted])
A B C D
0 foo one -0.539735 0.252334
1 bar one 1.247811 -0.144133
2 foo two -0.965486 0.042095
3 bar three -0.158520 -0.667123
4 foo two 1.283692 1.201100
5 bar two -0.795091 0.368176
6 foo one -0.263945 0.085682
7 foo three 0.710263 -1.238407
---------
{('bar', 'one'): Int64Index([1], dtype='int64'), ('bar', 'three'): Int64Index([3], dtype='int64'), ('bar', 'two'): Int64Index([5], dtype='int64'), ('foo', 'one'): Int64Index([0, 6], dtype='int64'), ('foo', 'three'): Int64Index([7], dtype='int64'), ('foo', 'two'): Int64Index([2, 4], dtype='int64')}
=====
Int64Index([7], dtype='int64')
在其他轴上分组
# Grouping along the other axis: split the columns by their dtype
df = pd.DataFrame({'data1':np.random.rand(2),
                   'data2':np.random.rand(2),
                   'key1':['a','b'],
                   'key2':['one','two']})
print(df)
print('------')
print(df.dtypes)
print('------')
# groupby(..., axis=1) is deprecated in recent pandas, so the dtype -> columns
# mapping is built explicitly. Groups appear in order of first occurrence.
columns_by_dtype = {}
for column, dtype in df.dtypes.items():
    columns_by_dtype.setdefault(dtype, []).append(column)
for n, columns in columns_by_dtype.items():
    p = df[columns]  # sub-frame holding only the columns of this dtype
    print(n)
    print(p)
    print('===')
# The columns are split according to their value type
data1 data2 key1 key2
0 0.257623 0.81153 a one
1 0.325821 0.78845 b two
------
data1 float64
data2 float64
key1 object
key2 object
dtype: object
------
float64
data1 data2
0 0.257623 0.81153
1 0.325821 0.78845
===
object
key1 key2
0 a one
1 b two
===
通过字典或者Series分组
# Grouping with a dict or a Series
df = pd.DataFrame(np.arange(16).reshape(4,4),
                  columns = ['a','b','c','d'])
print(df)
print('-----')
mapping = {'a':'one','b':'one','c':'two','d':'two','e':'three'}
# groupby(..., axis=1) is deprecated in recent pandas: transpose so the column
# labels become the index, group them, then transpose the result back.
by_column = df.T.groupby(mapping)
column_sums = by_column.sum().T
print(column_sums)
print('-----')
# In mapping, columns a and b belong to 'one', c and d to 'two'; the dict
# drives the grouping ('e' matches no column and is simply unused)
s = pd.Series(mapping)
print(s,'\n')
print(s.groupby(s).count())
# In s, index labels a and b map to 'one', c and d to 'two'; here the Series
# itself supplies the group keys
'''??????'''
a b c d
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15
-----
one two
0 1 5
1 9 13
2 17 21
3 25 29
-----
a one
b one
c two
d two
e three
dtype: object
one 2
three 1
two 2
dtype: int64
'??????'
通过函数分组
# Grouping with a function: it is applied to every index label
row_labels = ['abc', 'bcd', 'aa', 'b']
df = pd.DataFrame(np.arange(16).reshape(4, 4),
                  columns=list('abcd'),
                  index=row_labels)
print(df,'\n')
sums_by_label_length = df.groupby(len).sum()
print(sums_by_label_length)
# Rows are grouped by the length of their index label
a b c d
abc 0 1 2 3
bcd 4 5 6 7
aa 8 9 10 11
b 12 13 14 15
a b c d
1 12 13 14 15
2 8 9 10 11
3 4 6 8 10
分组中常见的函数
# Built-in aggregation methods of a GroupBy object
s = pd.Series([1, 2, 3, 10, 20, 30], index = [1, 2, 3, 1, 2, 3])
# level=0 groups by the index itself, so equal index labels share a group
grouped = s.groupby(level=0)
print(grouped)
# (method name, label printed after the result) pairs, in display order
aggregations = [
    ('first', '→ first:非NaN的第一个值\n'),
    ('last', '→ last:非NaN的最后一个值\n'),
    ('sum', '→ sum:非NaN的和\n'),
    ('mean', '→ mean:非NaN的平均值\n'),
    ('median', '→ median:非NaN的算术中位数\n'),
    ('count', '→ count:非NaN的值\n'),
    ('min', '→ min、max:非NaN的最小值、最大值\n'),
    ('std', '→ std,var:非NaN的标准差和方差\n'),
    ('prod', '→ prod:非NaN的积\n'),
]
for method_name, label in aggregations:
    print(getattr(grouped, method_name)(), label)
<pandas.core.groupby.SeriesGroupBy object at 0x000002AF2F1B7278>
1 1
2 2
3 3
dtype: int64 → first:非NaN的第一个值
1 10
2 20
3 30
dtype: int64 → last:非NaN的最后一个值
1 11
2 22
3 33
dtype: int64 → sum:非NaN的和
1 5.5
2 11.0
3 16.5
dtype: float64 → mean:非NaN的平均值
1 5.5
2 11.0
3 16.5
dtype: float64 → median:非NaN的算术中位数
1 2
2 2
3 2
dtype: int64 → count:非NaN的值
1 1
2 2
3 3
dtype: int64 → min、max:非NaN的最小值、最大值
1 6.363961
2 12.727922
3 19.091883
dtype: float64 → std,var:非NaN的标准差和方差
1 10
2 40
3 90
dtype: int64 → prod:非NaN的积
多函数计算
# Several aggregations at once: agg()
df = pd.DataFrame({'a':[1,1,2,2],
                   'b':np.random.randint(100, size=4),
                   'c':np.random.randint(100, size=4),
                   'd':np.random.randint(100, size=4)})
print(df)
# A list of function names applies every function to every remaining column.
multi_stats = df.groupby('a').agg(['mean', 'sum'])
print(multi_stats)
# Named aggregation (output_name=function) replaces the old dict-renaming form
# on a single column, which pandas deprecated and later removed.
b_stats = df.groupby('a')['b'].agg(mean='mean', sum='sum')
print(b_stats)
# Functions can be given as strings (or as callables).
# With named aggregation the keyword names become the result's column names.
a b c d
0 1 47 0 61
1 1 83 52 2
2 2 54 77 87
3 2 52 99 97
b c d
mean sum mean sum mean sum
a
1 65 130 26 52 31.5 63
2 53 106 88 176 92.0 184
mean sum
a
1 65 130
2 53 106
F:\Anaconda3\lib\site-packages\ipykernel_launcher.py:10: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
# Remove the CWD from sys.path while we load stuff.
分组转换
数据分组转换 transform
# Group-wise transformation: transform()
df = pd.DataFrame({'data1':np.random.randint(100, size=5),
                   'data2':np.random.randint(100, size=5),
                   'key1':list('aabba'),
                   'key2':['one','two','one','two','one']})
# numeric_only=True: the string column key2 cannot be averaged, and recent
# pandas raises instead of silently dropping it.
k_mean = df.groupby('key1').mean(numeric_only=True)
print(df)
print(k_mean)
# add_prefix('mean_') prefixes every column of the merged frame
merged = pd.merge(df, k_mean, left_on='key1', right_index=True).add_prefix('mean_')
print(merged)
print('============')
# Grouping then merging gives a DataFrame that carries the group means
print(df.groupby('key2').mean(numeric_only=True))  # mean per key2 group
# Only the numeric columns are transformed; strings cannot be averaged.
key2_means = df.groupby('key2')[['data1', 'data2']].transform('mean')
print(key2_means)
# Every element of data1/data2 is replaced by the mean of its own group,
# so the result keeps the original row index.
data1 data2 key1 key2
0 7 98 a one
1 77 3 a two
2 50 73 b one
3 74 23 b two
4 21 9 a one
data1 data2
key1
a 35.0 36.666667
b 62.0 48.000000
mean_data1_x mean_data2_x mean_key1 mean_key2 mean_data1_y mean_data2_y
0 7 98 a one 35.0 36.666667
1 77 3 a two 35.0 36.666667
4 21 9 a one 35.0 36.666667
2 50 73 b one 62.0 48.000000
3 74 23 b two 62.0 48.000000
============
data1 data2
key2
one 26.0 60.0
two 75.5 13.0
data1 data2
0 26.0 60
1 75.5 13
2 26.0 60
3 75.5 13
4 26.0 60
一般化Groupby方法:apply
# General-purpose GroupBy method: apply()
df = pd.DataFrame({'data1':np.random.randint(100, size=5),
                   'data2':np.random.randint(100, size=5),
                   'key1':list('aabba'),
                   'key2':['one','two','one','two','one']})
print(df.groupby('key1').apply(lambda x: x.describe()))
# apply() calls the given function once per group;
# here an anonymous function computes descriptive statistics.
print('=========================')


def f_df1(d, n):
    """Return the first ``n`` rows of ``d`` after sorting it by index."""
    return d.sort_index()[:n]


def f_df2(d, k1):
    """Return column ``k1`` of ``d``."""
    return d[k1]


print(df.groupby('key1').apply(f_df1,2),'\n')
print(df.groupby('key1').apply(f_df2,'data2'))
print(type(df.groupby('key1').apply(f_df2,'data2')))
# f_df1: first n rows of each group, ordered by index.
# f_df2: column k1 of each group; the combined result is a Series with a
# hierarchical index.
# Extra arguments for the function follow it in the apply() call; they can
# also be passed by keyword, e.g. .apply(f_df1, n=2).
data1 data2
key1
a count 3.000000 3.000000
mean 39.666667 47.333333
std 45.566801 33.306656
min 4.000000 10.000000
25% 14.000000 34.000000
50% 24.000000 58.000000
75% 57.500000 66.000000
max 91.000000 74.000000
b count 2.000000 2.000000
mean 25.500000 18.500000
std 3.535534 16.263456
min 23.000000 7.000000
25% 24.250000 12.750000
50% 25.500000 18.500000
75% 26.750000 24.250000
max 28.000000 30.000000
=========================
data1 data2 key1 key2
key1
a 0 4 10 a one
1 91 58 a two
b 2 28 7 b one
3 23 30 b two
key1
a 0 10
1 58
4 74
b 2 7
3 30
Name: data2, dtype: int32
<class 'pandas.core.series.Series'>