#数据分组
#根据某些条件将数据进行拆分成组
#每个组独立应用函数
#将结果合并到一个数据结构中
import numpy as np
import pandas as pd
In [4]:
#分组
df = pd.DataFrame({'A':['foo','bar','foo','bar','foo','foo','bar','bar'],
'B':['one','two','three','two','two','one','three','one'],
'C':np.random.randn(8),
'D':np.random.randn(8)})
In [5]:
df
Out[5]:
A B C D
0 foo one 0.444202 1.406586
1 bar two -0.311666 1.196347
2 foo three 0.440234 0.949232
3 bar two -1.578572 2.464325
4 foo two -1.353510 0.773391
5 foo one 0.307378 -0.492570
6 bar three 1.446811 -2.350776
7 bar one -2.097978 0.296710
In [10]:
df.groupby(['A','B']).mean()
Out[10]:
C D
A B
bar one -2.097978 0.296710
three 1.446811 -2.350776
two -0.945119 1.830336
foo one 0.375790 0.457008
three 0.440234 0.949232
two -1.353510 0.773391
In [26]:
df.groupby(['A']).mean()
Out[26]:
C D
A
bar -0.635351 0.401652
foo -0.040424 0.659160
In [21]:
list(df.groupby(['A']))[1]list(df.groupby(['A']))[1]
Out[21]:
('foo', A B C D
0 foo one 0.444202 1.406586
2 foo three 0.440234 0.949232
4 foo two -1.353510 0.773391
5 foo one 0.307378 -0.492570)
In [24]:
list(df.groupby(['A']))[0]
Out[24]:
('bar', A B C D
1 bar two -0.311666 1.196347
3 bar two -1.578572 2.464325
6 bar three 1.446811 -2.350776
7 bar one -2.097978 0.296710)
In [28]:
df.groupby(['A','B']).size()
Out[28]:
A B
bar one 1
three 1
two 2
foo one 2
three 1
two 1
dtype: int64
In [31]:
df.groupby(['A']).groups
Out[31]:
{'bar': [1, 3, 6, 7], 'foo': [0, 2, 4, 5]}
In [51]:
s = pd.Series([1,2,3,10,20,30],index=[1,2,3,1,2,3])
grouped = s.groupby(level=0)
print(s)
print(grouped.first())#显示非NaN的第一个值
print(grouped.last())#显示非NaN的最后一个值
print(grouped.sum())#显示非NaN的和
print(grouped.mean)#平均值
print(grouped.median())#中值
print(grouped.count())#计数
print(grouped.min())#最小
print(grouped.max())#最大
print(grouped.std())#标准差
print(grouped.var())#方差
print(grouped.prod())#积
1 1
2 2
3 3
1 10
2 20
3 30
dtype: int64
1 1
2 2
3 3
dtype: int64
1 10
2 20
3 30
dtype: int64
1 11
2 22
3 33
dtype: int64
<bound method GroupBy.mean of <pandas.core.groupby.SeriesGroupBy object at 0x00000000049BBA90>>
1 5.5
2 11.0
3 16.5
dtype: float64
1 2
2 2
3 2
dtype: int64
1 1
2 2
3 3
dtype: int64
1 10
2 20
3 30
dtype: int64
1 6.363961
2 12.727922
3 19.091883
dtype: float64
1 40.5
2 162.0
3 364.5
dtype: float64
1 10
2 40
3 90
dtype: int64
In [53]:
df = pd.DataFrame({'a':[1,1,2,2],
'b':np.random.rand(4),
'c':np.random.rand(4),
'd':np.random.rand(4)})
df
Out[53]:
a b c d
0 1 0.390076 0.664425 0.493986
1 1 0.534739 0.378918 0.813577
2 2 0.894389 0.680243 0.294173
3 2 0.741806 0.223494 0.160900
In [56]:
print(df.groupby('a').agg(['mean',np.sum]))
b c d
mean sum mean sum mean sum
a
1 0.462407 0.924815 0.521671 1.043342 0.653782 1.307563
2 0.818098 1.636195 0.451869 0.903737 0.227536 0.455073
In [58]:
print(df.groupby('a')['b'].agg({'result1':np.mean,
'result2':np.sum}))
result1 result2
a
1 0.462407 0.924815
2 0.818098 1.636195
In [61]:
#小作业
df = pd.DataFrame({'A':['one','two','three','one','two','three','one','two'],
'B':['h','h','h','h','f','f','f','f'],
'C':[10,12,14,16,18,20,22,24],
'D':np.random.randn(8),
'E':np.random.rand(8)})
df
Out[61]:
A B C D E
0 one h 10 -1.188879 0.771559
1 two h 12 -0.414063 0.743417
2 three h 14 -0.241158 0.182954
3 one h 16 0.381358 0.100378
4 two f 18 -0.101517 0.291719
5 three f 20 -0.808872 0.007264
6 one f 22 -1.164982 0.351209
7 two f 24 -1.144294 0.831537
In [62]:
df.groupby('A')['C','D'].mean()
Out[62]:
C D
A
one 16 -0.657501
three 17 -0.525015
two 18 -0.553291
In [63]:
df.groupby(['A','B'])['D','E'].sum()
Out[63]:
D E
A B
one f -1.164982 0.351209
h -0.807521 0.871937
three f -0.808872 0.007264
h -0.241158 0.182954
two f -1.245811 1.123255
h -0.414063 0.743417
In [78]:
print(dict(list(df.groupby('A'))))
{'three': A B C D E
2 three h 14 -0.241158 0.182954
5 three f 20 -0.808872 0.007264, 'two': A B C D E
1 two h 12 -0.414063 0.743417
4 two f 18 -0.101517 0.291719
7 two f 24 -1.144294 0.831537, 'one': A B C D E
0 one h 10 -1.188879 0.771559
3 one h 16 0.381358 0.100378
6 one f 22 -1.164982 0.351209}
In [72]:
df2 = df[['C','D']]
df2['sum'] = df2.sum(axis = 1)
df2
C:\Program Files\Anaconda3\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
from ipykernel import kernelapp as app
Out[72]:
C D sum
0 10 -1.188879 8.811121
1 12 -0.414063 11.585937
2 14 -0.241158 13.758842
3 16 0.381358 16.381358
4 18 -0.101517 17.898483
5 20 -0.808872 19.191128
6 22 -1.164982 20.835018
7 24 -1.144294 22.855706
Python之Pandas(5)
猜你喜欢
转载自blog.csdn.net/weixin_38452632/article/details/83662711
今日推荐
周排行