# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from datetime import datetime
# --- Grouping with groupby ---
# Small demo frame used by every example below: two label columns
# (key1, key2) and two integer data columns (data1, data2).
df = pd.DataFrame(dict(
    key1=['a', 'a', 'b', 'b', 'a'],
    key2=['one', 'two', 'one', 'two', 'one'],
    data1=np.arange(5),
    data2=np.arange(5),
))
print(df)
#   key1 key2  data1  data2
# 0    a  one      0      0
# 1    a  two      1      1
# 2    b  one      2      2
# 3    b  two      3      3
# 4    a  one      4      4
# --- Computing per-group statistics ---

# Group data1 by key1 and take the mean of each group.
grouped = df['data1'].groupby(df['key1'])
print(grouped.mean())
# a    1.666667
# b    2.500000

# Group data1 by key1 and key2 together and take the mean.
groupedmean = df['data1'].groupby([df['key1'], df['key2']]).mean()
print(groupedmean)
# key1  key2
# a     one     2.0
#       two     1.0
# b     one     2.0
#       two     3.0

# Pivot the inner index level (key2) into columns.
print(groupedmean.unstack())
# key2  one  two
# key1
# a     2.0  1.0
# b     2.0  3.0

# Note: df['key1'] is a Series. Group keys may be Series, but they may also
# be plain arrays of the same length as the data being grouped.
states = np.array(['Oh', 'Ca', 'Ca', 'Oh', 'Oh'])
years = np.array([2005, 2005, 2006, 2005, 2006])
print(df['data1'].groupby([states, years]).mean())
# Ca  2005    1.0
#     2006    2.0
# Oh  2005    1.5
#     2006    4.0

# Group by column name. Non-numeric columns (key2 here) cannot be averaged;
# older pandas dropped them silently, pandas 2.0+ raises a TypeError unless
# numeric_only=True is requested explicitly.
print(df.groupby('key1').mean(numeric_only=True))
#          data1     data2
# key1
# a     1.666667  1.666667
# b     2.500000  2.500000

# Add key2 as a second grouping column.
print(df.groupby(['key1', 'key2']).mean())
#            data1  data2
# key1 key2
# a    one     2.0    2.0
#      two     1.0    1.0
# b    one     2.0    2.0
#      two     3.0    3.0

# size() returns a Series holding the number of rows in each group.
print(df.groupby(['key1', 'key2']).size())
# key1  key2
# a     one     2
#       two     1
# b     one     1
#       two     1
# --- Iterating over groups ---

# Iterating a GroupBy yields (group_key, sub_frame) pairs; here the keys
# are the distinct key1 values 'a' and 'b'.
for name, group in df.groupby('key1'):
    print(name)
    print(group)
# a
#   key1 key2  data1  data2
# 0    a  one      0      0
# 1    a  two      1      1
# 4    a  one      4      4
# b
#   key1 key2  data1  data2
# 2    b  one      2      2
# 3    b  two      3      3

# With several grouping keys, each group key is a tuple that can be
# unpacked directly in the loop header.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    # Print the key of the current group (not the leftover `name`
    # variable from the previous loop).
    print(k1, k2)
    print(group)
# a one
#   key1 key2  data1  data2
# 0    a  one      0      0
# 4    a  one      4      4
# a two
#   key1 key2  data1  data2
# 1    a  two      1      1
# b one
#   key1 key2  data1  data2
# 2    b  one      2      2
# b two
#   key1 key2  data1  data2
# 3    b  two      3      3
# --- Turning the groups into a dict ---
# Map each key1 value to the sub-frame holding that group's rows.
pieces = {key: frame for key, frame in df.groupby('key1')}
print(pieces)
# {'a':   key1 key2  data1  data2
# 0    a  one      0      0
# 1    a  two      1      1
# 4    a  one      4      4, 'b':   key1 key2  data1  data2
# 2    b  one      2      2
# 3    b  two      3      3}
print(pieces['a'])
#   key1 key2  data1  data2
# 0    a  one      0      0
# 1    a  two      1      1
# 4    a  one      4      4
# --- Grouping by data type ---
print(df.dtypes)
# key1     object
# key2     object
# data1     int32   (int64 on many platforms)
# data2     int32

# Column-wise grouping: one sub-frame per dtype. groupby(..., axis=1) is
# deprecated since pandas 2.1, so select the columns of each dtype with a
# boolean mask instead; this also keeps the original column dtypes.
one = {dtype: df.loc[:, df.dtypes == dtype] for dtype in df.dtypes.unique()}

# Row-wise grouping with the same key. df.dtypes is indexed by column name,
# which does not align with the row index, so no row gets a group key.
two = df.groupby(df.dtypes)

print(one)
# {dtype('O'):   key1 key2
# 0    a  one
# 1    a  two
# 2    b  one
# 3    b  two
# 4    a  one, dtype('int32'):    data1  data2
# 0      0      0
# 1      1      1
# 2      2      2
# 3      3      3
# 4      4      4}
print(dict(list(two)))
# {}
# --- Selecting one column or a set of columns from a GroupBy ---
# Indexing a GroupBy object with a column name (or a list of names)
# restricts the aggregation to those columns.

# Group by key1, keep only data1 (prints the un-aggregated GroupBy object).
print(df.groupby(by=df.key1)['data1'])
# Group by key1, aggregate data1 and data2.
print(df.groupby(by=['key1'])[['data1', 'data2']].mean())
#          data1     data2
# key1
# a     1.666667  1.666667
# b     2.500000  2.500000
print(df.groupby(by=['key1', 'key2'])['data1'].mean())
# key1  key2
# a     one     2
#       two     1
# b     one     2
#       two     3

# Equivalent spelling: select the column first, then group it by Series keys.
print(df['data1'].groupby(by=df.key1))
print(df['data1'].groupby(by=[df.key1, df.key2]).mean())
# key1  key2
# a     one     2
#       two     1
# b     one     2
#       two     3
# --- Grouping with a dict or a Series ---
# Grouping by a Series was shown above; this section groups by a dict.
people = pd.DataFrame(np.arange(25).reshape(5, 5),
                      columns=list('abcde'),
                      index=['北京', '上海', '广州', '深圳', '杭州3'])
print(people)
#        a   b   c   d   e
# 北京    0   1   2   3   4
# 上海    5   6   7   8   9
# 广州   10  11  12  13  14
# 深圳   15  16  17  18  19
# 杭州3  20  21  22  23  24

# Insert a few NaN values (third row, columns b and c).
# The target columns are cast to float first because writing NaN into an
# integer column relies on implicit upcasting, which newer pandas rejects.
# `.iloc` replaces the `.ix` indexer that was removed in pandas 1.0.
people = people.astype({'b': float, 'c': float})
people.iloc[2:3, 1:3] = np.nan
print(people)
#        a     b     c   d   e
# 北京    0   1.0   2.0   3   4
# 上海    5   6.0   7.0   8   9
# 广州   10   NaN   NaN  13  14
# 深圳   15  16.0  17.0  18  19
# 杭州3  20  21.0  22.0  23  24

# Transpose so that the cities become the columns.
people = people.T
print(people)
#     北京   上海    广州    深圳   杭州3
# a  0.0  5.0  10.0  15.0  20.0
# b  1.0  6.0   NaN  16.0  21.0
# c  2.0  7.0   NaN  17.0  22.0
# d  3.0  8.0  13.0  18.0  23.0
# e  4.0  9.0  14.0  19.0  24.0

# The dict keys must match the city labels exactly (note '杭州3'); a label
# missing from the mapping would be left out of every group.
mapping = {'北京': '环境好', '上海': '环境好', '广州': '经济好',
           '深圳': '经济好', '杭州3': '环境好'}

# Group the city columns through the mapping. Grouping along axis=1 is
# deprecated, so group the rows of the transposed frame and transpose the
# aggregated result back.
by_columns = people.T.groupby(mapping)
print(by_columns.sum().T)
#     环境好   经济好
# a  25.0  25.0
# b  28.0  16.0
# c  31.0  17.0
# d  34.0  31.0
# e  37.0  33.0
# --- Grouping with a function ---
# A function used as a group key is called once per label. Passing `len`
# groups the city columns by the length of their name ('杭州3' has three
# characters, the others have two). Grouping along axis=1 is deprecated,
# so the rows of the transposed frame are grouped and the result is
# transposed back.
print(people.T.groupby(len).sum().T)
#       2     3
# a  30.0  20.0
# b  23.0  21.0
# c  26.0  22.0
# d  42.0  23.0
# e  46.0  24.0

# Functions can be mixed with arrays, dicts, lists and Series as keys.
key_list = ['one', 'one', 'one', 'two', 'two']
print(people.T.groupby([len, key_list]).min().T)
#      2           3
#    one   two   two
# a  0.0  15.0  20.0
# b  1.0  16.0  21.0
# c  2.0  17.0  22.0
# d  3.0  18.0  23.0
# e  4.0  19.0  24.0
# --- Grouping by index level ---
# Two-level column index: country ('cty') on top, tenor underneath.
columns = pd.MultiIndex.from_arrays(
    [['US', 'US', 'US', 'JP', 'JP'], [1, 3, 5, 1, 3]],
    names=['cty', 'tenor'],
)
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
print(hier_df)
# cty          US                            JP
# tenor         1         3         5         1         3
# 0     -1.507729  2.112678  0.841736 -0.158109 -0.645219
# 1      0.355262  0.765209 -0.287648  1.134998 -0.440188
# 2      1.049813  0.763482 -0.362013 -0.428725 -0.355601
# 3     -0.868420 -1.213398 -0.386798  0.137273  0.678293
# (values are random and differ on every run)

# Group the columns by the 'cty' level and count the entries per row.
# Grouping along axis=1 is deprecated, so group the rows of the transposed
# frame by level and transpose the result back.
print(hier_df.T.groupby(level='cty').count().T)
# cty  JP  US
# 0     2   3
# 1     2   3
# 2     2   3
# 3     2   3
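# Python data analysis (22): the pandas groupby object
# Reposted from blog.csdn.net/qq_38788128/article/details/80827936
# (Remaining lines were navigation text from the source web page:
#  "You may also like", "Today's picks", "Weekly ranking".)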