分组对象与apply函数
- 函数apply既可用于分组对象,也可以作用于dataframe数据
- Groupby.apply(func)
- 需要注意axis=0和axis=1的区别
- np.sum, axis=0 相当于计算每列的总和
- np.sum, axis=1 相当于计算每行的总和
import pandas as pd
import numpy as np
import os
# Display the interpreter's current working directory.
os.getcwd()
'D:\\Jupyter\\notebook\\Python数据清洗实战\\数据清洗之数据统计'
# Work from the data folder so the CSV can be opened by its bare file name.
os.chdir('D:\\Jupyter\\notebook\\Python数据清洗实战\\数据')

# Read the two identifier columns as text instead of letting pandas infer
# a dtype for them; the file itself is GBK-encoded.
id_dtypes = dict.fromkeys(('customer', 'order'), str)
df = pd.read_csv('online_order.csv', dtype=id_dtypes, encoding='gbk')

# Preview the first five rows.
df.head(5)
| | customer | order | total_items | discount% | weekday | hour | Food% | Fresh% | Drinks% | Home% | Beauty% | Health% | Baby% | Pets% |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 45 | 23.03 | 4 | 13 | 9.46 | 87.06 | 3.48 | 0.00 | 0.00 | 0.00 | 0.0 | 0.0 |
| 1 | 0 | 1 | 38 | 1.22 | 5 | 13 | 15.87 | 75.80 | 6.22 | 2.12 | 0.00 | 0.00 | 0.0 | 0.0 |
| 2 | 0 | 2 | 51 | 18.08 | 4 | 13 | 16.88 | 56.75 | 3.37 | 16.48 | 6.53 | 0.00 | 0.0 | 0.0 |
| 3 | 1 | 3 | 57 | 16.51 | 1 | 12 | 28.81 | 35.99 | 11.78 | 4.62 | 2.87 | 15.92 | 0.0 | 0.0 |
| 4 | 1 | 4 | 53 | 18.31 | 2 | 11 | 24.13 | 60.38 | 7.78 | 7.72 | 0.00 | 0.00 | 0.0 | 0.0 |
# Split the orders into one group per weekday.
grouped = df.groupby('weekday')

# Select the columns of interest *before* applying the function, so np.mean
# only ever sees numeric data (`customer` and `order` are loaded as strings).
# axis=0 is passed through to np.mean so each group is reduced to one mean
# per column; newer pandas releases treat an axis-less mean of a DataFrame
# as a reduction over both axes, which would break a column selection made
# after the apply. Selecting `weekday` explicitly keeps the grouping column
# in the result alongside the weekday index.
grouped[['total_items', 'discount%', 'weekday']].apply(np.mean, axis=0)
| weekday | total_items | discount% | weekday |
|---|---|---|---|
| 1 | 30.662177 | 8.580705 | 1.0 |
| 2 | 31.868612 | 8.638014 | 2.0 |
| 3 | 31.869796 | 7.794507 | 3.0 |
| 4 | 32.251899 | 8.068155 | 4.0 |
| 5 | 31.406619 | 9.159031 | 5.0 |
| 6 | 32.154814 | 8.414258 | 6.0 |
| 7 | 32.373837 | 8.710171 | 7.0 |
# List every column label of the loaded table.
df.columns
Index(['customer', 'order', 'total_items', 'discount%', 'weekday', 'hour',
'Food%', 'Fresh%', 'Drinks%', 'Home%', 'Beauty%', 'Health%', 'Baby%',
'Pets%'],
dtype='object')
# The eight category percentage columns, in the order they appear in df.
var = [
    'Food%',
    'Fresh%',
    'Drinks%',
    'Home%',
    'Beauty%',
    'Health%',
    'Baby%',
    'Pets%',
]

# Preview the first five rows restricted to those columns.
df.loc[:, var].head(5)
| | Food% | Fresh% | Drinks% | Home% | Beauty% | Health% | Baby% | Pets% |
|---|---|---|---|---|---|---|---|---|
| 0 | 9.46 | 87.06 | 3.48 | 0.00 | 0.00 | 0.00 | 0.0 | 0.0 |
| 1 | 15.87 | 75.80 | 6.22 | 2.12 | 0.00 | 0.00 | 0.0 | 0.0 |
| 2 | 16.88 | 56.75 | 3.37 | 16.48 | 6.53 | 0.00 | 0.0 | 0.0 |
| 3 | 28.81 | 35.99 | 11.78 | 4.62 | 2.87 | 15.92 | 0.0 | 0.0 |
| 4 | 24.13 | 60.38 | 7.78 | 7.72 | 0.00 | 0.00 | 0.0 | 0.0 |
# axis=0: np.sum is handed one column at a time, so the result holds one
# total per selected column.
df.loc[:, var].apply(np.sum, axis=0)
Food% 706812.19
Fresh% 606818.38
Drinks% 700477.06
Home% 406187.25
Beauty% 176788.48
Health% 33988.76
Baby% 332884.34
Pets% 31292.61
dtype: float64
# axis=1: np.sum is handed one row at a time, so the result holds one
# total per row; only the first five totals are shown.
row_totals = df.loc[:, var].apply(np.sum, axis=1)
row_totals.head(5)
0 100.00
1 100.01
2 100.01
3 99.99
4 100.01
dtype: float64
# Row-wise custom function: for every row, subtract the second selected
# column from the first. With axis=1 each `x` is one row as a Series whose
# index holds the column labels, so positions are read with .iloc; a plain
# integer key such as x[0] on a label-indexed Series is deprecated in newer
# pandas releases.
df[var].apply(lambda x: x.iloc[0] - x.iloc[1], axis=1).head(5)
0 -77.60
1 -59.93
2 -39.87
3 -7.18
4 -36.25
dtype: float64