Advanced pandas applications

# Categorical data
import pandas as pd
import numpy as np
values = pd.Series(['apple','orange','apple','apple']*2)
values
0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object
pd.unique(values)
array(['apple', 'orange'], dtype=object)
pd.value_counts(values)
apple     6
orange    2
dtype: int64
values = pd.Series([0,1,0,0]*2)
dim = pd.Series(['apple','orange'])
values
0    0
1    1
2    0
3    0
4    0
5    1
6    0
7    0
dtype: int64
dim
0     apple
1    orange
dtype: object
dim.take(values)
0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object
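
The codes-plus-dictionary layout shown here is what pandas stores internally for its category dtype. As a preview of the constructor introduced further down, the same decode can be written directly with Categorical.from_codes (a minimal sketch):
pd.Categorical.from_codes(values, dim)  # yields the same eight fruit labels as dim.take(values)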
fruits = ['apple', 'orange', 'apple', 'apple'] * 2
N = len(fruits)
df = pd.DataFrame({'fruits':fruits,
                  'basket_id':np.arange(N),
                  'count':np.random.randint(3,15,size=N),
                  'weight':np.random.uniform(0,4,size=N)},
                  columns=['basket_id','fruits','count','weight'])
df
basket_id fruits count weight
0 0 apple 10 2.679414
1 1 orange 8 2.278047
2 2 apple 9 0.087745
3 3 apple 6 2.028924
4 4 apple 11 1.704697
5 5 orange 6 1.352336
6 6 apple 11 2.940028
7 7 apple 4 2.798046
fruit_cat = df['fruits'].astype('category')
fruit_cat
0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruits, dtype: category
Categories (2, object): [apple, orange]
c = fruit_cat.values  # the underlying Categorical object
type(c)
pandas.core.arrays.categorical.Categorical
c.categories
Index(['apple', 'orange'], dtype='object')
c.codes
array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)
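
If you need an explicit mapping from the integer codes back to the category labels, one quick sketch is to enumerate the categories attribute:
dict(enumerate(c.categories))  # {0: 'apple', 1: 'orange'}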
df['fruits'] = df['fruits'].astype('category')
df.fruits
0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruits, dtype: category
Categories (2, object): [apple, orange]
my_categories = pd.Categorical(['foo','bar','baz','foo','bar'])
my_categories
[foo, bar, baz, foo, bar]
Categories (3, object): [bar, baz, foo]
categories = ['foo','bar','baz']
codes = [0,1,2,0,0,1]
my_cats_2 = pd.Categorical.from_codes(codes,categories)
my_cats_2
[foo, bar, baz, foo, foo, bar]
Categories (3, object): [foo, bar, baz]
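
When the categories have a meaningful order, from_codes (and the Categorical constructor) accept ordered=True; an existing unordered categorical can also be converted afterwards. A brief sketch:
ordered_cat = pd.Categorical.from_codes(codes, categories, ordered=True)
ordered_cat
# [foo, bar, baz, foo, foo, bar]
# Categories (3, object): [foo < bar < baz]
my_cats_2.as_ordered()  # same data, now with an ordering on the categories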
# Computations with categoricals
draws = np.random.randn(1000)
draws[:5]
array([ 1.41984629,  0.25818437, -0.78979829,  0.69114415,  0.58610681])
bins = pd.qcut(draws,4)
bins
[(0.714, 3.115], (0.0138, 0.714], (-2.7239999999999998, -0.658], (0.0138, 0.714], (0.0138, 0.714], ..., (-2.7239999999999998, -0.658], (0.714, 3.115], (0.0138, 0.714], (0.0138, 0.714], (0.0138, 0.714]]
Length: 1000
Categories (4, interval[float64]): [(-2.7239999999999998, -0.658] < (-0.658, 0.0138] < (0.0138, 0.714] < (0.714, 3.115]]
bins = pd.qcut(draws,4,labels=['Q1','Q2','Q3','Q4'])
bins
[Q4, Q3, Q1, Q3, Q3, ..., Q1, Q4, Q3, Q3, Q3]
Length: 1000
Categories (4, object): [Q1 < Q2 < Q3 < Q4]
bins = pd.Series(bins,name='quartile')
results = (pd.Series(draws).groupby(bins).agg(['count','min','max']).reset_index())
results
quartile count min max
0 Q1 250 -2.722817 -0.669126
1 Q2 250 -0.654161 0.011138
2 Q3 250 0.016389 0.713528
3 Q4 250 0.714217 3.115205
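
The quartile column in results keeps the categorical information from bins, including the Q1 < Q2 < Q3 < Q4 ordering:
results['quartile']
# 0    Q1
# 1    Q2
# 2    Q3
# 3    Q4
# Name: quartile, dtype: category
# Categories (4, object): [Q1 < Q2 < Q3 < Q4]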
# Better performance with categoricals
N = 100000
draws = pd.Series(np.random.randn(N))
labels = pd.Series(['foo','bar','baz','qux']*(N//4))
categories = labels.astype('category')
labels.memory_usage()  # memory usage in bytes
800080
categories.memory_usage()
100272
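
The gap is even larger if the string contents themselves are counted; memory_usage(deep=True) includes the Python string objects rather than just the array of object pointers. A quick check (exact numbers will vary):
labels.memory_usage(deep=True)      # counts every repeated str object
categories.memory_usage(deep=True)  # each distinct string is stored only once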
# Categorical methods
s = pd.Series(['a','b','c','d']*2)
cat_s = s.astype('category')
cat_s
0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): [a, b, c, d]
cat_s.cat.codes
0    0
1    1
2    2
3    3
4    0
5    1
6    2
7    3
dtype: int8
cat_s.cat.categories
Index(['a', 'b', 'c', 'd'], dtype='object')
actual_categories = ['a','b','c','d','e']
cat_s2 = cat_s.cat.set_categories(actual_categories)
cat_s2
0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (5, object): [a, b, c, d, e]
cat_s2.value_counts()
d    2
c    2
b    2
a    2
e    0
dtype: int64
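
Going the other way, categories that are no longer used can be dropped after filtering; a short sketch with remove_unused_categories:
cat_s3 = cat_s[cat_s.isin(['a', 'b'])]   # subset that never contains 'c' or 'd'
cat_s3.cat.remove_unused_categories()    # categories shrink back to ['a', 'b']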
# Creating dummy variables for modeling
cat_s = pd.Series(['a','b','c','d']*2,dtype='category')
pd.get_dummies(cat_s)
a b c d
0 1 0 0 0
1 0 1 0 0
2 0 0 1 0
3 0 0 0 1
4 1 0 0 0
5 0 1 0 0
6 0 0 1 0
7 0 0 0 1
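
For regression-style models it is common to drop one dummy column so the remaining ones are not perfectly collinear; get_dummies supports this through the drop_first parameter (a minimal sketch):
pd.get_dummies(cat_s, drop_first=True)  # only columns b, c, d; 'a' becomes the implicit baseline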
# Advanced GroupBy use
# Grouped transforms and "unwrapped" GroupBys
df = pd.DataFrame({'key':['a','b','c']*4,
                  'value':np.arange(12.)})
df
key value
0 a 0.0
1 b 1.0
2 c 2.0
3 a 3.0
4 b 4.0
5 c 5.0
6 a 6.0
7 b 7.0
8 c 8.0
9 a 9.0
10 b 10.0
11 c 11.0
g = df.groupby('key').value
g.mean()
key
a    4.5
b    5.5
c    6.5
Name: value, dtype: float64
g.transform(lambda x:x.mean())
0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64
g.transform('mean')
0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64
g.transform(lambda x:x*2)
0      0.0
1      2.0
2      4.0
3      6.0
4      8.0
5     10.0
6     12.0
7     14.0
8     16.0
9     18.0
10    20.0
11    22.0
Name: value, dtype: float64
g.transform(lambda x:x.rank(ascending=False))
0     4.0
1     4.0
2     4.0
3     3.0
4     3.0
5     3.0
6     2.0
7     2.0
8     2.0
9     1.0
10    1.0
11    1.0
Name: value, dtype: float64
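
A typical use of these "unwrapped" group operations is group-wise standardization: because transform broadcasts each group statistic back to the original rows, the whole calculation stays vectorized and needs no apply. A short sketch:
normalized = (df['value'] - g.transform('mean')) / g.transform('std')
normalized.groupby(df['key']).agg(['mean', 'std'])  # each group comes out with mean ~0 and std 1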
# Grouped time resampling
N = 15
times = pd.date_range('2017-05-20 00:00',freq='1min',periods=N)
df = pd.DataFrame({'time':times,
                  'values':np.arange(N)})
df
time values
0 2017-05-20 00:00:00 0
1 2017-05-20 00:01:00 1
2 2017-05-20 00:02:00 2
3 2017-05-20 00:03:00 3
4 2017-05-20 00:04:00 4
5 2017-05-20 00:05:00 5
6 2017-05-20 00:06:00 6
7 2017-05-20 00:07:00 7
8 2017-05-20 00:08:00 8
9 2017-05-20 00:09:00 9
10 2017-05-20 00:10:00 10
11 2017-05-20 00:11:00 11
12 2017-05-20 00:12:00 12
13 2017-05-20 00:13:00 13
14 2017-05-20 00:14:00 14
df.set_index('time').resample('5min').count()
values
time
2017-05-20 00:00:00 5
2017-05-20 00:05:00 5
2017-05-20 00:10:00 5
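
The same 5-minute buckets work with any other aggregation; summing the values instead of counting them, for example:
df.set_index('time').resample('5min').sum()  # values per bucket: 10, 35, 60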
df2 = pd.DataFrame({'time': times.repeat(3),
                    'key': np.tile(['a', 'b', 'c'], N),
                    'value': np.arange(N * 3.)})
df2
time key value
0 2017-05-20 00:00:00 a 0.0
1 2017-05-20 00:00:00 b 1.0
2 2017-05-20 00:00:00 c 2.0
3 2017-05-20 00:01:00 a 3.0
4 2017-05-20 00:01:00 b 4.0
5 2017-05-20 00:01:00 c 5.0
6 2017-05-20 00:02:00 a 6.0
7 2017-05-20 00:02:00 b 7.0
8 2017-05-20 00:02:00 c 8.0
9 2017-05-20 00:03:00 a 9.0
10 2017-05-20 00:03:00 b 10.0
11 2017-05-20 00:03:00 c 11.0
12 2017-05-20 00:04:00 a 12.0
13 2017-05-20 00:04:00 b 13.0
14 2017-05-20 00:04:00 c 14.0
15 2017-05-20 00:05:00 a 15.0
16 2017-05-20 00:05:00 b 16.0
17 2017-05-20 00:05:00 c 17.0
18 2017-05-20 00:06:00 a 18.0
19 2017-05-20 00:06:00 b 19.0
20 2017-05-20 00:06:00 c 20.0
21 2017-05-20 00:07:00 a 21.0
22 2017-05-20 00:07:00 b 22.0
23 2017-05-20 00:07:00 c 23.0
24 2017-05-20 00:08:00 a 24.0
25 2017-05-20 00:08:00 b 25.0
26 2017-05-20 00:08:00 c 26.0
27 2017-05-20 00:09:00 a 27.0
28 2017-05-20 00:09:00 b 28.0
29 2017-05-20 00:09:00 c 29.0
30 2017-05-20 00:10:00 a 30.0
31 2017-05-20 00:10:00 b 31.0
32 2017-05-20 00:10:00 c 32.0
33 2017-05-20 00:11:00 a 33.0
34 2017-05-20 00:11:00 b 34.0
35 2017-05-20 00:11:00 c 35.0
36 2017-05-20 00:12:00 a 36.0
37 2017-05-20 00:12:00 b 37.0
38 2017-05-20 00:12:00 c 38.0
39 2017-05-20 00:13:00 a 39.0
40 2017-05-20 00:13:00 b 40.0
41 2017-05-20 00:13:00 c 41.0
42 2017-05-20 00:14:00 a 42.0
43 2017-05-20 00:14:00 b 43.0
44 2017-05-20 00:14:00 c 44.0
time_key = pd.Grouper(freq='5min')  # pd.TimeGrouper is deprecated; pd.Grouper replaces it
resampled = (df2.set_index('time').groupby(['key',time_key]).sum())
resampled
value
key time
a 2017-05-20 00:00:00 30.0
2017-05-20 00:05:00 105.0
2017-05-20 00:10:00 180.0
b 2017-05-20 00:00:00 35.0
2017-05-20 00:05:00 110.0
2017-05-20 00:10:00 185.0
c 2017-05-20 00:00:00 40.0
2017-05-20 00:05:00 115.0
2017-05-20 00:10:00 190.0
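
To get back to a flat table with key and time as ordinary columns, reset the index on the result:
resampled.reset_index()  # key / time / value as regular columns, nine rows in total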


Reposted from blog.csdn.net/DMU_lzq1996/article/details/83857020