Advanced pandas applications

# Categorical data
import pandas as pd
import numpy as np
values = pd.Series(['apple','orange','apple','apple']*2)
values
0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object
pd.unique(values)
array(['apple', 'orange'], dtype=object)
pd.value_counts(values)
apple     6
orange    2
dtype: int64
values = pd.Series([0,1,0,0]*2)
dim = pd.Series(['apple','orange'])
values
0    0
1    1
2    0
3    0
4    0
5    1
6    0
7    0
dtype: int64
dim
0     apple
1    orange
dtype: object
dim.take(values)
0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object
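
The codes-plus-dictionary layout shown here is what pandas stores internally for its category dtype. As a preview of the constructor introduced further down, the same decode can be written directly with Categorical.from_codes (a minimal sketch):
pd.Categorical.from_codes(values, dim)  # yields the same eight fruit labels as dim.take(values)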
fruits = ['apple', 'orange', 'apple', 'apple'] * 2
N = len(fruits)
df = pd.DataFrame({'fruits':fruits,
                  'basket_id':np.arange(N),
                  'count':np.random.randint(3,15,size=N),
                  'weight':np.random.uniform(0,4,size=N)},
                  columns=['basket_id','fruits','count','weight'])
df
basket_id fruits count weight
0 0 apple 10 2.679414
1 1 orange 8 2.278047
2 2 apple 9 0.087745
3 3 apple 6 2.028924
4 4 apple 11 1.704697
5 5 orange 6 1.352336
6 6 apple 11 2.940028
7 7 apple 4 2.798046
fruit_cat = df['fruits'].astype('category')
fruit_cat
0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruits, dtype: category
Categories (2, object): [apple, orange]
c = fruit_cat.values  # the underlying Categorical object
type(c)
pandas.core.arrays.categorical.Categorical
c.categories
Index(['apple', 'orange'], dtype='object')
c.codes
array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)
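
If you need an explicit mapping from the integer codes back to the category labels, one quick sketch is to enumerate the categories attribute:
dict(enumerate(c.categories))  # {0: 'apple', 1: 'orange'}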
df['fruits'] = df['fruits'].astype('category')
df.fruits
0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruits, dtype: category
Categories (2, object): [apple, orange]
my_categories = pd.Categorical(['foo','bar','baz','foo','bar'])
my_categories
[foo, bar, baz, foo, bar]
Categories (3, object): [bar, baz, foo]
categories = ['foo','bar','baz']
codes = [0,1,2,0,0,1]
my_cats_2 = pd.Categorical.from_codes(codes,categories)
my_cats_2
[foo, bar, baz, foo, foo, bar]
Categories (3, object): [foo, bar, baz]
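
When the categories have a meaningful order, from_codes (and the Categorical constructor) accept ordered=True; an existing unordered categorical can also be converted afterwards. A brief sketch:
ordered_cat = pd.Categorical.from_codes(codes, categories, ordered=True)
ordered_cat
# [foo, bar, baz, foo, foo, bar]
# Categories (3, object): [foo < bar < baz]
my_cats_2.as_ordered()  # same data, now with an ordering on the categories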
# Computations with categoricals
draws = np.random.randn(1000)
draws[:5]
array([ 1.41984629,  0.25818437, -0.78979829,  0.69114415,  0.58610681])
bins = pd.qcut(draws,4)
bins
[(0.714, 3.115], (0.0138, 0.714], (-2.7239999999999998, -0.658], (0.0138, 0.714], (0.0138, 0.714], ..., (-2.7239999999999998, -0.658], (0.714, 3.115], (0.0138, 0.714], (0.0138, 0.714], (0.0138, 0.714]]
Length: 1000
Categories (4, interval[float64]): [(-2.7239999999999998, -0.658] < (-0.658, 0.0138] < (0.0138, 0.714] < (0.714, 3.115]]
bins = pd.qcut(draws,4,labels=['Q1','Q2','Q3','Q4'])
bins
[Q4, Q3, Q1, Q3, Q3, ..., Q1, Q4, Q3, Q3, Q3]
Length: 1000
Categories (4, object): [Q1 < Q2 < Q3 < Q4]
bins = pd.Series(bins,name='quartile')
results = (pd.Series(draws).groupby(bins).agg(['count','min','max']).reset_index())
results
quartile count min max
0 Q1 250 -2.722817 -0.669126
1 Q2 250 -0.654161 0.011138
2 Q3 250 0.016389 0.713528
3 Q4 250 0.714217 3.115205
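
The quartile column in results keeps the categorical information from bins, including the Q1 < Q2 < Q3 < Q4 ordering:
results['quartile']
# 0    Q1
# 1    Q2
# 2    Q3
# 3    Q4
# Name: quartile, dtype: category
# Categories (4, object): [Q1 < Q2 < Q3 < Q4]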
# Better performance with categoricals
N = 100000
draws = pd.Series(np.random.randn(N))
labels = pd.Series(['foo','bar','baz','qux']*(N//4))
categories = labels.astype('category')
labels.memory_usage()  # memory usage in bytes
800080
categories.memory_usage()
100272
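
The gap is even larger if the string contents themselves are counted; memory_usage(deep=True) includes the Python string objects rather than just the array of object pointers. A quick check (exact numbers will vary):
labels.memory_usage(deep=True)      # counts every repeated str object
categories.memory_usage(deep=True)  # each distinct string is stored only once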
# Categorical methods
s = pd.Series(['a','b','c','d']*2)
cat_s = s.astype('category')
cat_s
0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): [a, b, c, d]
cat_s.cat.codes
0    0
1    1
2    2
3    3
4    0
5    1
6    2
7    3
dtype: int8
cat_s.cat.categories
Index(['a', 'b', 'c', 'd'], dtype='object')
actual_categories = ['a','b','c','d','e']
cat_s2 = cat_s.cat.set_categories(actual_categories)
cat_s2
0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (5, object): [a, b, c, d, e]
cat_s2.value_counts()
d    2
c    2
b    2
a    2
e    0
dtype: int64
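
Going the other way, categories that are no longer used can be dropped after filtering; a short sketch with remove_unused_categories:
cat_s3 = cat_s[cat_s.isin(['a', 'b'])]   # subset that never contains 'c' or 'd'
cat_s3.cat.remove_unused_categories()    # categories shrink back to ['a', 'b']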
# Creating dummy variables for modeling
cat_s = pd.Series(['a','b','c','d']*2,dtype='category')
pd.get_dummies(cat_s)
a b c d
0 1 0 0 0
1 0 1 0 0
2 0 0 1 0
3 0 0 0 1
4 1 0 0 0
5 0 1 0 0
6 0 0 1 0
7 0 0 0 1
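
For regression-style models it is common to drop one dummy column so the remaining ones are not perfectly collinear; get_dummies supports this through the drop_first parameter (a minimal sketch):
pd.get_dummies(cat_s, drop_first=True)  # only columns b, c, d; 'a' becomes the implicit baseline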
# Advanced GroupBy use
# Grouped transforms and "unwrapped" GroupBys
df = pd.DataFrame({'key':['a','b','c']*4,
                  'value':np.arange(12.)})
df
key value
0 a 0.0
1 b 1.0
2 c 2.0
3 a 3.0
4 b 4.0
5 c 5.0
6 a 6.0
7 b 7.0
8 c 8.0
9 a 9.0
10 b 10.0
11 c 11.0
g = df.groupby('key').value
g.mean()
key
a    4.5
b    5.5
c    6.5
Name: value, dtype: float64
g.transform(lambda x:x.mean())
0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64
g.transform('mean')
0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64
g.transform(lambda x:x*2)
0      0.0
1      2.0
2      4.0
3      6.0
4      8.0
5     10.0
6     12.0
7     14.0
8     16.0
9     18.0
10    20.0
11    22.0
Name: value, dtype: float64
g.transform(lambda x:x.rank(ascending=False))
0     4.0
1     4.0
2     4.0
3     3.0
4     3.0
5     3.0
6     2.0
7     2.0
8     2.0
9     1.0
10    1.0
11    1.0
Name: value, dtype: float64
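
A typical use of these "unwrapped" group operations is group-wise standardization: because transform broadcasts each group statistic back to the original rows, the whole calculation stays vectorized and needs no apply. A short sketch:
normalized = (df['value'] - g.transform('mean')) / g.transform('std')
normalized.groupby(df['key']).agg(['mean', 'std'])  # each group comes out with mean ~0 and std 1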
# Grouped time resampling
N = 15
times = pd.date_range('2017-05-20 00:00',freq='1min',periods=N)
df = pd.DataFrame({'time':times,
                  'values':np.arange(N)})
df
time values
0 2017-05-20 00:00:00 0
1 2017-05-20 00:01:00 1
2 2017-05-20 00:02:00 2
3 2017-05-20 00:03:00 3
4 2017-05-20 00:04:00 4
5 2017-05-20 00:05:00 5
6 2017-05-20 00:06:00 6
7 2017-05-20 00:07:00 7
8 2017-05-20 00:08:00 8
9 2017-05-20 00:09:00 9
10 2017-05-20 00:10:00 10
11 2017-05-20 00:11:00 11
12 2017-05-20 00:12:00 12
13 2017-05-20 00:13:00 13
14 2017-05-20 00:14:00 14
df.set_index('time').resample('5min').count()
values
time
2017-05-20 00:00:00 5
2017-05-20 00:05:00 5
2017-05-20 00:10:00 5
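
The same 5-minute buckets work with any other aggregation; summing the values instead of counting them, for example:
df.set_index('time').resample('5min').sum()  # values per bucket: 10, 35, 60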
df2 = pd.DataFrame({'time': times.repeat(3),
                    'key': np.tile(['a', 'b', 'c'], N),
                    'value': np.arange(N * 3.)})
df2
time key value
0 2017-05-20 00:00:00 a 0.0
1 2017-05-20 00:00:00 b 1.0
2 2017-05-20 00:00:00 c 2.0
3 2017-05-20 00:01:00 a 3.0
4 2017-05-20 00:01:00 b 4.0
5 2017-05-20 00:01:00 c 5.0
6 2017-05-20 00:02:00 a 6.0
7 2017-05-20 00:02:00 b 7.0
8 2017-05-20 00:02:00 c 8.0
9 2017-05-20 00:03:00 a 9.0
10 2017-05-20 00:03:00 b 10.0
11 2017-05-20 00:03:00 c 11.0
12 2017-05-20 00:04:00 a 12.0
13 2017-05-20 00:04:00 b 13.0
14 2017-05-20 00:04:00 c 14.0
15 2017-05-20 00:05:00 a 15.0
16 2017-05-20 00:05:00 b 16.0
17 2017-05-20 00:05:00 c 17.0
18 2017-05-20 00:06:00 a 18.0
19 2017-05-20 00:06:00 b 19.0
20 2017-05-20 00:06:00 c 20.0
21 2017-05-20 00:07:00 a 21.0
22 2017-05-20 00:07:00 b 22.0
23 2017-05-20 00:07:00 c 23.0
24 2017-05-20 00:08:00 a 24.0
25 2017-05-20 00:08:00 b 25.0
26 2017-05-20 00:08:00 c 26.0
27 2017-05-20 00:09:00 a 27.0
28 2017-05-20 00:09:00 b 28.0
29 2017-05-20 00:09:00 c 29.0
30 2017-05-20 00:10:00 a 30.0
31 2017-05-20 00:10:00 b 31.0
32 2017-05-20 00:10:00 c 32.0
33 2017-05-20 00:11:00 a 33.0
34 2017-05-20 00:11:00 b 34.0
35 2017-05-20 00:11:00 c 35.0
36 2017-05-20 00:12:00 a 36.0
37 2017-05-20 00:12:00 b 37.0
38 2017-05-20 00:12:00 c 38.0
39 2017-05-20 00:13:00 a 39.0
40 2017-05-20 00:13:00 b 40.0
41 2017-05-20 00:13:00 c 41.0
42 2017-05-20 00:14:00 a 42.0
43 2017-05-20 00:14:00 b 43.0
44 2017-05-20 00:14:00 c 44.0
time_key = pd.Grouper(freq='5min')  # pd.TimeGrouper is deprecated; pd.Grouper replaces it
resampled = (df2.set_index('time').groupby(['key',time_key]).sum())
resampled
value
key time
a 2017-05-20 00:00:00 30.0
2017-05-20 00:05:00 105.0
2017-05-20 00:10:00 180.0
b 2017-05-20 00:00:00 35.0
2017-05-20 00:05:00 110.0
2017-05-20 00:10:00 185.0
c 2017-05-20 00:00:00 40.0
2017-05-20 00:05:00 115.0
2017-05-20 00:10:00 190.0
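
To get back to a flat table with key and time as ordinary columns, reset the index on the result:
resampled.reset_index()  # key / time / value as regular columns, nine rows in total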


Reposted from blog.csdn.net/DMU_lzq1996/article/details/83857020