import pandas as pd
import numpy as np
# A small Series of repeated string labels, used to demonstrate
# unique-value extraction and frequency counting.
values = pd.Series(['apple', 'orange', 'apple', 'apple'] * 2)
values
0 apple
1 orange
2 apple
3 apple
4 apple
5 orange
6 apple
7 apple
dtype: object
# Distinct labels, in order of first appearance.
pd.unique(values)
array(['apple', 'orange'], dtype=object)
# Frequency of each label. The Series method is used because the
# top-level pd.value_counts function is deprecated in recent pandas.
values.value_counts()
apple 6
orange 2
dtype: int64
# Dimension-table representation: `values` holds integer keys and
# `dim` holds the distinct labels those keys refer to.
values = pd.Series([0, 1, 0, 0] * 2)
dim = pd.Series(['apple', 'orange'])
values
0 0
1 1
2 0
3 0
4 0
5 1
6 0
7 0
dtype: int64
# Display the lookup table of distinct labels.
dim
0 apple
1 orange
dtype: object
# Rebuild the original label sequence by looking up each integer key
# in the table of distinct labels.
dim.take(values)
0 apple
1 orange
0 apple
0 apple
0 apple
1 orange
0 apple
0 apple
dtype: object
# Example table of fruit baskets: one row per basket, with a random
# integer count and a random float weight.
fruits = ['apple', 'orange', 'apple', 'apple'] * 2
N = len(fruits)
df = pd.DataFrame(
    {
        'fruits': fruits,
        'basket_id': np.arange(N),
        'count': np.random.randint(3, 15, size=N),     # integers in [3, 15)
        'weight': np.random.uniform(0, 4, size=N),     # floats in [0, 4)
    },
    # Explicit column order for display.
    columns=['basket_id', 'fruits', 'count', 'weight'],
)
df
basket_id
fruits
count
weight
0
0
apple
10
2.679414
1
1
orange
8
2.278047
2
2
apple
9
0.087745
3
3
apple
6
2.028924
4
4
apple
11
1.704697
5
5
orange
6
1.352336
6
6
apple
11
2.940028
7
7
apple
4
2.798046
# Convert the string column into a categorical Series.
fruit_cat = df['fruits'].astype('category')
fruit_cat
0 apple
1 orange
2 apple
3 apple
4 apple
5 orange
6 apple
7 apple
Name: fruits, dtype: category
Categories (2, object): [apple, orange]
# Pull out the array object backing the categorical Series and
# inspect its type.
c = fruit_cat.values
type(c)
pandas.core.arrays.categorical.Categorical
# The distinct category labels.
c.categories
Index(['apple', 'orange'], dtype='object')
# The integer code of each element (its position in c.categories).
c.codes
array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)
# Replace the DataFrame column itself with its categorical version.
df['fruits'] = df['fruits'].astype('category')
df.fruits
0 apple
1 orange
2 apple
3 apple
4 apple
5 orange
6 apple
7 apple
Name: fruits, dtype: category
Categories (2, object): [apple, orange]
# Build a Categorical directly from a Python sequence of labels.
my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])
my_categories
[foo, bar, baz, foo, bar]
Categories (3, object): [bar, baz, foo]
# Build a Categorical from already-encoded data: each entry of `codes`
# is a position in `categories`.
categories = ['foo', 'bar', 'baz']
codes = [0, 1, 2, 0, 0, 1]
my_cats_2 = pd.Categorical.from_codes(codes, categories)
my_cats_2
[foo, bar, baz, foo, foo, bar]
Categories (3, object): [foo, bar, baz]
# 1000 random draws from np.random.randn, then a peek at the first five.
draws = np.random.randn(1000)
draws[:5]
array([ 1.41984629, 0.25818437, -0.78979829, 0.69114415, 0.58610681])
# Split the draws into four quantile-based bins.
bins = pd.qcut(draws, q=4)
bins
[(0.714, 3.115], (0.0138, 0.714], (-2.7239999999999998, -0.658], (0.0138, 0.714], (0.0138, 0.714], ..., (-2.7239999999999998, -0.658], (0.714, 3.115], (0.0138, 0.714], (0.0138, 0.714], (0.0138, 0.714]]
Length: 1000
Categories (4, interval[float64]): [(-2.7239999999999998, -0.658] < (-0.658, 0.0138] < (0.0138, 0.714] < (0.714, 3.115]]
# Same four-way quantile split, with readable names for the bins.
bins = pd.qcut(draws, q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
bins
[Q4, Q3, Q1, Q3, Q3, ..., Q1, Q4, Q3, Q3, Q3]
Length: 1000
Categories (4, object): [Q1 < Q2 < Q3 < Q4]
# Wrap the bin labels in a named Series so the group key shows up as a
# 'quartile' column, then summarise the draws within each bin.
bins = pd.Series(bins, name='quartile')
results = (
    pd.Series(draws)
    .groupby(bins)
    .agg(['count', 'min', 'max'])
    .reset_index()
)
results
quartile
count
min
max
0
Q1
250
-2.722817
-0.669126
1
Q2
250
-0.654161
0.011138
2
Q3
250
0.016389
0.713528
3
Q4
250
0.714217
3.115205
# Set up a larger example for comparing memory use: the same 100000
# labels stored as plain strings and as a categorical.
N = 100000
draws = pd.Series(np.random.randn(N))
labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4))
categories = labels.astype('category')
# Memory footprint of the plain string labels.
labels.memory_usage()
800080
# Memory footprint of the same labels stored as a categorical.
categories.memory_usage()
100272
# A short string Series and its categorical counterpart, used to
# demonstrate the `.cat` accessor.
s = pd.Series(['a', 'b', 'c', 'd'] * 2)
cat_s = s.astype('category')
cat_s
0 a
1 b
2 c
3 d
4 a
5 b
6 c
7 d
dtype: category
Categories (4, object): [a, b, c, d]
# Integer codes of the categorical Series, via the `.cat` accessor.
cat_s.cat.codes
0 0
1 1
2 2
3 3
4 0
5 1
6 2
7 3
dtype: int8
# Distinct category labels of the Series, via the `.cat` accessor.
cat_s.cat.categories
Index(['a', 'b', 'c', 'd'], dtype='object')
# Declare the full set of categories, including 'e', which does not
# occur in the data.
actual_categories = ['a', 'b', 'c', 'd', 'e']
cat_s2 = cat_s.cat.set_categories(actual_categories)
cat_s2
0 a
1 b
2 c
3 d
4 a
5 b
6 c
7 d
dtype: category
Categories (5, object): [a, b, c, d, e]
# Count occurrences per category of the re-categorised Series.
cat_s2.value_counts()
d 2
c 2
b 2
a 2
e 0
dtype: int64
# Create the Series as categorical up front, then expand it into one
# indicator column per category.
cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2, dtype='category')
pd.get_dummies(cat_s)
a
b
c
d
0
1
0
0
0
1
0
1
0
0
2
0
0
1
0
3
0
0
0
1
4
1
0
0
0
5
0
1
0
0
6
0
0
1
0
7
0
0
0
1
# Example frame for group transforms: three keys cycling over twelve
# float values 0.0 .. 11.0.
# The float literal must be written `12.` with no space before the dot;
# `12 .` is a syntax error.
df = pd.DataFrame({'key': ['a', 'b', 'c'] * 4,
                   'value': np.arange(12.)})
df
key
value
0
a
0.0
1
b
1.0
2
c
2.0
3
a
3.0
4
b
4.0
5
c
5.0
6
a
6.0
7
b
7.0
8
c
8.0
9
a
9.0
10
b
10.0
11
c
11.0
# Group the 'value' column by 'key' and compute the mean of each group.
g = df.groupby('key')['value']
g.mean()
key
a 4.5
b 5.5
c 6.5
Name: value, dtype: float64
# Replace every value with the mean of its own group.
g.transform(lambda grp: grp.mean())
0 4.5
1 5.5
2 6.5
3 4.5
4 5.5
5 6.5
6 4.5
7 5.5
8 6.5
9 4.5
10 5.5
11 6.5
Name: value, dtype: float64
# Same per-group mean, naming the aggregation by string instead of
# passing a function.
g.transform('mean')
0 4.5
1 5.5
2 6.5
3 4.5
4 5.5
5 6.5
6 4.5
7 5.5
8 6.5
9 4.5
10 5.5
11 6.5
Name: value, dtype: float64
# A transform whose function returns a result as long as its group:
# double every value.
g.transform(lambda grp: grp * 2)
0 0.0
1 2.0
2 4.0
3 6.0
4 8.0
5 10.0
6 12.0
7 14.0
8 16.0
9 18.0
10 20.0
11 22.0
Name: value, dtype: float64
# Rank the values within each group, largest first.
g.transform(lambda grp: grp.rank(ascending=False))
0 4.0
1 4.0
2 4.0
3 3.0
4 3.0
5 3.0
6 2.0
7 2.0
8 2.0
9 1.0
10 1.0
11 1.0
Name: value, dtype: float64
# Small time-series frame: fifteen consecutive one-minute timestamps
# paired with the integers 0..14.
N = 15
times = pd.date_range('2017-05-20 00:00', freq='1min', periods=N)
df = pd.DataFrame({
    'time': times,
    'values': np.arange(N),
})
df
time
values
0
2017-05-20 00:00:00
0
1
2017-05-20 00:01:00
1
2
2017-05-20 00:02:00
2
3
2017-05-20 00:03:00
3
4
2017-05-20 00:04:00
4
5
2017-05-20 00:05:00
5
6
2017-05-20 00:06:00
6
7
2017-05-20 00:07:00
7
8
2017-05-20 00:08:00
8
9
2017-05-20 00:09:00
9
10
2017-05-20 00:10:00
10
11
2017-05-20 00:11:00
11
12
2017-05-20 00:12:00
12
13
2017-05-20 00:13:00
13
14
2017-05-20 00:14:00
14
# Index by time, then count the rows that fall in each five-minute window.
(
    df.set_index('time')
    .resample('5min')
    .count()
)
values
time
2017-05-20 00:00:00
5
2017-05-20 00:05:00
5
2017-05-20 00:10:00
5
# Time-series frame with an extra grouping key: every timestamp is
# repeated three times, once for each of the keys 'a', 'b', 'c'.
# The float literal must be written `3.` with no space before the dot;
# `3 .` is a syntax error.
df2 = pd.DataFrame({'time': times.repeat(3),
                    'key': np.tile(['a', 'b', 'c'], N),
                    'value': np.arange(N * 3.)})
df2
time
key
value
0
2017-05-20 00:00:00
a
0.0
1
2017-05-20 00:00:00
b
1.0
2
2017-05-20 00:00:00
c
2.0
3
2017-05-20 00:01:00
a
3.0
4
2017-05-20 00:01:00
b
4.0
5
2017-05-20 00:01:00
c
5.0
6
2017-05-20 00:02:00
a
6.0
7
2017-05-20 00:02:00
b
7.0
8
2017-05-20 00:02:00
c
8.0
9
2017-05-20 00:03:00
a
9.0
10
2017-05-20 00:03:00
b
10.0
11
2017-05-20 00:03:00
c
11.0
12
2017-05-20 00:04:00
a
12.0
13
2017-05-20 00:04:00
b
13.0
14
2017-05-20 00:04:00
c
14.0
15
2017-05-20 00:05:00
a
15.0
16
2017-05-20 00:05:00
b
16.0
17
2017-05-20 00:05:00
c
17.0
18
2017-05-20 00:06:00
a
18.0
19
2017-05-20 00:06:00
b
19.0
20
2017-05-20 00:06:00
c
20.0
21
2017-05-20 00:07:00
a
21.0
22
2017-05-20 00:07:00
b
22.0
23
2017-05-20 00:07:00
c
23.0
24
2017-05-20 00:08:00
a
24.0
25
2017-05-20 00:08:00
b
25.0
26
2017-05-20 00:08:00
c
26.0
27
2017-05-20 00:09:00
a
27.0
28
2017-05-20 00:09:00
b
28.0
29
2017-05-20 00:09:00
c
29.0
30
2017-05-20 00:10:00
a
30.0
31
2017-05-20 00:10:00
b
31.0
32
2017-05-20 00:10:00
c
32.0
33
2017-05-20 00:11:00
a
33.0
34
2017-05-20 00:11:00
b
34.0
35
2017-05-20 00:11:00
c
35.0
36
2017-05-20 00:12:00
a
36.0
37
2017-05-20 00:12:00
b
37.0
38
2017-05-20 00:12:00
c
38.0
39
2017-05-20 00:13:00
a
39.0
40
2017-05-20 00:13:00
b
40.0
41
2017-05-20 00:13:00
c
41.0
42
2017-05-20 00:14:00
a
42.0
43
2017-05-20 00:14:00
b
43.0
44
2017-05-20 00:14:00
c
44.0
# Resample per key: group by 'key' and by five-minute time window at
# once, then sum within each (key, window) pair.
# pd.Grouper(freq=...) is used because pd.TimeGrouper is deprecated
# (it raises a FutureWarning) and is removed in later pandas releases.
time_key = pd.Grouper(freq='5min')
resampled = (df2.set_index('time')
             .groupby(['key', time_key])
             .sum())
resampled
C:\Anaconda\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: pd.TimeGrouper is deprecated and will be removed; Please use pd.Grouper(freq=...)
"""Entry point for launching an IPython kernel.
value
key
time
a
2017-05-20 00:00:00
30.0
2017-05-20 00:05:00
105.0
2017-05-20 00:10:00
180.0
b
2017-05-20 00:00:00
35.0
2017-05-20 00:05:00
110.0
2017-05-20 00:10:00
185.0
c
2017-05-20 00:00:00
40.0
2017-05-20 00:05:00
115.0
2017-05-20 00:10:00
190.0