pandas_learning
import numpy as np
import pandas as pd
一、生成对象
s = pd. Series( [ 1 , 3 , 5 , np. nan, 6 , 8 ] )
print ( s)
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
dates = pd. date_range( '20130101' , periods= 6 )
print ( dates)
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
df = pd. DataFrame( np. random. randn( 6 , 4 ) , index= dates, columns= list ( 'ABCD' ) )
print ( df)
A B C D
2013-01-01 -0.490432 1.341003 -0.350064 -1.128517
2013-01-02 2.097826 -0.184385 0.701016 0.110131
2013-01-03 0.626084 1.155184 -0.340739 1.792840
2013-01-04 0.418219 -0.551569 0.878323 -0.454765
2013-01-05 0.342240 -1.456387 1.389543 -1.012628
2013-01-06 -1.037013 -1.308806 -0.414125 2.011082
df2 = pd. DataFrame( {
'A' : 1. ,
'B' : pd. Timestamp( '20130102' ) ,
'C' : pd. Series( 1 , index= list ( range ( 4 ) ) , dtype= 'float32' ) ,
'D' : np. array( [ 3 ] * 4 , dtype= 'int32' ) ,
'E' : pd. Categorical( [ "test" , "train" , "test" , "train" ] ) ,
'F' : 'foo' } )
print ( df2)
print ( '\n' )
df2
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
A
B
C
D
E
F
0
1.0
2013-01-02
1.0
3
test
foo
1
1.0
2013-01-02
1.0
3
train
foo
2
1.0
2013-01-02
1.0
3
test
foo
3
1.0
2013-01-02
1.0
3
train
foo
df2. dtypes
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
二、查看数据
df. head( )
df. tail( 3 )
A
B
C
D
2013-01-04
0.418219
-0.551569
0.878323
-0.454765
2013-01-05
0.342240
-1.456387
1.389543
-1.012628
2013-01-06
-1.037013
-1.308806
-0.414125
2.011082
df. index
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
df. columns
Index(['A', 'B', 'C', 'D'], dtype='object')
DataFrame.to_numpy() 输出底层数据的 NumPy 对象。注意,DataFrame 的列由多种数据类型组成时,该操作耗费系统资源较大,这也是 Pandas 和 NumPy 的本质区别:NumPy 数组只有一种数据类型,DataFrame 每列的数据类型各不相同。调用 DataFrame.to_numpy() 时,Pandas 查找支持 DataFrame 里所有数据类型的 NumPy 数据类型。还有一种数据类型是 object,可以把 DataFrame 列里的值强制转换为 Python 对象。
df. to_numpy( )
array([[-0.49043231, 1.34100341, -0.35006351, -1.12851705],
[ 2.09782571, -0.18438459, 0.70101634, 0.11013148],
[ 0.62608362, 1.15518436, -0.34073901, 1.79283954],
[ 0.41821905, -0.5515692 , 0.8783226 , -0.45476542],
[ 0.34224006, -1.4563874 , 1.38954313, -1.01262842],
[-1.03701303, -1.30880558, -0.41412501, 2.01108196]])
df. describe( )
A
B
C
D
count
6.000000
6.000000
6.000000
6.000000
mean
0.326154
-0.167493
0.310659
0.219690
std
1.073550
1.194712
0.777791
1.377465
min
-1.037013
-1.456387
-0.414125
-1.128517
25%
-0.282264
-1.119496
-0.347732
-0.873163
50%
0.380230
-0.367977
0.180139
-0.172317
75%
0.574117
0.820292
0.833996
1.372163
max
2.097826
1.341003
1.389543
2.011082
df. T
2013-01-01
2013-01-02
2013-01-03
2013-01-04
2013-01-05
2013-01-06
A
-0.490432
2.097826
0.626084
0.418219
0.342240
-1.037013
B
1.341003
-0.184385
1.155184
-0.551569
-1.456387
-1.308806
C
-0.350064
0.701016
-0.340739
0.878323
1.389543
-0.414125
D
-1.128517
0.110131
1.792840
-0.454765
-1.012628
2.011082
df. sort_index( axis= 1 , ascending= False )
df. sort_index( axis= 1 , ascending = True )
A
B
C
D
2013-01-01
-0.490432
1.341003
-0.350064
-1.128517
2013-01-02
2.097826
-0.184385
0.701016
0.110131
2013-01-03
0.626084
1.155184
-0.340739
1.792840
2013-01-04
0.418219
-0.551569
0.878323
-0.454765
2013-01-05
0.342240
-1.456387
1.389543
-1.012628
2013-01-06
-1.037013
-1.308806
-0.414125
2.011082
df. sort_values( by= 'B' )
A
B
C
D
2013-01-05
0.342240
-1.456387
1.389543
-1.012628
2013-01-06
-1.037013
-1.308806
-0.414125
2.011082
2013-01-04
0.418219
-0.551569
0.878323
-0.454765
2013-01-02
2.097826
-0.184385
0.701016
0.110131
2013-01-03
0.626084
1.155184
-0.340739
1.792840
2013-01-01
-0.490432
1.341003
-0.350064
-1.128517
三、选择
选择、设置标准 Python / Numpy 的表达式已经非常直观,交互也很方便,但对于生产代码,我们还是推荐优化过的 Pandas 数据访问方法:.at、.iat、.loc 和 .iloc
(一)获取数据
df[ 'A' ]
df. A
2013-01-01 -0.490432
2013-01-02 2.097826
2013-01-03 0.626084
2013-01-04 0.418219
2013-01-05 0.342240
2013-01-06 -1.037013
Freq: D, Name: A, dtype: float64
print ( df[ 0 : 3 ] )
print ( '\n' )
print ( df[ '20130102' : '20130104' ] )
A B C D
2013-01-01 -0.490432 1.341003 -0.350064 -1.128517
2013-01-02 2.097826 -0.184385 0.701016 0.110131
2013-01-03 0.626084 1.155184 -0.340739 1.792840
A B C D
2013-01-02 2.097826 -0.184385 0.701016 0.110131
2013-01-03 0.626084 1.155184 -0.340739 1.792840
2013-01-04 0.418219 -0.551569 0.878323 -0.454765
(二)按照标签选择
df. loc[ dates[ 0 ] ]
A -0.490432
B 1.341003
C -0.350064
D -1.128517
Name: 2013-01-01 00:00:00, dtype: float64
df. loc[ : , [ 'A' , 'B' ] ]
A
B
2013-01-01
-0.490432
1.341003
2013-01-02
2.097826
-0.184385
2013-01-03
0.626084
1.155184
2013-01-04
0.418219
-0.551569
2013-01-05
0.342240
-1.456387
2013-01-06
-1.037013
-1.308806
df. loc[ '20130102' : '20130104' , [ 'A' , 'B' ] ]
A
B
2013-01-02
2.097826
-0.184385
2013-01-03
0.626084
1.155184
2013-01-04
0.418219
-0.551569
df. loc[ '20130102' , [ 'A' , 'B' ] ]
A 2.097826
B -0.184385
Name: 2013-01-02 00:00:00, dtype: float64
df. loc[ dates[ 0 ] , 'A' ]
df. loc[ '20130101' , 'A' ]
-0.490432312502826
(三)按位置选择
df. iloc[ 3 ]
A 0.418219
B -0.551569
C 0.878323
D -0.454765
Name: 2013-01-04 00:00:00, dtype: float64
df. iloc[ 3 : 5 , 0 : 2 ]
A
B
2013-01-04
0.418219
-0.551569
2013-01-05
0.342240
-1.456387
df. iloc[ [ 1 , 2 , 4 ] , [ 0 , 2 ] ]
A
C
2013-01-02
2.097826
0.701016
2013-01-03
0.626084
-0.340739
2013-01-05
0.342240
1.389543
df. iloc[ 1 : 3 , : ]
A
B
C
D
2013-01-02
2.097826
-0.184385
0.701016
0.110131
2013-01-03
0.626084
1.155184
-0.340739
1.792840
df. iloc[ : , 1 : 3 ]
B
C
2013-01-01
1.341003
-0.350064
2013-01-02
-0.184385
0.701016
2013-01-03
1.155184
-0.340739
2013-01-04
-0.551569
0.878323
2013-01-05
-1.456387
1.389543
2013-01-06
-1.308806
-0.414125
df. iloc[ 1 , 1 ]
df. iat[ 1 , 1 ]
-0.18438458703585694
(四)布尔索引
df[ df. B > 0 ]
A
B
C
D
2013-01-01
-0.490432
1.341003
-0.350064
-1.128517
2013-01-03
0.626084
1.155184
-0.340739
1.792840
df[ df > 0 ]
A
B
C
D
2013-01-01
NaN
1.341003
NaN
NaN
2013-01-02
2.097826
NaN
0.701016
0.110131
2013-01-03
0.626084
1.155184
NaN
1.792840
2013-01-04
0.418219
NaN
0.878323
NaN
2013-01-05
0.342240
NaN
1.389543
NaN
2013-01-06
NaN
NaN
NaN
2.011082
df2 = df. copy( )
df2[ 'E' ] = [ 'one' , 'one' , 'two' , 'three' , 'four' , 'three' ]
df2[ 'time' ] = [ '1st' , '2nd' , '3rd' , '4th' , '5th' , '6th' ]
print ( df2)
df2[ df2[ 'E' ] . isin( [ 'two' , 'four' ] ) ]
A B C D E time
2013-01-01 -0.490432 1.341003 -0.350064 -1.128517 one 1st
2013-01-02 2.097826 -0.184385 0.701016 0.110131 one 2nd
2013-01-03 0.626084 1.155184 -0.340739 1.792840 two 3rd
2013-01-04 0.418219 -0.551569 0.878323 -0.454765 three 4th
2013-01-05 0.342240 -1.456387 1.389543 -1.012628 four 5th
2013-01-06 -1.037013 -1.308806 -0.414125 2.011082 three 6th
A
B
C
D
E
time
2013-01-03
0.626084
1.155184
-0.340739
1.792840
two
3rd
2013-01-05
0.342240
-1.456387
1.389543
-1.012628
four
5th
(五)赋值
s1 = pd. Series( [ 1 , 2 , 3 , 4 , 5 , 6 ] , index= pd. date_range( '20130102' , periods= 6 ) )
s1
2013-01-02 1
2013-01-03 2
2013-01-04 3
2013-01-05 4
2013-01-06 5
2013-01-07 6
Freq: D, dtype: int64
df. at[ dates[ 0 ] , 'A' ] = 0
df
A
B
C
D
2013-01-01
0.000000
1.341003
-0.350064
-1.128517
2013-01-02
2.097826
-0.184385
0.701016
0.110131
2013-01-03
0.626084
1.155184
-0.340739
1.792840
2013-01-04
0.418219
-0.551569
0.878323
-0.454765
2013-01-05
0.342240
-1.456387
1.389543
-1.012628
2013-01-06
-1.037013
-1.308806
-0.414125
2.011082
df. iat[ 0 , 1 ] = 0
df
A
B
C
D
2013-01-01
0.000000
0.000000
-0.350064
-1.128517
2013-01-02
2.097826
-0.184385
0.701016
0.110131
2013-01-03
0.626084
1.155184
-0.340739
1.792840
2013-01-04
0.418219
-0.551569
0.878323
-0.454765
2013-01-05
0.342240
-1.456387
1.389543
-1.012628
2013-01-06
-1.037013
-1.308806
-0.414125
2.011082
df. loc[ : , 'D' ] = np. array( [ 5 ] * len ( df) )
df
C:\Users\Allen\AppData\Local\Temp\ipykernel_13872\531792459.py:2: DeprecationWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
df.loc[:, 'D'] = np.array([5] * len(df))
A
B
C
D
2013-01-01
0.000000
0.000000
-0.350064
5
2013-01-02
2.097826
-0.184385
0.701016
5
2013-01-03
0.626084
1.155184
-0.340739
5
2013-01-04
0.418219
-0.551569
0.878323
5
2013-01-05
0.342240
-1.456387
1.389543
5
2013-01-06
-1.037013
-1.308806
-0.414125
5
df2 = df. copy( )
print ( df2)
df2[ df2 > 0 ] = - df2
df2
A B C D
2013-01-01 0.000000 0.000000 -0.350064 5
2013-01-02 2.097826 -0.184385 0.701016 5
2013-01-03 0.626084 1.155184 -0.340739 5
2013-01-04 0.418219 -0.551569 0.878323 5
2013-01-05 0.342240 -1.456387 1.389543 5
2013-01-06 -1.037013 -1.308806 -0.414125 5
A
B
C
D
2013-01-01
0.000000
0.000000
-0.350064
-5
2013-01-02
-2.097826
-0.184385
-0.701016
-5
2013-01-03
-0.626084
-1.155184
-0.340739
-5
2013-01-04
-0.418219
-0.551569
-0.878323
-5
2013-01-05
-0.342240
-1.456387
-1.389543
-5
2013-01-06
-1.037013
-1.308806
-0.414125
-5
四、缺失值
Pandas 主要用 np.nan 表示缺失数据。 计算时,默认不包含空值。详见缺失数据。
df1 = df. reindex( index= dates[ 0 : 4 ] , columns= list ( df. columns) + [ 'E' ] )
df1. loc[ dates[ 0 ] : dates[ 1 ] , 'E' ] = 1
df1
A
B
C
D
E
2013-01-01
0.000000
0.000000
-0.350064
5
1.0
2013-01-02
2.097826
-0.184385
0.701016
5
1.0
2013-01-03
0.626084
1.155184
-0.340739
5
NaN
2013-01-04
0.418219
-0.551569
0.878323
5
NaN
df1. dropna( how= 'any' )
A
B
C
D
E
2013-01-01
0.000000
0.000000
-0.350064
5
1.0
2013-01-02
2.097826
-0.184385
0.701016
5
1.0
df1. fillna( value= 5 )
A
B
C
D
E
2013-01-01
0.000000
0.000000
-0.350064
5
1.0
2013-01-02
2.097826
-0.184385
0.701016
5
1.0
2013-01-03
0.626084
1.155184
-0.340739
5
5.0
2013-01-04
0.418219
-0.551569
0.878323
5
5.0
pd. isna( df1)
A
B
C
D
E
2013-01-01
False
False
False
False
False
2013-01-02
False
False
False
False
False
2013-01-03
False
False
False
False
True
2013-01-04
False
False
False
False
True
五、运算
(一)统计
df. mean( )
A 0.407893
B -0.390994
C 0.310659
D 5.000000
dtype: float64
print ( df)
df. mean( 1 )
A B C D
2013-01-01 0.000000 0.000000 -0.350064 5
2013-01-02 2.097826 -0.184385 0.701016 5
2013-01-03 0.626084 1.155184 -0.340739 5
2013-01-04 0.418219 -0.551569 0.878323 5
2013-01-05 0.342240 -1.456387 1.389543 5
2013-01-06 -1.037013 -1.308806 -0.414125 5
2013-01-01 1.162484
2013-01-02 1.903614
2013-01-03 1.610132
2013-01-04 1.436243
2013-01-05 1.318849
2013-01-06 0.560014
Freq: D, dtype: float64
s = pd. Series( [ 1 , 3 , 5 , np. nan, 6 , 8 ] , index= dates) . shift( 2 )
s
2013-01-01 NaN
2013-01-02 NaN
2013-01-03 1.0
2013-01-04 3.0
2013-01-05 5.0
2013-01-06 NaN
Freq: D, dtype: float64
(二)Apply函数
df. apply ( np. cumsum)
A
B
C
D
2013-01-01
0.000000
0.000000
-0.350064
5
2013-01-02
2.097826
-0.184385
0.350953
10
2013-01-03
2.723909
0.970800
0.010214
15
2013-01-04
3.142128
0.419231
0.888536
20
2013-01-05
3.484368
-1.037157
2.278080
25
2013-01-06
2.447355
-2.345962
1.863955
30
(三)直方图
s = pd. Series( np. random. randint( 0 , 7 , size= 10 ) )
s
0 1
1 3
2 4
3 5
4 4
5 2
6 1
7 2
8 0
9 0
dtype: int32
s. value_counts( )
1 2
4 2
2 2
0 2
3 1
5 1
dtype: int64
(四)字符串方法
s = pd. Series( [ 'A' , 'B' , 'C' , 'Aaba' , 'Baca' , np. nan, 'CABA' , 'dog' , 'cat' ] )
s. str . lower( )
0 a
1 b
2 c
3 aaba
4 baca
5 NaN
6 caba
7 dog
8 cat
dtype: object
五、合并
(一)结合(concat)
Pandas 提供了多种将 Series、DataFrame 对象组合在一起的功能,用索引与关联代数功能的多种设置逻辑可执行连接(join)与合并(merge)操作。
df = pd. DataFrame( np. random. randn( 10 , 4 ) )
df
0
1
2
3
0
-1.920086
0.367632
2.497282
0.066091
1
-0.604439
1.692157
-0.805864
1.755445
2
-0.585767
0.946251
0.196929
1.496892
3
-0.200327
-0.574453
-0.255195
-0.317823
4
1.273022
0.874491
1.619386
1.956572
5
-1.045788
2.007894
0.113608
-0.576531
6
0.947761
1.959843
1.525449
-0.859118
7
0.753344
0.290895
-1.511461
1.335855
8
-2.731952
0.122681
-1.543743
-1.982785
9
0.095608
-2.362279
-0.386469
1.895013
pieces = [ df[ : 3 ] , df[ 3 : 7 ] , df[ 7 : ] ]
pieces
[ 0 1 2 3
0 -1.920086 0.367632 2.497282 0.066091
1 -0.604439 1.692157 -0.805864 1.755445
2 -0.585767 0.946251 0.196929 1.496892,
0 1 2 3
3 -0.200327 -0.574453 -0.255195 -0.317823
4 1.273022 0.874491 1.619386 1.956572
5 -1.045788 2.007894 0.113608 -0.576531
6 0.947761 1.959843 1.525449 -0.859118,
0 1 2 3
7 0.753344 0.290895 -1.511461 1.335855
8 -2.731952 0.122681 -1.543743 -1.982785
9 0.095608 -2.362279 -0.386469 1.895013]
pd. concat( pieces)
0
1
2
3
0
-1.920086
0.367632
2.497282
0.066091
1
-0.604439
1.692157
-0.805864
1.755445
2
-0.585767
0.946251
0.196929
1.496892
3
-0.200327
-0.574453
-0.255195
-0.317823
4
1.273022
0.874491
1.619386
1.956572
5
-1.045788
2.007894
0.113608
-0.576531
6
0.947761
1.959843
1.525449
-0.859118
7
0.753344
0.290895
-1.511461
1.335855
8
-2.731952
0.122681
-1.543743
-1.982785
9
0.095608
-2.362279
-0.386469
1.895013
(二)连接(join)
1.第一个例子
left = pd. DataFrame( {
'key' : [ 'foo' , 'foo' ] , 'lval' : [ 1 , 2 ] } )
left
right = pd. DataFrame( {
'key' : [ 'foo' , 'foo' ] , 'rval' : [ 4 , 5 ] } )
right
pd. merge( left, right, on= 'key' )
key
lval
rval
0
foo
1
4
1
foo
1
5
2
foo
2
4
3
foo
2
5
2.第二个例子
left = pd. DataFrame( {
'key' : [ 'foo' , 'bar' ] , 'lval' : [ 1 , 2 ] } )
left
right = pd. DataFrame( {
'key' : [ 'foo' , 'bar' ] , 'rval' : [ 4 , 5 ] } )
right
pd. merge( left, right, on= 'key' )
key
lval
rval
0
foo
1
4
1
bar
2
5
(三)结合(append)
df = pd. DataFrame( np. random. randn( 8 , 4 ) , columns= [ 'A' , 'B' , 'C' , 'D' ] )
df
A
B
C
D
0
1.977216
0.809690
-1.092985
1.094562
1
-0.537330
0.119068
-0.270291
0.583295
2
-0.899852
-0.013189
-0.701051
-0.075777
3
-0.676315
-0.388863
-0.317767
0.200017
4
2.046481
-0.059868
-0.043779
0.073719
5
0.553642
0.585511
-0.363557
1.353199
6
-0.038520
0.867621
1.283126
0.359301
7
1.487298
-1.967855
0.781338
-0.356828
s = df. iloc[ 3 ]
s
A -0.676315
B -0.388863
C -0.317767
D 0.200017
Name: 3, dtype: float64
df. append( s, ignore_index= True )
C:\Users\Allen\AppData\Local\Temp\ipykernel_13872\4011806271.py:1: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
df.append(s, ignore_index=True)
A
B
C
D
0
1.977216
0.809690
-1.092985
1.094562
1
-0.537330
0.119068
-0.270291
0.583295
2
-0.899852
-0.013189
-0.701051
-0.075777
3
-0.676315
-0.388863
-0.317767
0.200017
4
2.046481
-0.059868
-0.043779
0.073719
5
0.553642
0.585511
-0.363557
1.353199
6
-0.038520
0.867621
1.283126
0.359301
7
1.487298
-1.967855
0.781338
-0.356828
8
-0.676315
-0.388863
-0.317767
0.200017
六、分组(group)
df = pd. DataFrame( {
'A' : [ 'foo' , 'bar' , 'foo' , 'bar' , 'foo' , 'bar' , 'foo' , 'foo' ] ,
'B' : [ 'one' , 'one' , 'two' , 'three' , 'two' , 'two' , 'one' , 'three' ] ,
'C' : np. random. randn( 8 ) ,
'D' : np. random. randn( 8 ) } )
df
A
B
C
D
0
foo
one
0.110792
1.046919
1
bar
one
0.257225
-1.405743
2
foo
two
-1.029414
-0.631213
3
bar
three
-0.696101
-0.153206
4
foo
two
0.655852
0.016673
5
bar
two
0.454345
-0.839135
6
foo
one
-1.270142
-2.196781
7
foo
three
0.491435
0.502373
df. groupby( 'A' ) . sum ( )
C:\Users\Allen\AppData\Local\Temp\ipykernel_13872\408658545.py:2: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
df.groupby('A').sum()
C
D
A
bar
0.015469
-2.398085
foo
-1.041477
-1.262029
df. groupby( [ 'A' , 'B' ] ) . sum ( )
C
D
A
B
bar
one
0.257225
-1.405743
three
-0.696101
-0.153206
two
0.454345
-0.839135
foo
one
-1.159350
-1.149862
three
0.491435
0.502373
two
-0.373561
-0.614540
七、重塑
(一)堆叠
tuples = list ( zip ( * [ [ 'bar' , 'bar' , 'baz' , 'baz' ,
'foo' , 'foo' , 'qux' , 'qux' ] ,
[ 'one' , 'two' , 'one' , 'two' ,
'one' , 'two' , 'one' , 'two' ] ] ) )
tuples
[('bar', 'one'),
('bar', 'two'),
('baz', 'one'),
('baz', 'two'),
('foo', 'one'),
('foo', 'two'),
('qux', 'one'),
('qux', 'two')]
index = pd. MultiIndex. from_tuples( tuples, names= [ 'first' , 'second' ] )
df = pd. DataFrame( np. random. randn( 8 , 2 ) , index= index, columns= [ 'A' , 'B' ] )
df2 = df[ : 4 ]
df2
A
B
first
second
bar
one
0.293173
0.076621
two
0.504254
0.005360
baz
one
-0.520367
-0.118466
two
-0.280027
1.871664
stacked = df2. stack( )
stacked
first second
bar one A 0.293173
B 0.076621
two A 0.504254
B 0.005360
baz one A -0.520367
B -0.118466
two A -0.280027
B 1.871664
dtype: float64
压缩后的 DataFrame 或 Series 具有多层索引, stack() 的逆操作是 unstack(),默认为拆叠最后一层:
stacked. unstack( )
A
B
first
second
bar
one
0.293173
0.076621
two
0.504254
0.005360
baz
one
-0.520367
-0.118466
two
-0.280027
1.871664
stacked. unstack( 1 )
second
one
two
first
bar
A
0.293173
0.504254
B
0.076621
0.005360
baz
A
-0.520367
-0.280027
B
-0.118466
1.871664
八、数据透视表
df = pd. DataFrame( {
'A' : [ 'one' , 'one' , 'two' , 'three' ] * 3 ,
'B' : [ 'A' , 'B' , 'C' ] * 4 ,
'C' : [ 'foo' , 'foo' , 'foo' , 'bar' , 'bar' , 'bar' ] * 2 ,
'D' : np. random. randn( 12 ) ,
'E' : np. random. randn( 12 ) } )
df
A
B
C
D
E
0
one
A
foo
1.768843
-2.404768
1
one
B
foo
-1.296745
1.205640
2
two
C
foo
-0.374749
-0.545797
3
three
A
bar
-0.504192
1.104782
4
one
B
bar
-0.893779
-0.056505
5
one
C
bar
-0.671526
-1.454542
6
two
A
foo
1.234484
1.087237
7
three
B
foo
0.316961
-1.333247
8
one
C
foo
1.342511
-0.712946
9
one
A
bar
-0.263245
-0.278933
10
two
B
bar
-2.146075
-0.725574
11
three
C
bar
1.257256
1.293410
pd. pivot_table( df, values= 'D' , index= [ 'A' , 'B' ] , columns= [ 'C' ] )
C
bar
foo
A
B
one
A
-0.263245
1.768843
B
-0.893779
-1.296745
C
-0.671526
1.342511
three
A
-0.504192
NaN
B
NaN
0.316961
C
1.257256
NaN
two
A
NaN
1.234484
B
-2.146075
NaN
C
NaN
-0.374749
九、时间序列
Pandas 为频率转换时重采样提供了虽然简单易用,但强大高效的功能,如,将秒级的数据转换为 5 分钟为频率的数据。这种操作常见于财务应用程序,但又不仅限于此。详见时间序列
rng = pd. date_range( '1/1/2012' , periods= 100 , freq= 'S' )
ts = pd. Series( np. random. randint( 0 , 500 , len ( rng) ) , index= rng)
ts. resample( '5Min' ) . sum ( )
2012-01-01 24914
Freq: 5T, dtype: int32
rng = pd. date_range( '3/6/2012 00:00' , periods= 5 , freq= 'D' )
ts = pd. Series( np. random. randn( len ( rng) ) , rng)
ts
2012-03-06 -0.095941
2012-03-07 0.152061
2012-03-08 0.123898
2012-03-09 -0.291567
2012-03-10 -1.162841
Freq: D, dtype: float64
ts_utc = ts. tz_localize( 'UTC' )
ts_utc
2012-03-06 00:00:00+00:00 -0.095941
2012-03-07 00:00:00+00:00 0.152061
2012-03-08 00:00:00+00:00 0.123898
2012-03-09 00:00:00+00:00 -0.291567
2012-03-10 00:00:00+00:00 -1.162841
Freq: D, dtype: float64
ts_utc. tz_convert( 'US/Eastern' )
2012-03-05 19:00:00-05:00 -0.095941
2012-03-06 19:00:00-05:00 0.152061
2012-03-07 19:00:00-05:00 0.123898
2012-03-08 19:00:00-05:00 -0.291567
2012-03-09 19:00:00-05:00 -1.162841
Freq: D, dtype: float64
rng = pd. date_range( '1/1/2012' , periods= 5 , freq= 'M' )
ts = pd. Series( np. random. randn( len ( rng) ) , index= rng)
ts
2012-01-31 -0.688979
2012-02-29 -0.748789
2012-03-31 -0.703756
2012-04-30 -0.933271
2012-05-31 0.548897
Freq: M, dtype: float64
ps = ts. to_period( )
ps
2012-01 -0.688979
2012-02 -0.748789
2012-03 -0.703756
2012-04 -0.933271
2012-05 0.548897
Freq: M, dtype: float64
ps. to_timestamp( )
2012-01-01 -0.688979
2012-02-01 -0.748789
2012-03-01 -0.703756
2012-04-01 -0.933271
2012-05-01 0.548897
Freq: MS, dtype: float64
prng = pd. period_range( '1990Q1' , '2000Q4' , freq= 'Q-NOV' )
ts = pd. Series( np. random. randn( len ( prng) ) , prng)
ts. index = ( prng. asfreq( 'M' , 'e' ) + 1 ) . asfreq( 'H' , 's' ) + 9
ts. head( )
1990-03-01 09:00 -0.343270
1990-06-01 09:00 0.108602
1990-09-01 09:00 -1.729255
1990-12-01 09:00 2.244965
1991-03-01 09:00 1.186678
Freq: H, dtype: float64
十、类别型
Pandas 的 DataFrame 里可以包含类别数据。完整文档详见类别简介 和 API 文档。
df = pd. DataFrame( {
"id" : [ 1 , 2 , 3 , 4 , 5 , 6 ] ,
"raw_grade" : [ 'a' , 'b' , 'b' , 'a' , 'a' , 'e' ] } )
df[ "grade" ] = df[ "raw_grade" ] . astype( "category" )
df[ "grade" ]
0 a
1 b
2 b
3 a
4 a
5 e
Name: grade, dtype: category
Categories (3, object): ['a', 'b', 'e']
df[ "grade" ] . cat. categories = [ "very good" , "good" , "very bad" ]
C:\Users\Allen\AppData\Local\Temp\ipykernel_13872\861203465.py:2: FutureWarning: Setting categories in-place is deprecated and will raise in a future version. Use rename_categories instead.
df["grade"].cat.categories = ["very good", "good", "very bad"]
df[ "grade" ] = df[ "grade" ] . cat. set_categories( [ "very bad" , "bad" , "medium" ,
"good" , "very good" ] )
df[ "grade" ]
0 very good
1 good
2 good
3 very good
4 very good
5 very bad
Name: grade, dtype: category
Categories (5, object): ['very bad', 'bad', 'medium', 'good', 'very good']
df. sort_values( by= "grade" )
id
raw_grade
grade
5
6
e
very bad
1
2
b
good
2
3
b
good
0
1
a
very good
3
4
a
very good
4
5
a
very good
df. groupby( "grade" ) . size( )
grade
very bad 1
bad 0
medium 0
good 2
very good 3
dtype: int64
十一、可视化
ts = pd. Series( np. random. randn( 1000 ) ,
index= pd. date_range( '1/1/2000' , periods= 1000 ) )
ts = ts. cumsum( )
ts. plot( )
<AxesSubplot:>
df = pd. DataFrame( np. random. randn( 1000 , 4 ) , index= ts. index,
columns= [ 'A' , 'B' , 'C' , 'D' ] )
df = df. cumsum( )
df. plot( )
<AxesSubplot:>
十二、数据输入/输出
df. to_csv( 'foo.csv' )
pd. read_csv( 'foo.csv' )
Unnamed: 0
A
B
C
D
0
2000-01-01
-0.196217
-0.567679
-0.017399
-0.759675
1
2000-01-02
-0.992873
1.045825
0.751507
0.135319
2
2000-01-03
-1.139351
0.485977
1.263285
-0.954968
3
2000-01-04
-2.328024
0.945480
1.115053
-1.166374
4
2000-01-05
-4.682187
-1.138483
0.378853
-1.588296
...
...
...
...
...
...
995
2002-09-22
-22.127260
-56.337832
14.308220
-37.997555
996
2002-09-23
-22.683999
-56.727513
14.728768
-36.391624
997
2002-09-24
-24.029579
-55.935110
14.212392
-36.302242
998
2002-09-25
-25.639169
-54.762663
13.965459
-35.999471
999
2002-09-26
-26.532594
-54.212622
14.433300
-34.411454
1000 rows × 5 columns
df. to_excel( 'foo.xlsx' , sheet_name= 'Sheet1' )
pd. read_excel( 'foo.xlsx' , 'Sheet1' , index_col= None , na_values= [ 'NA' ] )
Unnamed: 0
A
B
C
D
0
2000-01-01
-0.196217
-0.567679
-0.017399
-0.759675
1
2000-01-02
-0.992873
1.045825
0.751507
0.135319
2
2000-01-03
-1.139351
0.485977
1.263285
-0.954968
3
2000-01-04
-2.328024
0.945480
1.115053
-1.166374
4
2000-01-05
-4.682187
-1.138483
0.378853
-1.588296
...
...
...
...
...
...
995
2002-09-22
-22.127260
-56.337832
14.308220
-37.997555
996
2002-09-23
-22.683999
-56.727513
14.728768
-36.391624
997
2002-09-24
-24.029579
-55.935110
14.212392
-36.302242
998
2002-09-25
-25.639169
-54.762663
13.965459
-35.999471
999
2002-09-26
-26.532594
-54.212622
14.433300
-34.411454
1000 rows × 5 columns