自我学习学习笔记:pandas索引拼接重塑 1、分层索引 2、联合与合并数据集 3、重塑和透视
1分层索引
import pandas as pd
import numpy as np
生成数据
data = pd. Series( np. random. randn( 9 ) , index= [ [ 'a' , 'a' , 'a' , 'b' , 'b' , 'c' , 'c' , 'd' , 'd' ] , [ 1 , 2 , 3 , 1 , 3 , 1 , 2 , 2 , 3 ] ] )
data
a 1 0.269045
2 -0.885298
3 0.158806
b 1 0.378847
3 0.301012
c 1 -1.255018
2 -1.453824
d 2 -0.088124
3 -1.746533
dtype: float64
data. index
MultiIndex([('a', 1),
('a', 2),
('a', 3),
('b', 1),
('b', 3),
('c', 1),
('c', 2),
('d', 2),
('d', 3)],
)
分层索引
data[ 'b' ]
1 0.378847
3 0.301012
dtype: float64
data[ 'b' : 'c' ]
b 1 0.378847
3 0.301012
c 1 -1.255018
2 -1.453824
dtype: float64
data. loc[ [ 'b' , 'c' ] ]
b 1 0.378847
3 0.301012
c 1 -1.255018
2 -1.453824
dtype: float64
在内部层级中进行选择
data. loc[ : , 2 ]
a -0.885298
c -1.453824
d -0.088124
dtype: float64
使用unstack将数据重新排列
data. unstack( )
1
2
3
a
0.269045
-0.885298
0.158806
b
0.378847
NaN
0.301012
c
-1.255018
-1.453824
NaN
d
NaN
-0.088124
-1.746533
unstack的反操作为stack
data. unstack( ) . stack( )
a 1 0.269045
2 -0.885298
3 0.158806
b 1 0.378847
3 0.301012
c 1 -1.255018
2 -1.453824
d 2 -0.088124
3 -1.746533
dtype: float64
分层索引
frame = pd. DataFrame( np. arange( 12 ) . reshape( ( 4 , 3 ) ) , index= [ [ 'a' , 'a' , 'b' , 'b' ] , [ 1 , 2 , 1 , 2 ] ] , columns= [ [ 'Ohio' , 'Ohio' , 'Colorado' ] , [ 'Green' , 'Red' , 'Gerrn' ] ] )
frame
Ohio
Colorado
Green
Red
Gerrn
a
1
0
1
2
2
3
4
5
b
1
6
7
8
2
9
10
11
添加索引名称
frame. index. names = [ 'key1' , 'key2' ]
frame. columns. names = [ 'state' , 'color' ]
frame
state
Ohio
Colorado
color
Green
Red
Gerrn
key1
key2
a
1
0
1
2
2
3
4
5
b
1
6
7
8
2
9
10
11
列索引
frame[ 'Ohio' ]
color
Green
Red
key1
key2
a
1
0
1
2
3
4
b
1
6
7
2
9
10
from pandas import MultiIndex
使用MultiIndex创造带有层级的列
MultiIndex. from_arrays( [ [ 'Ohio' , 'Ohio' , 'Colorado' ] , [ 'Green' , 'Red' , 'Gerrn' ] ] , names= [ 'state' , 'color' ] )
MultiIndex([( 'Ohio', 'Green'),
( 'Ohio', 'Red'),
('Colorado', 'Gerrn')],
names=['state', 'color'])
1.1重排序和层级排序
frame. swaplevel( 'key1' , 'key2' )
state
Ohio
Colorado
color
Green
Red
Gerrn
key2
key1
1
a
0
1
2
2
a
3
4
5
1
b
6
7
8
2
b
9
10
11
frame. sort_index( level= 1 )
state
Ohio
Colorado
color
Green
Red
Gerrn
key1
key2
a
1
0
1
2
b
1
6
7
8
a
2
3
4
5
b
2
9
10
11
frame. swaplevel( 0 , 1 ) . sort_index( level= 0 )
state
Ohio
Colorado
color
Green
Red
Gerrn
key2
key1
1
a
0
1
2
b
6
7
8
2
a
3
4
5
b
9
10
11
1.2按层级进行汇总统计
frame. sum ( level= 'key2' )
state
Ohio
Colorado
color
Green
Red
Gerrn
key2
1
6
8
10
2
12
14
16
frame. sum ( level= 'color' , axis= 1 )
color
Green
Red
Gerrn
key1
key2
a
1
0
1
2
2
3
4
5
b
1
6
7
8
2
9
10
11
1.3使用列进行索引
frame = pd. DataFrame( {
'a' : range ( 7 ) , 'b' : range ( 7 , 0 , - 1 ) , 'c' : [ 'one' , 'one' , 'one' , 'two' , 'two' , 'two' , 'two' ] , 'd' : [ 0 , 1 , 2 , 0 , 1 , 2 , 3 ] } )
frame
a
b
c
d
0
0
7
one
0
1
1
6
one
1
2
2
5
one
2
3
3
4
two
0
4
4
3
two
1
5
5
2
two
2
6
6
1
two
3
frame2 = frame. set_index( [ 'c' , 'd' ] )
frame2
a
b
c
d
one
0
0
7
1
1
6
2
2
5
two
0
3
4
1
4
3
2
5
2
3
6
1
frame. set_index( [ 'c' , 'd' ] , drop= False )
a
b
c
d
c
d
one
0
0
7
one
0
1
1
6
one
1
2
2
5
one
2
two
0
3
4
two
0
1
4
3
two
1
2
5
2
two
2
3
6
1
two
3
frame2. reset_index( )
c
d
a
b
0
one
0
0
7
1
one
1
1
6
2
one
2
2
5
3
two
0
3
4
4
two
1
4
3
5
two
2
5
2
6
two
3
6
1
2联合与合并数据集
2.1数据库分格的DateFrame连接
df1 = pd. DataFrame( {
'key' : [ 'b' , 'b' , 'a' , 'c' , 'a' , 'a' , 'b' ] , 'data1' : range ( 7 ) } )
df1
key
data1
0
b
0
1
b
1
2
a
2
3
c
3
4
a
4
5
a
5
6
b
6
df2 = pd. DataFrame( {
'key' : [ 'a' , 'b' , 'd' ] , 'data2' : range ( 3 ) } )
df2
key
data2
0
a
0
1
b
1
2
d
2
pd. merge( df1, df2)
key
data1
data2
0
b
0
1
1
b
1
1
2
b
6
1
3
a
2
0
4
a
4
0
5
a
5
0
pd. merge( df1, df2, on= 'key' )
key
data1
data2
0
b
0
1
1
b
1
1
2
b
6
1
3
a
2
0
4
a
4
0
5
a
5
0
df3 = pd. DataFrame( {
'lkey' : [ 'b' , 'b' , 'a' , 'c' , 'a' , 'a' , 'b' ] , 'data1' : range ( 7 ) } )
df3
lkey
data1
0
b
0
1
b
1
2
a
2
3
c
3
4
a
4
5
a
5
6
b
6
df4 = pd. DataFrame( {
'rkey' : [ 'a' , 'b' , 'd' ] , 'data2' : range ( 3 ) } )
df4
rkey
data2
0
a
0
1
b
1
2
d
2
pd. merge( df3, df4, left_on= 'lkey' , right_on= 'rkey' )
lkey
data1
rkey
data2
0
b
0
b
1
1
b
1
b
1
2
b
6
b
1
3
a
2
a
0
4
a
4
a
0
5
a
5
a
0
pd. merge( df1, df2, how= 'outer' )
key
data1
data2
0
b
0.0
1.0
1
b
1.0
1.0
2
b
6.0
1.0
3
a
2.0
0.0
4
a
4.0
0.0
5
a
5.0
0.0
6
c
3.0
NaN
7
d
NaN
2.0
df1 = pd. DataFrame( {
'key' : [ 'b' , 'b' , 'a' , 'c' , 'a' , 'b' ] , 'data1' : range ( 6 ) } )
df1
key
data1
0
b
0
1
b
1
2
a
2
3
c
3
4
a
4
5
b
5
df2 = pd. DataFrame( {
'key' : [ 'a' , 'b' , 'a' , 'b' , 'd' ] , 'data2' : range ( 5 ) } )
df2
key
data2
0
a
0
1
b
1
2
a
2
3
b
3
4
d
4
pd. merge( df1, df2, on= 'key' , how= 'left' )
key
data1
data2
0
b
0
1.0
1
b
0
3.0
2
b
1
1.0
3
b
1
3.0
4
a
2
0.0
5
a
2
2.0
6
c
3
NaN
7
a
4
0.0
8
a
4
2.0
9
b
5
1.0
10
b
5
3.0
pd. merge( df1, df2, how= 'inner' )
key
data1
data2
0
b
0
1
1
b
0
3
2
b
1
1
3
b
1
3
4
b
5
1
5
b
5
3
6
a
2
0
7
a
2
2
8
a
4
0
9
a
4
2
left = pd. DataFrame( {
'key1' : [ 'foo' , 'foo' , 'bar' ] , 'key2' : [ 'one' , 'two' , 'one' ] , 'lval' : [ 1 , 2 , 3 ] } )
left
key1
key2
lval
0
foo
one
1
1
foo
two
2
2
bar
one
3
right = pd. DataFrame( {
'key1' : [ 'foo' , 'foo' , 'bar' , 'bar' ] , 'key2' : [ 'one' , 'one' , 'one' , 'two' ] , 'rval' : [ 4 , 5 , 6 , 7 ] } )
right
key1
key2
rval
0
foo
one
4
1
foo
one
5
2
bar
one
6
3
bar
two
7
pd. merge( left, right, on= [ 'key1' , 'key2' ] , how= 'outer' )
key1
key2
lval
rval
0
foo
one
1.0
4.0
1
foo
one
1.0
5.0
2
foo
two
2.0
NaN
3
bar
one
3.0
6.0
4
bar
two
NaN
7.0
pd. merge( left, right, on= 'key1' )
key1
key2_x
lval
key2_y
rval
0
foo
one
1
one
4
1
foo
one
1
one
5
2
foo
two
2
one
4
3
foo
two
2
one
5
4
bar
one
3
one
6
5
bar
one
3
two
7
pd. merge( left, right, on= 'key1' , suffixes= ( '_left' , '_right' ) )
key1
key2_left
lval
key2_right
rval
0
foo
one
1
one
4
1
foo
one
1
one
5
2
foo
two
2
one
4
3
foo
two
2
one
5
4
bar
one
3
one
6
5
bar
one
3
two
7
2.2根据索引合并
left1 = pd. DataFrame( {
'key' : [ 'a' , 'b' , 'a' , 'a' , 'b' , 'c' ] , 'value' : range ( 6 ) } )
left1
key
value
0
a
0
1
b
1
2
a
2
3
a
3
4
b
4
5
c
5
right1 = pd. DataFrame( {
'group_val' : [ 3.5 , 7 ] } , index= [ 'a' , 'b' ] )
right1
pd. merge( left1, right1, left_on= 'key' , right_index= True )
key
value
group_val
0
a
0
3.5
2
a
2
3.5
3
a
3
3.5
1
b
1
7.0
4
b
4
7.0
pd. merge( left1, right1, left_on= 'key' , right_index= True , how= 'outer' )
key
value
group_val
0
a
0
3.5
2
a
2
3.5
3
a
3
3.5
1
b
1
7.0
4
b
4
7.0
5
c
5
NaN
lefth = pd. DataFrame( {
'key1' : [ 'Ohio' , 'Ohio' , 'Ohio' , 'Nevada' , 'Nevada' ] , 'key2' : [ 2000 , 2001 , 2002 , 2001 , 2002 ] , 'data' : np. arange( 5. ) } )
lefth
key1
key2
data
0
Ohio
2000
0.0
1
Ohio
2001
1.0
2
Ohio
2002
2.0
3
Nevada
2001
3.0
4
Nevada
2002
4.0
righth = pd. DataFrame( np. arange( 12 ) . reshape( ( 6 , 2 ) ) , index= [ [ 'Nevada' , 'Nevada' , 'Ohio' , 'Ohio' , 'Ohio' , 'Ohio' ] , [ 2001 , 2000 , 2000 , 2000 , 2001 , 2002 ] ] , columns= [ 'event1' , 'event2' ] )
righth
event1
event2
Nevada
2001
0
1
2000
2
3
Ohio
2000
4
5
2000
6
7
2001
8
9
2002
10
11
pd. merge( lefth, righth, left_on= [ 'key1' , 'key2' ] , right_index= True )
key1
key2
data
event1
event2
0
Ohio
2000
0.0
4
5
0
Ohio
2000
0.0
6
7
1
Ohio
2001
1.0
8
9
2
Ohio
2002
2.0
10
11
3
Nevada
2001
3.0
0
1
pd. merge( lefth, righth, left_on= [ 'key1' , 'key2' ] , right_index= True , how= 'outer' )
key1
key2
data
event1
event2
0
Ohio
2000
0.0
4.0
5.0
0
Ohio
2000
0.0
6.0
7.0
1
Ohio
2001
1.0
8.0
9.0
2
Ohio
2002
2.0
10.0
11.0
3
Nevada
2001
3.0
0.0
1.0
4
Nevada
2002
4.0
NaN
NaN
4
Nevada
2000
NaN
2.0
3.0
使用两边的索引进行合并
left2 = pd. DataFrame( [ [ 1. , 2. ] , [ 3. , 4. ] , [ 5. , 6. ] ] , index= [ 'a' , 'c' , 'e' ] , columns= [ 'Ohio' , 'Nevada' ] )
left2
Ohio
Nevada
a
1.0
2.0
c
3.0
4.0
e
5.0
6.0
right2 = pd. DataFrame( [ [ 7. , 8. ] , [ 9. , 10. ] , [ 11. , 12. ] , [ 13. , 14. ] ] , index= [ 'b' , 'c' , 'd' , 'e' ] , columns= [ 'Missouri' , 'Alabama' ] )
right2
Missouri
Alabama
b
7.0
8.0
c
9.0
10.0
d
11.0
12.0
e
13.0
14.0
pd. merge( left2, right2, how= 'outer' , left_index= True , right_index= True )
Ohio
Nevada
Missouri
Alabama
a
1.0
2.0
NaN
NaN
b
NaN
NaN
7.0
8.0
c
3.0
4.0
9.0
10.0
d
NaN
NaN
11.0
12.0
e
5.0
6.0
13.0
14.0
left2. join( right2, how= 'outer' )
Ohio
Nevada
Missouri
Alabama
a
1.0
2.0
NaN
NaN
b
NaN
NaN
7.0
8.0
c
3.0
4.0
9.0
10.0
d
NaN
NaN
11.0
12.0
e
5.0
6.0
13.0
14.0
left1. join( right1, on= 'key' )
key
value
group_val
0
a
0
3.5
1
b
1
7.0
2
a
2
3.5
3
a
3
3.5
4
b
4
7.0
5
c
5
NaN
another = pd. DataFrame( [ [ 7. , 8. ] , [ 9. , 10. ] , [ 11. , 12. ] , [ 16. , 17. ] ] , index= [ 'a' , 'c' , 'e' , 'f' ] , columns= [ 'New York' , 'Oregon' ] )
another
New York
Oregon
a
7.0
8.0
c
9.0
10.0
e
11.0
12.0
f
16.0
17.0
left2. join( [ right2, another] )
Ohio
Nevada
Missouri
Alabama
New York
Oregon
a
1.0
2.0
NaN
NaN
7.0
8.0
c
3.0
4.0
9.0
10.0
9.0
10.0
e
5.0
6.0
13.0
14.0
11.0
12.0
left2. join( [ right2, another] , how= 'outer' )
Ohio
Nevada
Missouri
Alabama
New York
Oregon
a
1.0
2.0
NaN
NaN
7.0
8.0
c
3.0
4.0
9.0
10.0
9.0
10.0
e
5.0
6.0
13.0
14.0
11.0
12.0
b
NaN
NaN
7.0
8.0
NaN
NaN
d
NaN
NaN
11.0
12.0
NaN
NaN
f
NaN
NaN
NaN
NaN
16.0
17.0
2.3沿轴向连接
arr = np. arange( 12 ) . reshape( ( 3 , 4 ) )
arr
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
np. concatenate( [ arr, arr] , axis= 1 )
array([[ 0, 1, 2, 3, 0, 1, 2, 3],
[ 4, 5, 6, 7, 4, 5, 6, 7],
[ 8, 9, 10, 11, 8, 9, 10, 11]])
s1 = pd. Series( [ 0 , 1 ] , index= [ 'a' , 'b' ] )
s1
a 0
b 1
dtype: int64
s2 = pd. Series( [ 2 , 3 , 4 ] , index= [ 'c' , 'd' , 'e' ] )
s2
c 2
d 3
e 4
dtype: int64
s3 = pd. Series( [ 5 , 6 ] , index= [ 'f' , 'g' ] )
s3
f 5
g 6
dtype: int64
pd. concat( [ s1, s2, s3] )
a 0
b 1
c 2
d 3
e 4
f 5
g 6
dtype: int64
pd. concat( [ s1, s2, s3] , axis= 1 )
0
1
2
a
0.0
NaN
NaN
b
1.0
NaN
NaN
c
NaN
2.0
NaN
d
NaN
3.0
NaN
e
NaN
4.0
NaN
f
NaN
NaN
5.0
g
NaN
NaN
6.0
s4 = pd. concat( [ s1, s3] )
s4
a 0
b 1
f 5
g 6
dtype: int64
pd. concat( [ s1, s4] , axis= 1 )
0
1
a
0.0
0
b
1.0
1
f
NaN
5
g
NaN
6
pd. concat( [ s1, s4] , axis= 1 , join= 'inner' )
result = pd. concat( [ s1, s1, s3] , keys= [ 'one' , 'two' , 'three' ] )
result
one a 0
b 1
two a 0
b 1
three f 5
g 6
dtype: int64
result. unstack( )
a
b
f
g
one
0.0
1.0
NaN
NaN
two
0.0
1.0
NaN
NaN
three
NaN
NaN
5.0
6.0
pd. concat( [ s1, s2, s3] , axis= 1 , keys= [ 'one' , 'two' , 'three' ] )
one
two
three
a
0.0
NaN
NaN
b
1.0
NaN
NaN
c
NaN
2.0
NaN
d
NaN
3.0
NaN
e
NaN
4.0
NaN
f
NaN
NaN
5.0
g
NaN
NaN
6.0
df1 = pd. DataFrame( np. arange( 6 ) . reshape( 3 , 2 ) , index= [ 'a' , 'b' , 'c' ] , columns= [ 'one' , 'two' ] )
df1
one
two
a
0
1
b
2
3
c
4
5
df2 = pd. DataFrame( 5 + np. arange( 4 ) . reshape( 2 , 2 ) , index= [ 'a' , 'c' ] , columns= [ 'three' , 'four' ] )
df2
pd. concat( [ df1, df2] , axis= 1 , keys= [ 'level1' , 'level2' ] )
level1
level2
one
two
three
four
a
0
1
5.0
6.0
b
2
3
NaN
NaN
c
4
5
7.0
8.0
pd. concat( {
'level1' : df1, 'level2' : df2} , axis= 1 )
level1
level2
one
two
three
four
a
0
1
5.0
6.0
b
2
3
NaN
NaN
c
4
5
7.0
8.0
pd. concat( [ df1, df2] , axis= 1 , keys= [ 'level1' , 'level2' ] , names= [ 'upper' , 'lower' ] )
upper
level1
level2
lower
one
two
three
four
a
0
1
5.0
6.0
b
2
3
NaN
NaN
c
4
5
7.0
8.0
df1 = pd. DataFrame( np. random. randn( 3 , 4 ) , columns= [ 'a' , 'b' , 'c' , 'd' ] )
df1
a
b
c
d
0
0.030306
-0.665932
-0.180251
-0.562281
1
-0.562836
-1.362955
-1.928334
-0.991793
2
-1.382119
0.695608
0.638588
0.812288
df2 = pd. DataFrame( np. random. randn( 2 , 3 ) , columns= [ 'b' , 'd' , 'a' ] )
df2
b
d
a
0
0.323228
-1.229234
2.685688
1
-1.968068
-1.996343
-1.134667
pd. concat( [ df1, df2] , ignore_index= True )
a
b
c
d
0
0.030306
-0.665932
-0.180251
-0.562281
1
-0.562836
-1.362955
-1.928334
-0.991793
2
-1.382119
0.695608
0.638588
0.812288
3
2.685688
0.323228
NaN
-1.229234
4
-1.134667
-1.968068
NaN
-1.996343
2.4联合重叠数据
a = pd. Series( [ np. nan, 2.5 , 0.0 , 3.5 , 4.5 , np. nan] , index= [ 'f' , 'e' , 'd' , 'c' , 'b' , 'a' ] )
a
f NaN
e 2.5
d 0.0
c 3.5
b 4.5
a NaN
dtype: float64
b = pd. Series( [ 0. , np. nan, 2. , np. nan, np. nan, 5. ] , index= [ 'a' , 'b' , 'c' , 'd' , 'e' , 'f' ] )
b
a 0.0
b NaN
c 2.0
d NaN
e NaN
f 5.0
dtype: float64
np. where( pd. isnull( a) , b, a)
array([0. , 2.5, 0. , 3.5, 4.5, 5. ])
b. combine_first( a)
a 0.0
b 4.5
c 2.0
d 0.0
e 2.5
f 5.0
dtype: float64
df1 = pd. DataFrame( {
'a' : [ 1. , np. nan, 5. , np. nan] , 'b' : [ np. nan, 2. , np. nan, 6. ] , 'c' : range ( 2 , 18 , 4 ) } )
df1
a
b
c
0
1.0
NaN
2
1
NaN
2.0
6
2
5.0
NaN
10
3
NaN
6.0
14
df2 = pd. DataFrame( {
'a' : [ 5. , 4. , np. nan, 3. , 7. ] , 'b' : [ np. nan, 3. , 4. , 6. , 8. ] } )
df2
a
b
0
5.0
NaN
1
4.0
3.0
2
NaN
4.0
3
3.0
6.0
4
7.0
8.0
df1. combine_first( df2)
a
b
c
0
1.0
NaN
2.0
1
4.0
2.0
6.0
2
5.0
4.0
10.0
3
3.0
6.0
14.0
4
7.0
8.0
NaN
3.重塑和透视
3.1使用多层索引进行重塑
data = pd. DataFrame( np. arange( 6 ) . reshape( ( 2 , 3 ) ) , index= pd. Index( [ 'Ohio' , 'Colorado' ] , name= 'state' ) , columns= pd. Index( [ 'one' , 'two' , 'three' ] , name= 'number' ) )
data
number
one
two
three
state
Ohio
0
1
2
Colorado
3
4
5
stack将列透视到行
result = data. stack( )
result
state number
Ohio one 0
two 1
three 2
Colorado one 3
two 4
three 5
dtype: int32
result. unstack( )
number
one
two
three
state
Ohio
0
1
2
Colorado
3
4
5
result. unstack( 0 )
state
Ohio
Colorado
number
one
0
3
two
1
4
three
2
5
result. unstack( 'state' )
state
Ohio
Colorado
number
one
0
3
two
1
4
three
2
5
s1 = pd. Series( [ 0 , 1 , 2 , 3 ] , index= [ 'a' , 'b' , 'c' , 'd' ] )
s1
a 0
b 1
c 2
d 3
dtype: int64
s2 = pd. Series( [ 4 , 5 , 6 ] , index= [ 'c' , 'd' , 'e' ] )
s2
c 4
d 5
e 6
dtype: int64
data2 = pd. concat( [ s1, s2] , keys= [ 'one' , 'two' ] )
data2
one a 0
b 1
c 2
d 3
two c 4
d 5
e 6
dtype: int64
data2. unstack( )
a
b
c
d
e
one
0.0
1.0
2.0
3.0
NaN
two
NaN
NaN
4.0
5.0
6.0
data2. unstack( ) . stack( )
one a 0.0
b 1.0
c 2.0
d 3.0
two c 4.0
d 5.0
e 6.0
dtype: float64
data2. unstack( ) . stack( dropna= False )
one a 0.0
b 1.0
c 2.0
d 3.0
e NaN
two a NaN
b NaN
c 4.0
d 5.0
e 6.0
dtype: float64
df = pd. DataFrame( {
'left' : result, 'right' : result+ 5 } , columns= pd. Index( [ 'left' , 'right' ] , name= 'side' ) )
df
side
left
right
state
number
Ohio
one
0
5
two
1
6
three
2
7
Colorado
one
3
8
two
4
9
three
5
10
df. unstack( 'state' )
side
left
right
state
Ohio
Colorado
Ohio
Colorado
number
one
0
3
5
8
two
1
4
6
9
three
2
5
7
10
df. unstack( 'state' ) . stack( 'side' )
state
Colorado
Ohio
number
side
one
left
3
0
right
8
5
two
left
4
1
right
9
6
three
left
5
2
right
10
7
3.2将“长”透视为“宽”
data = pd. read_csv( 'examples/macrodata.csv' )
data. head( )
year
quarter
realgdp
realcons
realinv
realgovt
realdpi
cpi
m1
tbilrate
unemp
pop
infl
realint
0
1959
1
2710.349
1707.4
286.898
470.045
1886.9
28.98
139.7
2.82
5.8
177.146
0.00
0.00
1
1959
2
2778.801
1733.7
310.859
481.301
1919.7
29.15
141.7
3.08
5.1
177.830
2.34
0.74
2
1959
3
2775.488
1751.8
289.226
491.260
1916.4
29.35
140.5
3.82
5.3
178.657
2.74
1.09
3
1959
4
2785.204
1753.7
299.356
484.052
1931.3
29.37
140.0
4.33
5.6
179.386
0.27
4.06
4
1960
1
2847.699
1770.5
331.722
462.199
1955.5
29.54
139.6
3.50
5.2
180.007
2.31
1.19
periods = pd. PeriodIndex( year= data. year, quarter= data. quarter, name= 'date' )
columns = pd. Index( [ 'realgdp' , 'infl' , 'unemp' ] , name= 'item' )
data = data. reindex( columns= columns)
data. index = periods. to_timestamp( 'D' , 'end' )
ldata = data. stack( ) . reset_index( ) . rename( columns= {
0 : 'value' } )
ldata[ : 10 ]
date
item
value
0
1959-03-31 23:59:59.999999999
realgdp
2710.349
1
1959-03-31 23:59:59.999999999
infl
0.000
2
1959-03-31 23:59:59.999999999
unemp
5.800
3
1959-06-30 23:59:59.999999999
realgdp
2778.801
4
1959-06-30 23:59:59.999999999
infl
2.340
5
1959-06-30 23:59:59.999999999
unemp
5.100
6
1959-09-30 23:59:59.999999999
realgdp
2775.488
7
1959-09-30 23:59:59.999999999
infl
2.740
8
1959-09-30 23:59:59.999999999
unemp
5.300
9
1959-12-31 23:59:59.999999999
realgdp
2785.204
pivoted = ldata. pivot( 'date' , 'item' , 'value' )
pivoted
item
infl
realgdp
unemp
date
1959-03-31 23:59:59.999999999
0.00
2710.349
5.8
1959-06-30 23:59:59.999999999
2.34
2778.801
5.1
1959-09-30 23:59:59.999999999
2.74
2775.488
5.3
1959-12-31 23:59:59.999999999
0.27
2785.204
5.6
1960-03-31 23:59:59.999999999
2.31
2847.699
5.2
...
...
...
...
2008-09-30 23:59:59.999999999
-3.16
13324.600
6.0
2008-12-31 23:59:59.999999999
-8.79
13141.920
6.9
2009-03-31 23:59:59.999999999
0.94
12925.410
8.1
2009-06-30 23:59:59.999999999
3.37
12901.504
9.2
2009-09-30 23:59:59.999999999
3.56
12990.341
9.6
203 rows × 3 columns
ldata[ 'value2' ] = np. random. randn( len ( ldata) )
ldata[ : 10 ]
date
item
value
value2
0
1959-03-31 23:59:59.999999999
realgdp
2710.349
1.417751
1
1959-03-31 23:59:59.999999999
infl
0.000
-0.843318
2
1959-03-31 23:59:59.999999999
unemp
5.800
0.752850
3
1959-06-30 23:59:59.999999999
realgdp
2778.801
1.481983
4
1959-06-30 23:59:59.999999999
infl
2.340
0.912075
5
1959-06-30 23:59:59.999999999
unemp
5.100
-0.530813
6
1959-09-30 23:59:59.999999999
realgdp
2775.488
1.443973
7
1959-09-30 23:59:59.999999999
infl
2.740
1.977522
8
1959-09-30 23:59:59.999999999
unemp
5.300
0.621740
9
1959-12-31 23:59:59.999999999
realgdp
2785.204
-0.379998
pivoted = ldata. pivot( 'date' , 'item' )
pivoted[ : 5 ]
value
value2
item
infl
realgdp
unemp
infl
realgdp
unemp
date
1959-03-31 23:59:59.999999999
0.00
2710.349
5.8
-0.843318
1.417751
0.752850
1959-06-30 23:59:59.999999999
2.34
2778.801
5.1
0.912075
1.481983
-0.530813
1959-09-30 23:59:59.999999999
2.74
2775.488
5.3
1.977522
1.443973
0.621740
1959-12-31 23:59:59.999999999
0.27
2785.204
5.6
1.356540
-0.379998
-0.738881
1960-03-31 23:59:59.999999999
2.31
2847.699
5.2
1.273626
-1.512028
-0.234333
pivoted[ 'value' ] [ : 5 ]
item
infl
realgdp
unemp
date
1959-03-31 23:59:59.999999999
0.00
2710.349
5.8
1959-06-30 23:59:59.999999999
2.34
2778.801
5.1
1959-09-30 23:59:59.999999999
2.74
2775.488
5.3
1959-12-31 23:59:59.999999999
0.27
2785.204
5.6
1960-03-31 23:59:59.999999999
2.31
2847.699
5.2
unstacked = ldata. set_index( [ 'date' , 'item' ] ) . unstack( 'item' )
unstacked[ : 7 ]
value
value2
item
infl
realgdp
unemp
infl
realgdp
unemp
date
1959-03-31 23:59:59.999999999
0.00
2710.349
5.8
-0.843318
1.417751
0.752850
1959-06-30 23:59:59.999999999
2.34
2778.801
5.1
0.912075
1.481983
-0.530813
1959-09-30 23:59:59.999999999
2.74
2775.488
5.3
1.977522
1.443973
0.621740
1959-12-31 23:59:59.999999999
0.27
2785.204
5.6
1.356540
-0.379998
-0.738881
1960-03-31 23:59:59.999999999
2.31
2847.699
5.2
1.273626
-1.512028
-0.234333
1960-06-30 23:59:59.999999999
0.14
2834.390
5.2
1.304159
-0.886630
-0.178589
1960-09-30 23:59:59.999999999
2.70
2839.022
5.6
0.120983
-0.193590
0.747989
3.3将“宽”透视为“长”
df = pd. DataFrame( {
'key' : [ 'foo' , 'bar' , 'baz' ] , 'A' : [ 1 , 2 , 3 ] , 'B' : [ 4 , 5 , 6 ] , 'C' : [ 7 , 8 , 9 ] } )
df
key
A
B
C
0
foo
1
4
7
1
bar
2
5
8
2
baz
3
6
9
melted = pd. melt( df, [ 'key' ] )
melted
key
variable
value
0
foo
A
1
1
bar
A
2
2
baz
A
3
3
foo
B
4
4
bar
B
5
5
baz
B
6
6
foo
C
7
7
bar
C
8
8
baz
C
9
reshaped = melted. pivot( 'key' , 'variable' , 'value' )
reshaped
variable
A
B
C
key
bar
2
5
8
baz
3
6
9
foo
1
4
7
reshaped. reset_index( )
variable
key
A
B
C
0
bar
2
5
8
1
baz
3
6
9
2
foo
1
4
7
pd. melt( df, value_vars= [ 'A' , 'B' , 'C' ] )
variable
value
0
A
1
1
A
2
2
A
3
3
B
4
4
B
5
5
B
6
6
C
7
7
C
8
8
C
9
pd. melt( df, value_vars= [ 'key' , 'A' , 'B' ] )
variable
value
0
key
foo
1
key
bar
2
key
baz
3
A
1
4
A
2
5
A
3
6
B
4
7
B
5
8
B
6