版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/fantacy10000/article/details/82056375
import pandas as pd
import numpy as np
# Two small frames that share a 'key' column; only 'a' and 'b' occur in both.
tr1 = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': range(7),
})
tr2 = pd.DataFrame({
    'key': ['a', 'b', 'd'],
    'data2': range(3),
})
print(tr1)
print(tr2)

# Outer join on 'key': keys found on only one side ('c', 'd') are kept and
# the columns coming from the other side are filled with NaN.
tr3 = tr1.merge(tr2, how='outer', on='key')
print(tr3)
   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   a
6      6   b
   data2 key
0      0   a
1      1   b
2      2   d
   data1 key  data2
0    0.0   b    1.0
1    1.0   b    1.0
2    6.0   b    1.0
3    2.0   a    0.0
4    4.0   a    0.0
5    5.0   a    0.0
6    3.0   c    NaN
7    NaN   d    2.0
# Join a column of the left frame against the *index* of the right frame.
left1 = pd.DataFrame({
    'key': ['a', 'b', 'c'],
    'value': range(3),
})
right1 = pd.DataFrame(
    {'group_val': [3, 7]},
    index=['a', 'b'],
)

# Default (inner) join: the 'c' row of left1 has no label in right1's index
# and is therefore absent from the result.
tr3 = left1.merge(right1, left_on='key', right_index=True)

print(left1)
print(right1)
print(tr3)
  key  value
0   a      0
1   b      1
2   c      2
   group_val
a          3
b          7
  key  value  group_val
0   a      0          3
1   b      1          7
# --- NumPy: glue a 3x4 array to itself along each axis ---
arr = np.arange(12).reshape(3, 4)
print(arr)

# axis=1 places the copies side by side (3x8).
arr1 = np.concatenate((arr, arr), axis=1)
print(arr1)

# axis=0 stacks the copies vertically (6x4).
arr2 = np.concatenate((arr, arr), axis=0)
print(arr2)

# --- pandas: concatenate Series with different indexes as columns ---
# Note: arr1 and arr2 are rebound here from ndarrays to Series.
arr1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
arr2 = pd.Series([1, 4], index=['b', 'c'])
arr3 = pd.Series([3, 5, 6], index=['d', 'e', 'f'])

# axis=1 makes each Series one column; labels missing from a Series are NaN.
arr4 = pd.concat([arr1, arr2, arr3], axis=1)
print(arr4)
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
[[ 0  1  2  3  0  1  2  3]
 [ 4  5  6  7  4  5  6  7]
 [ 8  9 10 11  8  9 10 11]]
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
     0    1    2
a  1.0  NaN  NaN
b  2.0  1.0  NaN
c  3.0  4.0  NaN
d  4.0  NaN  3.0
e  NaN  NaN  5.0
f  NaN  NaN  6.0
# Two Series over the same labels; `a` has gaps, `b` holds 0.0 .. 5.0.
a = pd.Series(
    [np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
    index=['f', 'e', 'd', 'c', 'b', 'a'],
)
b = pd.Series(
    np.arange(len(a), dtype=np.float64),
    index=['f', 'e', 'd', 'c', 'b', 'a'],
)

# Element-wise patch: take b's value wherever a is null, otherwise keep a's.
c = np.where(pd.isnull(a), b, a)

# Same idea, aligned on index labels: values of b[:-2] win, and holes (or
# labels it lacks) are filled from a[2:]. The result is not stored.
b[:-2].combine_first(a[2:])
a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64
# Frame containing repeated (k1, k2) rows.
data = pd.DataFrame({
    'k1': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})

# Boolean Series: True for every row that repeats an earlier row.
print(data.duplicated())

# The same frame with those repeated rows removed.
print(data.drop_duplicates())
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool
    k1  k2
0  one   1
2  one   2
3  two   3
5  two   4
# -999 and -1000 stand in for sentinel values in this Series.
data = pd.Series([1, 2, 3, 4, -999, -1000, -999, 1])

# Swap a single value for NaN.
print(data.replace(-999, np.nan))

# Swap several values at once, each with its own replacement
# (-999 -> NaN, -1000 -> 1), expressed as a mapping.
print(data.replace({-999: np.nan, -1000: 1}))
0       1.0
1       2.0
2       3.0
3       4.0
4       NaN
5   -1000.0
6       NaN
7       1.0
dtype: float64
0    1.0
1    2.0
2    3.0
3    4.0
4    NaN
5    1.0
6    NaN
7    1.0
dtype: float64
# 3x4 frame with lowercase row and column labels.
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['ohio', 'colorado', 'new york'],
    columns=['one', 'two', 'three', 'four'],
)
print(data)

# rename() with callables: title-case the row labels, upper-case the column
# labels. A new frame is returned; `data` itself is untouched.
print(data.rename(index=str.title, columns=str.upper))

# rename() with mappings. The index key 'Ohio' does not match the lowercase
# 'ohio' label in `data`, so here only the column 'four' -> 'hello' changes.
print(data.rename(index={'Ohio': 'invida'}, columns={'four': 'hello'}))

# Rebuild the frame, then rename in place; with inplace=True the call
# modifies `data` directly, so its return value is parked in `_`.
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['ohio', 'colorado', 'new york'],
    columns=['one', 'two', 'three', 'four'],
)
_ = data.rename(index={'ohio': 'invida'}, inplace=True)
print(data)
          one  two  three  four
ohio        0    1      2     3
colorado    4    5      6     7
new york    8    9     10    11
          ONE  TWO  THREE  FOUR
Ohio        0    1      2     3
Colorado    4    5      6     7
New York    8    9     10    11
          one  two  three  hello
ohio        0    1      2      3
colorado    4    5      6      7
new york    8    9     10     11
          one  two  three  four
invida      0    1      2     3
colorado    4    5      6     7
new york    8    9     10    11
# Locate outliers in a 1000x4 frame of random normal draws.
data = pd.DataFrame(np.random.randn(1000, 4))

# Summary statistics per column (displayed only in an interactive session).
data.describe()

# Entries of column 3 whose magnitude exceeds 3.
col = data[3]
col[np.abs(col) > 3]

# Rows containing at least one value with magnitude above 3.
# `axis` is passed by keyword: DataFrame.any() takes keyword-only arguments
# since pandas 2.0, so the positional form `.any(1)` raises a TypeError there.
data[(np.abs(data) > 3).any(axis=1)]
            0         1         2         3
18   0.609418  3.324428 -0.108789  0.171618
99  -1.066103 -0.353316 -0.365273  3.555021
329 -1.657900 -1.991298  0.901694  3.395590
438 -0.473641  1.164948  0.604448  3.638542
500  0.572468 -0.243033  0.293410 -3.010369
513  3.265122 -1.466344  0.467078 -0.578180
614  3.213056  0.014980  0.214329 -0.591175
764  1.928544 -0.904684  0.339387 -3.383201
858  3.284090  0.404298  0.687844  0.414698
# 5x4 frame holding the integers 0..19 in row-major order.
df = pd.DataFrame(np.arange(20).reshape(5, 4))

# A random ordering of the five row positions.
sampler = np.random.permutation(len(df))
print(sampler)

# Select the rows in that shuffled order (result shown only interactively).
df.take(sampler)
[2 3 0 4 1]
    0   1   2   3
2   8   9  10  11
3  12  13  14  15
0   0   1   2   3
4  16  17  18  19
1   4   5   6   7