import pandas as pd
import numpy as np
# Demo frame with one column per dtype family: int, bool, float and two
# string columns ('e' and 'd').
df = pd.DataFrame(dict(
    a=[1, 2] * 3,
    b=[True, False] * 3,
    c=[1.0, 2.0] * 3,
    e=['asian', 'white', 'black', 'white', 'asian', 'white'],
    d=['low', 'low', 'low', 'median', 'high', 'high'],
))
df
a
b
c
e
d
0
1
True
1.0
asian
low
1
2
False
2.0
white
low
2
1
True
1.0
black
low
3
2
False
2.0
white
median
4
1
True
1.0
asian
high
5
2
False
2.0
white
high
# Show the dtype pandas inferred for each column of the frame.
df. dtypes
a int64
b bool
c float64
e object
d object
dtype: object
# Re-encode the string column 'd' as a pandas categorical and store it back
# under the same label, then display the frame.
df['d'] = pd.Categorical(df['d'])
df
a
b
c
e
d
0
1
True
1.0
asian
low
1
2
False
2.0
white
low
2
1
True
1.0
black
low
3
2
False
2.0
white
median
4
1
True
1.0
asian
high
5
2
False
2.0
white
high
# Re-check the dtypes: column 'd' is now reported as `category`.
df. dtypes
a int64
b bool
c float64
e object
d category
dtype: object
# Keep only the boolean columns (here: 'b').
df. select_dtypes( include= 'bool' )
b
0
True
1
False
2
True
3
False
4
True
5
False
# Keep only the float64 columns (here: 'c').
df. select_dtypes( include= 'float64' )
c
0
1.0
1
2.0
2
1.0
3
2.0
4
1.0
5
2.0
# 'number' picks up both the int64 column 'a' and the float64 column 'c';
# the boolean column 'b' is not part of the result.
df. select_dtypes( include= 'number' )
a
c
0
1
1.0
1
2
2.0
2
1
1.0
3
2
2.0
4
1
1.0
5
2
2.0
# Keep only the categorical columns (here: 'd', converted earlier).
df. select_dtypes( include= 'category' )
d
0
low
1
low
2
low
3
median
4
high
5
high
# Keep only the object-dtype columns (here: the string column 'e').
df. select_dtypes( include= 'object' )
e
0
asian
1
white
2
black
3
white
4
asian
5
white
# Inverse selection: keep every column except the float64 ones, so 'c' is
# the only column missing from the result.
df. select_dtypes( exclude= [ 'float64' ] )
a
b
e
d
0
1
True
asian
low
1
2
False
white
low
2
1
True
black
low
3
2
False
white
median
4
1
True
asian
high
5
2
False
white
high
# 3x4 grid of the integers 0..11, filled row by row, with columns A-D.
df = pd.DataFrame(
    data=np.arange(12).reshape((3, 4)),
    columns=list('ABCD'),
)
df
A
B
C
D
0
0
1
2
3
1
4
5
6
7
2
8
9
10
11
# Drop columns B and C, then (separately) the rows labelled 0 and 1.
# Neither result is assigned, so each call only displays a reduced frame.
df.drop(columns=['B', 'C'])
df.drop(index=[0, 1])
# String series holding both kinds of missing marker: np.nan and None.
s = pd.Series(data=["a", "b", np.nan, "c", None])
print(s)
0 a
1 b
2 NaN
3 c
4 None
dtype: object
# Element-wise missing-value mask: both the NaN and the None entry are
# reported as True.
print ( s. isnull( ) )
0 False
1 False
2 True
3 False
4 True
dtype: bool
# Boolean-mask `s` with its own null flags so that only the missing entries
# (the NaN and the None) remain. No other series is needed for this step;
# the mask comes from `s` itself.
print(s[s.isnull()])
2 NaN
4 None
dtype: object
# Numeric series with two missing entries (np.nan and None).
a = pd.Series(data=[1, 2, np.nan, 3, None])
# Missing values are skipped by sum(), so only 1 + 2 + 3 is added up.
a.sum(skipna=True)
6.0
# 4x3 table with missing cells scattered over the last two columns; only
# the final row is complete.
a = [
    [1, np.nan, 2],
    [9, None, np.nan],
    [3, 4, None],
    [5, 6, 7],
]
data = pd.DataFrame(data=a)
data
0
1
2
0
1
NaN
2.0
1
9
NaN
NaN
2
3
4.0
NaN
3
5
6.0
7.0
# First discard every row, then (separately) every column, that contains at
# least one missing value. Neither result is assigned back to `data`.
data.dropna(axis=0, how='any')
data.dropna(axis='columns', how='any')
# Table whose second row and second column consist solely of missing values.
a = [
    [1, np.nan, 2],
    [np.nan, None, np.nan],
    [3, None, None],
    [5, None, 7],
]
data = pd.DataFrame(data=a)
print(data)
# how="all" removes a row only when every cell in it is missing ...
print(data.dropna(axis=0, how="all"))
# ... and, with axis=1, a column only when every cell in it is missing.
print(data.dropna(axis=1, how="all"))
0 1 2
0 1.0 NaN 2.0
1 NaN NaN NaN
2 3.0 NaN NaN
3 5.0 NaN 7.0
0 1 2
0 1.0 NaN 2.0
2 3.0 NaN NaN
3 5.0 NaN 7.0
0 2
0 1.0 2.0
1 NaN NaN
2 3.0 NaN
3 5.0 7.0
# Table with gaps in the second and third columns only.
a = [
    [1, 2, 2],
    [3, None, 6],
    [3, 7, None],
    [5, None, 7],
]
data = pd.DataFrame(data=a)
print(data)
# Replace every missing cell with the constant 0.
print(data.fillna(value=0))
0 1 2
0 1 2.0 2.0
1 3 NaN 6.0
2 3 7.0 NaN
3 5 NaN 7.0
0 1 2
0 1 2.0 2.0
1 3 0.0 6.0
2 3 7.0 0.0
3 5 0.0 7.0
# Per-column fill values: gaps in column 1 become 1, gaps in column 2 become 2.
print(data.fillna(value={1: 1, 2: 2}))
# Fill each column's gaps with that column's own mean.
print(data.fillna(value=data.mean()))
0 1 2
0 1 2.0 2.0
1 3 1.0 6.0
2 3 7.0 2.0
3 5 1.0 7.0
0 1 2
0 1 2.0 2.0
1 3 4.5 6.0
2 3 7.0 5.0
3 5 4.5 7.0
from sklearn import preprocessing
import numpy as np
# 3x3 float matrix used as input for the scaling example that follows.
# The float literals are written as `1.` (no space before the dot): with a
# space, `1 .` is parsed as attribute access on an int and fails to compile.
X_train = np.array([[1., -1., 2.],
                    [2., 0., 0.],
                    [0., 1., -1.]])
# Rescale X_train with scikit-learn's `preprocessing.scale`; the checks that
# follow show each column of the result has mean 0 and standard deviation 1.
X_scaled = preprocessing. scale( X_train)
X_scaled
array([[ 0. , -1.22474487, 1.33630621],
[ 1.22474487, 0. , -0.26726124],
[-1.22474487, 1.22474487, -1.06904497]])
# Column-wise mean of the scaled data (axis=0 reduces over rows).
X_scaled. mean( axis= 0 )
array([0., 0., 0.])
# Column-wise standard deviation of the scaled data.
X_scaled. std( axis= 0 )
array([1., 1., 1.])
# Ten-row demo frame: a running integer, Gaussian noise, a random letter
# from A/B/C and a random 0/1 flag. The random draws are not seeded, so the
# concrete values differ from run to run.
df = pd.DataFrame({
    'col_a': np.arange(10),
    'col_b': np.random.randn(10),
    'col_c': np.random.choice(['A', 'B', 'C'], size=10),
    'col_d': np.random.choice([0, 1], size=10),
})
df
col_a
col_b
col_c
col_d
0
0
2.182928
B
1
1
1
-0.830507
B
0
2
2
-0.497002
B
0
3
3
1.485496
B
0
4
4
1.302028
C
1
5
5
0.480743
A
1
6
6
-0.828251
B
0
7
7
-1.771108
C
0
8
8
-0.607708
A
1
9
9
1.938848
C
1
# Print the (rows, columns) tuple followed by its two components.
print(df.shape, *df.shape)
(10, 4) 10 4
# Column labels of the frame, returned as a pandas Index.
df. columns
Index(['col_a', 'col_b', 'col_c', 'col_d'], dtype='object')
# First five rows of the frame, by position.
df.head(5)
col_a
col_b
col_c
col_d
0
0
2.182928
B
1
1
1
-0.830507
B
0
2
2
-0.497002
B
0
3
3
1.485496
B
0
4
4
1.302028
C
1
# Passing a list of labels selects several columns at once and yields a
# DataFrame containing just col_a and col_b.
df[ [ 'col_a' , 'col_b' ] ]
col_a
col_b
0
0
2.182928
1
1
-0.830507
2
2
-0.497002
3
3
1.485496
4
4
1.302028
5
5
0.480743
6
6
-0.828251
7
7
-1.771108
8
8
-0.607708
9
9
1.938848
# Purely positional slice: the first five rows and the first two columns.
df. iloc[ : 5 , : 2 ]
col_a
col_b
0
0
2.182928
1
1
-0.830507
2
2
-0.497002
3
3
1.485496
4
4
1.302028
# Single scalar by integer position: row 0, column 1 (i.e. col_b).
df. iat[ 0 , 1 ]
2.182928374642522
# Rows where col_a is greater than 3 and, at the same time, col_b is negative.
df.loc[df['col_a'].gt(3) & df['col_b'].lt(0)]
col_a
col_b
col_c
col_d
6
6
-0.828251
B
0
7
7
-1.771108
C
0
8
8
-0.607708
A
1
# Membership filter: keep the rows whose col_c value is either 'A' or 'B'.
df[ df[ 'col_c' ] . isin( [ 'A' , 'B' ] ) ]
col_a
col_b
col_c
col_d
0
0
2.182928
B
1
1
1
-0.830507
B
0
2
2
-0.497002
B
0
3
3
1.485496
B
0
5
5
0.480743
A
1
6
6
-0.828251
B
0
8
8
-0.607708
A
1
# Derive a new column as the element-wise sum of col_a and col_b.
df['col_e'] = df['col_a'].add(df['col_b'])
df
col_a
col_b
col_c
col_d
col_e
0
0
2.182928
B
1
2.182928
1
1
-0.830507
B
0
0.169493
2
2
-0.497002
B
0
1.502998
3
3
1.485496
B
0
4.485496
4
4
1.302028
C
1
5.302028
5
5
0.480743
A
1
5.480743
6
6
-0.828251
B
0
5.171749
7
7
-1.771108
C
0
5.228892
8
8
-0.607708
A
1
7.392292
9
9
1.938848
C
1
10.938848
# Remove the derived column again; drop() returns a new frame, so rebind df.
df = df.drop('col_e', axis=1)
df
col_a
col_b
col_c
col_d
0
0
2.182928
B
1
1
1
-0.830507
B
0
2
2
-0.497002
B
0
3
3
1.485496
B
0
4
4
1.302028
C
1
5
5
0.480743
A
1
6
6
-0.828251
B
0
7
7
-1.771108
C
0
8
8
-0.607708
A
1
9
9
1.938848
C
1
# Drop the first column by looking its label up positionally. The result is
# only displayed, not assigned, so `df` keeps all of its columns.
df. drop( columns= df. columns[ 0 ] )
col_b
col_c
col_d
0
2.182928
B
1
1
-0.830507
B
0
2
-0.497002
B
0
3
1.485496
B
0
4
1.302028
C
1
5
0.480743
A
1
6
-0.828251
B
0
7
-1.771108
C
0
8
-0.607708
A
1
9
1.938848
C
1
# Transpose: the original columns become rows and the row labels 0..9
# become the column headers.
df. T
0
1
2
3
4
5
6
7
8
9
col_a
0
1
2
3
4
5
6
7
8
9
col_b
2.18293
-0.830507
-0.497002
1.4855
1.30203
0.480743
-0.828251
-1.77111
-0.607708
1.93885
col_c
B
B
B
B
C
A
B
C
A
C
col_d
1
0
0
0
1
1
0
0
1
1
# Cast the integer column to strings; the result is displayed, not stored.
df[ 'col_a' ] . astype( str )
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
Name: col_a, dtype: object
# Build a Categorical from col_c; its categories are the distinct letters
# found in the column.
pd. Categorical( df[ 'col_c' ] )
[B, B, B, B, C, A, B, C, A, C]
Categories (3, object): [A, B, C]
# axis=1 sums across the columns, giving one value per row.
df[ [ 'col_a' , 'col_b' ] ] . sum ( axis= 1 )
0 2.182928
1 0.169493
2 1.502998
3 4.485496
4 5.302028
5 5.480743
6 5.171749
7 5.228892
8 7.392292
9 10.938848
dtype: float64
# axis=0 averages down the rows, giving one value per column.
df[ [ 'col_a' , 'col_b' ] ] . mean( axis= 0 )
col_a 4.500000
col_b 0.285547
dtype: float64
# apply() hands each column to the lambda in turn; the result is every
# column's mean shifted up by 10.
df[ [ 'col_a' , 'col_b' ] ] . apply ( lambda x: x. mean( ) + 10 )
col_a 14.500000
col_b 10.285547
dtype: float64
# Second frame with the same ten row labels: col_x counts 0..9 upwards and
# col_y counts 9..0 downwards.
df2 = pd.DataFrame({
    'col_x': np.arange(10),
    'col_y': np.arange(9, -1, -1),
})
df2
col_x
col_y
0
0
9
1
1
8
2
2
7
3
3
6
4
4
5
5
5
4
6
6
3
7
7
2
8
8
1
9
9
0
# Glue df2's columns onto the right-hand side of df, matching rows by label.
pd.concat([df, df2], axis='columns')
col_a
col_b
col_c
col_d
col_x
col_y
0
0
2.182928
B
1
0
9
1
1
-0.830507
B
0
1
8
2
2
-0.497002
B
0
2
7
3
3
1.485496
B
0
3
6
4
4
1.302028
C
1
4
5
5
5
0.480743
A
1
5
4
6
6
-0.828251
B
0
6
3
7
7
-1.771108
C
0
7
2
8
8
-0.607708
A
1
8
1
9
9
1.938848
C
1
9
0
# Two extra rows that use the same four column labels as `df`.
df3 = pd.DataFrame(dict(
    col_a=[-1, -2],
    col_b=[0, 1],
    col_c=['B', 'C'],
    col_d=[1, 0],
))
df3
col_a
col_b
col_c
col_d
0
-1
0
B
1
1
-2
1
C
0
# Stack df3's rows underneath df. ignore_index=True discards both original
# indexes and renumbers the combined rows from 0.
pd. concat( [ df, df3] , axis= 0 , ignore_index= True )
col_a
col_b
col_c
col_d
0
0
2.182928
B
1
1
1
-0.830507
B
0
2
2
-0.497002
B
0
3
3
1.485496
B
0
4
4
1.302028
C
1
5
5
0.480743
A
1
6
6
-0.828251
B
0
7
7
-1.771108
C
0
8
8
-0.607708
A
1
9
9
1.938848
C
1
10
-1
0.000000
B
1
11
-2
1.000000
C
0
# Download the diamonds CSV over HTTP (network access required) and use the
# file's first column as the row index instead of keeping it as data.
data = pd. read_csv( 'https://vincentarelbundock.github.io/Rdatasets/csv/ggplot2/diamonds.csv' , index_col= 0 )
data
carat
cut
color
clarity
depth
table
price
x
y
z
1
0.23
Ideal
E
SI2
61.5
55.0
326
3.95
3.98
2.43
2
0.21
Premium
E
SI1
59.8
61.0
326
3.89
3.84
2.31
3
0.23
Good
E
VS1
56.9
65.0
327
4.05
4.07
2.31
4
0.29
Premium
I
VS2
62.4
58.0
334
4.20
4.23
2.63
5
0.31
Good
J
SI2
63.3
58.0
335
4.34
4.35
2.75
...
...
...
...
...
...
...
...
...
...
...
53936
0.72
Ideal
D
SI1
60.8
57.0
2757
5.75
5.76
3.50
53937
0.72
Good
D
SI1
63.1
55.0
2757
5.69
5.75
3.61
53938
0.70
Very Good
D
SI1
62.8
60.0
2757
5.66
5.68
3.56
53939
0.86
Premium
H
SI2
61.0
58.0
2757
6.15
6.12
3.74
53940
0.75
Ideal
D
SI2
62.2
55.0
2757
5.83
5.87
3.64
53940 rows × 10 columns
# Pairwise correlation matrix (default method) over the numeric columns.
# The frame also carries text columns (cut, color, clarity); current pandas
# raises on them in corr() instead of silently skipping them, so the numeric
# columns are selected explicitly. This yields the same 7x7 matrix on old
# and new pandas versions.
cor_matrix = data.select_dtypes(include='number').corr()
# Display the matrix that was just computed rather than computing it twice.
cor_matrix
carat
depth
table
price
x
y
z
carat
1.000000
0.028224
0.181618
0.921591
0.975094
0.951722
0.953387
depth
0.028224
1.000000
-0.295779
-0.010647
-0.025289
-0.029341
0.094924
table
0.181618
-0.295779
1.000000
0.127134
0.195344
0.183760
0.150929
price
0.921591
-0.010647
0.127134
1.000000
0.884435
0.865421
0.861249
x
0.975094
-0.025289
0.195344
0.884435
1.000000
0.974701
0.970772
y
0.951722
-0.029341
0.183760
0.865421
0.974701
1.000000
0.952006
z
0.953387
0.094924
0.150929
0.861249
0.970772
0.952006
1.000000
# Correlation of every numeric column with `price`. Text columns are
# filtered out first because corr() on current pandas raises on them.
data.select_dtypes(include='number').corr()['price']
carat 0.921591
depth -0.010647
table 0.127134
price 1.000000
x 0.884435
y 0.865421
z 0.861249
Name: price, dtype: float64
# Correlation between two individual columns. No method is given; the value
# matches the explicit method='pearson' call made later on.
data[ 'price' ] . corr( data[ "x" ] )
0.8844351610161268
# Spearman (rank-based) correlation matrix over the numeric columns. Text
# columns are filtered out first because corr() on current pandas raises on
# them.
data.select_dtypes(include='number').corr(method='spearman')
carat
depth
table
price
x
y
z
carat
1.000000
0.030104
0.194980
0.962883
0.996117
0.995572
0.993183
depth
0.030104
1.000000
-0.245061
0.010020
-0.023442
-0.025425
0.103498
table
0.194980
-0.245061
1.000000
0.171784
0.202231
0.195734
0.159878
price
0.962883
0.010020
0.171784
1.000000
0.963196
0.962719
0.957232
x
0.996117
-0.023442
0.202231
0.963196
1.000000
0.997895
0.987355
y
0.995572
-0.025425
0.195734
0.962719
0.997895
1.000000
0.987068
z
0.993183
0.103498
0.159878
0.957232
0.987355
0.987068
1.000000
# Pearson correlation of every numeric column with `price`, with the method
# spelled out. Text columns are filtered out first because corr() on current
# pandas raises on them.
data.select_dtypes(include='number').corr(method='pearson')['price']
carat 0.921591
depth -0.010647
table 0.127134
price 1.000000
x 0.884435
y 0.865421
z 0.861249
Name: price, dtype: float64
# Same pairwise correlation as before, with the method named explicitly.
data[ 'price' ] . corr( data[ "x" ] , method= 'pearson' )
0.8844351610161268
from numpy. random import rand
from numpy. random import seed
from scipy. stats import spearmanr
# NOTE(review): nothing in this cell visibly draws random numbers, so the
# seed call looks like boilerplate - confirm before relying on it.
seed(1)
# The two columns under comparison; later cells reuse these names.
data1, data2 = data['x'], data['price']
# spearmanr returns the rank-correlation coefficient and its p-value.
coef, p = spearmanr(data1, data2)
print('Spearmans correlation coefficient: %.3f' % coef)
Spearmans correlation coefficient: 0.963
# Compare the p-value obtained above against a 0.05 threshold and report
# whether H0 is rejected. The branch bodies must be indented under if/else;
# without the indentation the cell does not compile.
alpha = 0.05
if p > alpha:
    print('Samples are uncorrelated (fail to reject H0) p=%.3f' % p)
else:
    print('Samples are correlated (reject H0) p=%.3f' % p)
Samples are correlated (reject H0) p=0.000
# Show the raw p-value; the formatted message above rounds it to 3 decimals.
p
0.0
from scipy. stats import kendalltau
# NOTE(review): nothing in this cell visibly draws random numbers, so the
# seed call looks like boilerplate - confirm before relying on it.
seed(1)
# Kendall rank correlation between the two columns selected earlier;
# kendalltau returns the coefficient and its p-value.
coef, p = kendalltau(data1, data2)
print('Kendall correlation coefficient: %.3f' % coef)
# Compare the p-value against a 0.05 threshold and report whether H0 is
# rejected. The branch bodies must be indented under if/else; without the
# indentation the cell does not compile.
alpha = 0.05
if p > alpha:
    print('Samples are uncorrelated (fail to reject H0) p=%.3f' % p)
else:
    print('Samples are correlated (reject H0) p=%.3f' % p)
Kendall correlation coefficient: 0.831
Samples are correlated (reject H0) p=0.000
from scipy import stats
from scipy. stats import pearsonr
# NOTE(review): nothing in this cell visibly draws random numbers, so the
# seed call looks like boilerplate - confirm before relying on it.
seed(1)
# Pearson correlation between the two columns selected earlier; pearsonr
# returns the coefficient and its p-value.
coef, p = pearsonr(data1, data2)
print('pearsonr correlation coefficient: %.3f' % coef)
# Compare the p-value against a 0.05 threshold and report whether H0 is
# rejected. The branch bodies must be indented under if/else; without the
# indentation the cell does not compile.
alpha = 0.05
if p > alpha:
    print('Samples are uncorrelated (fail to reject H0) p=%.3f' % p)
else:
    print('Samples are correlated (reject H0) p=%.3f' % p)
pearsonr correlation coefficient: 0.884
Samples are correlated (reject H0) p=0.000