Basic Training, Chapter 2 (Python)

import pandas as pd
import numpy as np

df = pd.DataFrame({'a': [1, 2] * 3,
                   'b': [True, False] * 3,
                   'c': [1.0, 2.0] * 3,
                   'e': ['asian', 'white', 'black', 'white', 'asian', 'white'],
                   'd': ['low', 'low', 'low', 'median', 'high', 'high']})
df
   a      b    c      e       d
0  1   True  1.0  asian     low
1  2  False  2.0  white     low
2  1   True  1.0  black     low
3  2  False  2.0  white  median
4  1   True  1.0  asian    high
5  2  False  2.0  white    high
df.dtypes
a      int64
b       bool
c    float64
e     object
d     object
dtype: object
df['d'] = df['d'].astype('category')
df
   a      b    c      e       d
0  1   True  1.0  asian     low
1  2  False  2.0  white     low
2  1   True  1.0  black     low
3  2  False  2.0  white  median
4  1   True  1.0  asian    high
5  2  False  2.0  white    high
df.dtypes
a       int64
b        bool
c     float64
e      object
d    category
dtype: object
# Column 'd' now has the 'category' dtype
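Since the levels of 'd' have a natural order (low < median < high), the categorical can also be made ordered, so comparisons and sorting respect that order instead of alphabetical order. A minimal sketch (d_ordered is our own name, kept separate so df is unchanged):

from pandas.api.types import CategoricalDtype

# Ordered categorical: low < median < high
order = CategoricalDtype(categories=['low', 'median', 'high'], ordered=True)
d_ordered = df['d'].astype(order)
d_ordered.min()                          # 'low'
df.assign(d=d_ordered).sort_values('d')  # rows sorted by level order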
df.select_dtypes(include='bool')
       b
0   True
1  False
2   True
3  False
4   True
5  False
df.select_dtypes(include='float64')
     c
0  1.0
1  2.0
2  1.0
3  2.0
4  1.0
5  2.0
df.select_dtypes(include='number')
   a    c
0  1  1.0
1  2  2.0
2  1  1.0
3  2  2.0
4  1  1.0
5  2  2.0
df.select_dtypes(include='category')
        d
0     low
1     low
2     low
3  median
4    high
5    high
df.select_dtypes(include='object')
       e
0  asian
1  white
2  black
3  white
4  asian
5  white
df.select_dtypes(exclude=['float64'])

   a      b      e       d
0  1   True  asian     low
1  2  False  white     low
2  1   True  black     low
3  2  False  white  median
4  1   True  asian    high
5  2  False  white    high
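include and exclude also accept lists, so several dtypes can be picked out in one call; a quick sketch against the same df:

# Numeric and categorical columns in one call
df.select_dtypes(include=['number', 'category'])
# Everything except bool and object columns
df.select_dtypes(exclude=['bool', 'object'])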
df = pd.DataFrame(np.arange(12).reshape(3,4), columns=['A', 'B', 'C', 'D'])

df
   A  B   C   D
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
df.drop(['B', 'C'], axis=1)  # equivalent: df.drop(columns=['B', 'C'])
   A   D
0  0   3
1  4   7
2  8  11
# Drop rows by index label

df.drop([0, 1])

   A  B   C   D
2  8  9  10  11
s = pd.Series(["a","b",np.nan,"c",None])
print(s)
0       a
1       b
2     NaN
3       c
4    None
dtype: object
print(s.isnull())
0    False
1    False
2     True
3    False
4     True
dtype: bool
print(s[s.isnull()])
2     NaN
4    None
dtype: object
a  = pd.Series([1,2,np.nan,3,None])
a.sum()
6.0
# pandas also provides four functions for working with missing data: isnull(), notnull(), dropna(), and fillna().
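isnull(), dropna() and fillna() are demonstrated here; notnull() is simply the inverse of isnull(), handy for keeping the non-missing entries. A minimal sketch:

s = pd.Series(["a", "b", np.nan, "c", None])
print(s[s.notnull()])   # keeps positions 0, 1 and 3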
a = [[1, np.nan, 2],[9,None,np.nan],[3, 4, None],[5,6,7]]
data = pd.DataFrame(a)
data
   0    1    2
0  1  NaN  2.0
1  9  NaN  NaN
2  3  4.0  NaN
3  5  6.0  7.0
data.dropna()
   0    1    2
3  5  6.0  7.0
data.dropna(axis=1)
   0
0  1
1  9
2  3
3  5
a = [[1, np.nan, 2],[np.nan,None,np.nan],[3, None, None],[5,None,7]]
data = pd.DataFrame(a)
print(data)
print(data.dropna(how="all"))
print(data.dropna(how="all",axis=1))
     0   1    2
0  1.0 NaN  2.0
1  NaN NaN  NaN
2  3.0 NaN  NaN
3  5.0 NaN  7.0
     0   1    2
0  1.0 NaN  2.0
2  3.0 NaN  NaN
3  5.0 NaN  7.0
     0    2
0  1.0  2.0
1  NaN  NaN
2  3.0  NaN
3  5.0  7.0
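dropna() also takes a thresh argument: keep a row (or a column, with axis=1) only if it has at least that many non-missing values. A sketch on the same data:

# Keep rows with at least 2 non-NaN values:
# drops row 1 (all NaN) and row 2 (only one non-NaN value)
print(data.dropna(thresh=2))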
a = [[1, 2, 2],[3,None,6],[3, 7, None],[5,None,7]]
data = pd.DataFrame(a)
print(data)
# Fill every missing value with 0
print(data.fillna(0))
   0    1    2
0  1  2.0  2.0
1  3  NaN  6.0
2  3  7.0  NaN
3  5  NaN  7.0
   0    1    2
0  1  2.0  2.0
1  3  0.0  6.0
2  3  7.0  0.0
3  5  0.0  7.0
# Use a different fill value for each column
print(data.fillna({1: 1, 2: 2}))
# Fill each column with its own column mean
print(data.fillna(data.mean()))
   0    1    2
0  1  2.0  2.0
1  3  1.0  6.0
2  3  7.0  2.0
3  5  1.0  7.0
   0    1    2
0  1  2.0  2.0
1  3  4.5  6.0
2  3  7.0  5.0
3  5  4.5  7.0
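Missing entries can also be filled from neighboring rows: ffill() carries the last valid value forward and bfill() pulls the next valid value backward (older pandas spells these fillna(method='ffill') and fillna(method='bfill')). A sketch on the same data:

print(data.ffill())   # each NaN takes the value above it
print(data.bfill())   # each NaN takes the value below it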
from sklearn import preprocessing
import numpy as np
X_train = np.array([[ 1., -1.,  2.],
                     [ 2.,  0.,  0.],
                     [ 0.,  1., -1.]])
X_scaled = preprocessing.scale(X_train)

X_scaled
array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])
X_scaled.mean(axis=0)
array([0., 0., 0.])
X_scaled.std(axis=0)
array([1., 1., 1.])
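In a real pipeline the usual pattern is sklearn's StandardScaler, which remembers the training means and standard deviations so the identical transform can be applied to new data; a minimal sketch:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)    # learn column means and stds
X_scaled = scaler.transform(X_train)      # same result as preprocessing.scale
X_new = scaler.transform([[1., 0., 1.]])  # reuse the training statistics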
df = pd.DataFrame({'col_a': np.arange(10), 
                   'col_b': np.random.randn(10), 
                   'col_c': np.random.choice(['A', 'B', 'C'], 10), 
                   'col_d': np.random.choice([0, 1], 10)})

df
   col_a     col_b col_c  col_d
0      0  2.182928     B      1
1      1 -0.830507     B      0
2      2 -0.497002     B      0
3      3  1.485496     B      0
4      4  1.302028     C      1
5      5  0.480743     A      1
6      6 -0.828251     B      0
7      7 -1.771108     C      0
8      8 -0.607708     A      1
9      9  1.938848     C      1
# R code:
# df <- data.frame(col_a = 0:9,
#                  col_b = rnorm(10),
#                  col_c = sample(c('A', 'B', 'C'), size = 10, replace = TRUE),
#                  col_d = sample(c(0, 1), size = 10, replace = TRUE), 
#                  stringsAsFactors = FALSE)
# head(df, 5)
print(df.shape, df.shape[0], df.shape[1])

# R code:
# dim(df), nrow(df), ncol(df)


(10, 4) 10 4
df.columns
# R code:
# names(df)
Index(['col_a', 'col_b', 'col_c', 'col_d'], dtype='object')
# Select the first 5 rows
df.iloc[:5]

# R code:
# df[1:5, ]
   col_a     col_b col_c  col_d
0      0  2.182928     B      1
1      1 -0.830507     B      0
2      2 -0.497002     B      0
3      3  1.485496     B      0
4      4  1.302028     C      1
# Select the col_a and col_b columns
df[['col_a', 'col_b']]

# R code:
# df[, c('col_a', 'col_b')]
   col_a     col_b
0      0  2.182928
1      1 -0.830507
2      2 -0.497002
3      3  1.485496
4      4  1.302028
5      5  0.480743
6      6 -0.828251
7      7 -1.771108
8      8 -0.607708
9      9  1.938848
# Select the first 5 rows and the first 2 columns
df.iloc[:5, :2]

# R code:
# df[1:5, 1:2]
   col_a     col_b
0      0  2.182928
1      1 -0.830507
2      2 -0.497002
3      3  1.485496
4      4  1.302028
# Select a single value (scalar)
df.iat[0, 1]

# R code:
# df[1, 2]
2.182928374642522
df[(df['col_a'] > 3) & (df['col_b'] < 0)]
# or 
# df.query('col_a > 3 & col_b < 0')

# R code:
# df[df$col_a > 3 & df$col_b < 0, ]
   col_a     col_b col_c  col_d
6      6 -0.828251     B      0
7      7 -1.771108     C      0
8      8 -0.607708     A      1
df[df['col_c'].isin(['A', 'B'])]

# R code:
# df[df$col_c %in% c('A', 'B'), ]
   col_a     col_b col_c  col_d
0      0  2.182928     B      1
1      1 -0.830507     B      0
2      2 -0.497002     B      0
3      3  1.485496     B      0
5      5  0.480743     A      1
6      6 -0.828251     B      0
8      8 -0.607708     A      1
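.loc combines a row filter and a column selection in one step, the closest analogue of R's df[df$col_a > 3, c('col_b', 'col_c')]; a sketch:

# Rows where col_a > 3, restricted to col_b and col_c
df.loc[df['col_a'] > 3, ['col_b', 'col_c']]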
df['col_e'] = df['col_a'] + df['col_b']
df

# R code:
# df$col_e <- df$col_a + df$col_b
   col_a     col_b col_c  col_d      col_e
0      0  2.182928     B      1   2.182928
1      1 -0.830507     B      0   0.169493
2      2 -0.497002     B      0   1.502998
3      3  1.485496     B      0   4.485496
4      4  1.302028     C      1   5.302028
5      5  0.480743     A      1   5.480743
6      6 -0.828251     B      0   5.171749
7      7 -1.771108     C      0   5.228892
8      8 -0.607708     A      1   7.392292
9      9  1.938848     C      1  10.938848
# Drop the col_e column
df = df.drop(columns='col_e')
df

# R code:
# df <- df[, !names(df) == 'col_e']
   col_a     col_b col_c  col_d
0      0  2.182928     B      1
1      1 -0.830507     B      0
2      2 -0.497002     B      0
3      3  1.485496     B      0
4      4  1.302028     C      1
5      5  0.480743     A      1
6      6 -0.828251     B      0
7      7 -1.771108     C      0
8      8 -0.607708     A      1
9      9  1.938848     C      1
# Drop the first column
df.drop(columns=df.columns[0])

# R code:
# df[, -1]
      col_b col_c  col_d
0  2.182928     B      1
1 -0.830507     B      0
2 -0.497002     B      0
3  1.485496     B      0
4  1.302028     C      1
5  0.480743     A      1
6 -0.828251     B      0
7 -1.771108     C      0
8 -0.607708     A      1
9  1.938848     C      1
df.T

# R code:
# t(df)
             0          1          2        3        4         5          6         7          8        9
col_a        0          1          2        3        4         5          6         7          8        9
col_b  2.18293  -0.830507  -0.497002   1.4855  1.30203  0.480743  -0.828251  -1.77111  -0.607708  1.93885
col_c        B          B          B        B        C         A          B         C          A        C
col_d        1          0          0        0        1         1          0         0          1        1
df['col_a'].astype(str)

# R code:
# as.character(df$col_a)
0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
Name: col_a, dtype: object
pd.Categorical(df['col_c'])

# R code:
# factor(df$col_c)
[B, B, B, B, C, A, B, C, A, C]
Categories (3, object): [A, B, C]
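Like R factors, a pandas categorical stores integer codes under the hood; .cat.codes exposes them, roughly the analogue of as.integer(factor(x)) - 1. A sketch:

# Integer codes of the categories: 0 for A, 1 for B, 2 for C
df['col_c'].astype('category').cat.codes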
df[['col_a', 'col_b']].sum(axis=1)

# R code:
# apply(df[, c('col_a', 'col_b')], 1, sum)
0     2.182928
1     0.169493
2     1.502998
3     4.485496
4     5.302028
5     5.480743
6     5.171749
7     5.228892
8     7.392292
9    10.938848
dtype: float64
df[['col_a', 'col_b']].mean(axis=0)

# R code:
# apply(df[, c('col_a', 'col_b')], 2, mean)
col_a    4.500000
col_b    0.285547
dtype: float64
df[['col_a', 'col_b']].apply(lambda x: x.mean() + 10)

# R code:
# apply(df[, c('col_a', 'col_b')], 2, function(x) mean(x) + 10)
col_a    14.500000
col_b    10.285547
dtype: float64
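apply with axis=1 maps a function across rows instead of columns, matching R's apply(..., 1, f); a sketch:

# Row-wise: spread (max - min) of col_a and col_b in each row
df[['col_a', 'col_b']].apply(lambda row: row.max() - row.min(), axis=1)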
df2 = pd.DataFrame({'col_x': np.arange(10), 
                    'col_y': np.arange(10)[::-1]})
df2
   col_x  col_y
0      0      9
1      1      8
2      2      7
3      3      6
4      4      5
5      5      4
6      6      3
7      7      2
8      8      1
9      9      0
pd.concat([df, df2], axis=1)


# R code:
# cbind(df, df2)
   col_a     col_b col_c  col_d  col_x  col_y
0      0  2.182928     B      1      0      9
1      1 -0.830507     B      0      1      8
2      2 -0.497002     B      0      2      7
3      3  1.485496     B      0      3      6
4      4  1.302028     C      1      4      5
5      5  0.480743     A      1      5      4
6      6 -0.828251     B      0      6      3
7      7 -1.771108     C      0      7      2
8      8 -0.607708     A      1      8      1
9      9  1.938848     C      1      9      0
df3 = pd.DataFrame({'col_a': [-1, -2], 
                    'col_b' : [0, 1], 
                    'col_c': ['B', 'C'], 
                    'col_d': [1, 0]})
df3
   col_a  col_b col_c  col_d
0     -1      0     B      1
1     -2      1     C      0
pd.concat([df, df3], axis=0, ignore_index=True)

# R code:
# rbind(df, df3)
    col_a     col_b col_c  col_d
0       0  2.182928     B      1
1       1 -0.830507     B      0
2       2 -0.497002     B      0
3       3  1.485496     B      0
4       4  1.302028     C      1
5       5  0.480743     A      1
6       6 -0.828251     B      0
7       7 -1.771108     C      0
8       8 -0.607708     A      1
9       9  1.938848     C      1
10     -1  0.000000     B      1
11     -2  1.000000     C      0
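To join on key columns rather than stacking by position, pd.merge is the analogue of R's merge(); a minimal sketch with a hypothetical lookup table (lookup and score are made-up names for illustration):

# Hypothetical lookup table mapping col_c labels to a score
lookup = pd.DataFrame({'col_c': ['A', 'B', 'C'], 'score': [1, 2, 3]})
pd.merge(df, lookup, on='col_c', how='left')

# R code:
# merge(df, lookup, by = 'col_c', all.x = TRUE)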
data = pd.read_csv('https://vincentarelbundock.github.io/Rdatasets/csv/ggplot2/diamonds.csv', index_col=0)
data
       carat        cut color clarity  depth  table  price     x     y     z
1       0.23      Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43
2       0.21    Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31
3       0.23       Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31
4       0.29    Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63
5       0.31       Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75
...      ...        ...   ...     ...    ...    ...    ...   ...   ...   ...
53936   0.72      Ideal     D     SI1   60.8   57.0   2757  5.75  5.76  3.50
53937   0.72       Good     D     SI1   63.1   55.0   2757  5.69  5.75  3.61
53938   0.70  Very Good     D     SI1   62.8   60.0   2757  5.66  5.68  3.56
53939   0.86    Premium     H     SI2   61.0   58.0   2757  6.15  6.12  3.74
53940   0.75      Ideal     D     SI2   62.2   55.0   2757  5.83  5.87  3.64

53940 rows × 10 columns

cor_matrix = data.corr()
cor_matrix
# corr() directly returns the correlation matrix of the DataFrame's numeric columns
# (in pandas >= 2.0, pass numeric_only=True so the string columns are skipped)
          carat     depth     table     price         x         y         z
carat  1.000000  0.028224  0.181618  0.921591  0.975094  0.951722  0.953387
depth  0.028224  1.000000 -0.295779 -0.010647 -0.025289 -0.029341  0.094924
table  0.181618 -0.295779  1.000000  0.127134  0.195344  0.183760  0.150929
price  0.921591 -0.010647  0.127134  1.000000  0.884435  0.865421  0.861249
x      0.975094 -0.025289  0.195344  0.884435  1.000000  0.974701  0.970772
y      0.951722 -0.029341  0.183760  0.865421  0.974701  1.000000  0.952006
z      0.953387  0.094924  0.150929  0.861249  0.970772  0.952006  1.000000
data.corr()['price']
# correlations between 'price' and every other numeric variable
carat    0.921591
depth   -0.010647
table    0.127134
price    1.000000
x        0.884435
y        0.865421
z        0.861249
Name: price, dtype: float64
data['price'].corr(data["x"])
# correlation between 'price' and 'x'
0.8844351610161268
data.corr(method='spearman')   
          carat     depth     table     price         x         y         z
carat  1.000000  0.030104  0.194980  0.962883  0.996117  0.995572  0.993183
depth  0.030104  1.000000 -0.245061  0.010020 -0.023442 -0.025425  0.103498
table  0.194980 -0.245061  1.000000  0.171784  0.202231  0.195734  0.159878
price  0.962883  0.010020  0.171784  1.000000  0.963196  0.962719  0.957232
x      0.996117 -0.023442  0.202231  0.963196  1.000000  0.997895  0.987355
y      0.995572 -0.025425  0.195734  0.962719  0.997895  1.000000  0.987068
z      0.993183  0.103498  0.159878  0.957232  0.987355  0.987068  1.000000
data.corr(method='pearson')['price']  
carat    0.921591
depth   -0.010647
table    0.127134
price    1.000000
x        0.884435
y        0.865421
z        0.861249
Name: price, dtype: float64
data['price'].corr(data["x"], method='pearson')
# method can also be 'spearman' or 'kendall'
0.8844351610161268
from numpy.random import seed
from scipy.stats import spearmanr
# seed the random number generator (not strictly needed here: the data is fixed)
seed(1)
# prepare data
data1 = data['x']
data2 = data['price']
# calculate spearman's correlation
coef, p = spearmanr(data1, data2)
print("Spearman's correlation coefficient: %.3f" % coef)
Spearman's correlation coefficient: 0.963
# interpret the significance
alpha = 0.05
if p > alpha:
    print('Samples are uncorrelated (fail to reject H0) p=%.3f' % p)

else:
    print('Samples are correlated (reject H0) p=%.3f' % p)
Samples are correlated (reject H0) p=0.000
p
0.0
from scipy.stats import kendalltau
# seed random number generator
seed(1)

# calculate kendall's correlation
coef, p = kendalltau(data1, data2)
print('Kendall correlation coefficient: %.3f' % coef)
# interpret the significance
alpha = 0.05
if p > alpha:
    print('Samples are uncorrelated (fail to reject H0) p=%.3f' % p)
else:
    print('Samples are correlated (reject H0) p=%.3f' % p)

Kendall correlation coefficient: 0.831
Samples are correlated (reject H0) p=0.000
from scipy.stats import pearsonr
# seed random number generator
seed(1)

# calculate Pearson's correlation
coef, p = pearsonr(data1, data2)
print('Pearson correlation coefficient: %.3f' % coef)
# interpret the significance
alpha = 0.05
if p > alpha:
    print('Samples are uncorrelated (fail to reject H0) p=%.3f' % p)
else:
    print('Samples are correlated (reject H0) p=%.3f' % p)
Pearson correlation coefficient: 0.884
Samples are correlated (reject H0) p=0.000
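The three scipy tests follow the same pattern, so a small helper can report them side by side (report_correlations is our own name, not a scipy function). Note that the p-values print as 0.000 above because, with n = 53940, they underflow double precision:

from scipy.stats import pearsonr, spearmanr, kendalltau

def report_correlations(x, y, alpha=0.05):
    """Print the coefficient and H0 decision for all three methods."""
    tests = [('Pearson', pearsonr), ('Spearman', spearmanr), ('Kendall', kendalltau)]
    for name, test in tests:
        coef, p = test(x, y)
        verdict = 'reject H0' if p <= alpha else 'fail to reject H0'
        print('%s: coef=%.3f, p=%.3g (%s)' % (name, coef, p, verdict))

report_correlations(data['x'], data['price'])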
