import pandas as pd
import numpy as np
# Demo: dropping rows/columns and handling missing values in pandas.
df = pd.DataFrame(np.arange(12).reshape(3, 4), columns=['A', 'B', 'C', 'D'])
df
#    A  B   C   D
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

df.drop(columns=['B', 'C'])  # drop columns by name (returns a copy)
df.drop([0, 1])              # drop rows by index label (returns a copy)

# Build a frame with missing values (np.nan and None both become NaN).
a = [[1, np.nan, 2], [np.nan, None, np.nan], [3, None, None], [5, None, 7]]
data = pd.DataFrame(a)
print(data)
#      0   1    2
# 0  1.0 NaN  2.0
# 1  NaN NaN  NaN
# 2  3.0 NaN  NaN
# 3  5.0 NaN  7.0

# how="all" drops only the rows where EVERY value is NaN (row 1 here).
print(data.dropna(how="all"))
# axis=1 applies the same rule to columns (column 1 is all-NaN).
print(data.dropna(how="all", axis=1))
# Default dropna(): drop any row containing at least one NaN.
data.dropna()
# Fill NaNs with each column's mean; an all-NaN column has no mean and stays NaN.
print(data.fillna(data.mean()))
#      0   1    2
# 0  1.0 NaN  2.0
# 1  3.0 NaN  4.5
# 2  3.0 NaN  4.5
# 3  5.0 NaN  7.0
# Demo: feature standardization with scikit-learn (zero mean, unit variance
# per column).  NOTE: the original transcript wrote floats as "1 ." with a
# following comma, which is a Python syntax error; fixed below.
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import numpy as np

X_train = np.array([[1., -1., 2.],
                    [2., 0., 0.],
                    [0., 1., -1.]])

# scale() standardizes each column in a single call.
X_scaled = preprocessing.scale(X_train)
# X_scaled.mean(axis=0) -> array([0., 0., 0.])
# X_scaled.std(axis=0)  -> array([1., 1., 1.])

# StandardScaler learns the statistics from the training data so the same
# transformation can be reused on new data later.
scaler = StandardScaler().fit(X_train)
# scaler.mean_  -> array([1.        , 0.        , 0.33333333])
# scaler.scale_ -> array([0.81649658, 0.81649658, 1.24721913])
# Demo: concatenating Series along both axes.
s1 = pd.Series([1, 2, 3], index=list('abc'))
s2 = pd.Series([3, 4, 5], index=list('bde'))

# Stacking vertically keeps duplicate index labels ('b' appears twice).
pd.concat([s1, s2])
# a 1, b 2, c 3, b 3, d 4, e 5

# axis=1 aligns on the union of the two indexes; labels present in only
# one Series get NaN in the other column.
pd.concat([s1, s2], axis=1)

# Two equivalent ways to keep only labels present in BOTH Series ('b'):
pd.concat([s1, s2], axis=1).dropna()
pd.concat([s1, s2], axis=1, join='inner')
# Demo: merging two DataFrames on their shared 'key' column.
dict1 = {
    'key': ['a', 'b', 'c'],
    'col1': range(3),
}
df1 = pd.DataFrame(dict1)

dict2 = {
    'key': ['b', 'c', 'd'],
    'col2': range(1, 4),
}
df2 = pd.DataFrame(dict2)

# Default merge is an inner join on the common column name, so only the
# keys present in BOTH frames ('b' and 'c') survive.
dat = pd.merge(df1, df2)
dat
#   key  col1  col2
# 0   b     1     1
# 1   c     2     2
"""
Created on Thu Nov 29 01:33:46 2018
@author: czh
"""
% clear
% reset - f
import seaborn as sns
import matplotlib. pyplot as plt
% matplotlib inline
import os
import numpy as np
import pandas as pd
data = pd. DataFrame( { 'k1' : [ 'one' , 'two' ] * 3 + [ 'two' ] ,
'k2' : [ 1 , 1 , 2 , 3 , 3 , 4 , 4 ] } )
data
k1
k2
0
one
1
1
two
1
2
one
2
3
two
3
4
one
3
5
two
4
6
two
4
data. duplicated( )
0 False
1 False
2 False
3 False
4 False
5 False
6 True
dtype: bool
data. drop_duplicates( )
k1
k2
0
one
1
1
two
1
2
one
2
3
two
3
4
one
3
5
two
4
data[ 'v1' ] = range ( 7 )
data
k1
k2
v1
0
one
1
0
1
two
1
1
2
one
2
2
3
two
3
3
4
one
3
4
5
two
4
5
6
two
4
6
data. drop_duplicates( [ 'k1' ] )
k1
k2
v1
0
one
1
0
1
two
1
1
# Demo: reading CSV files straight from a URL, with different index choices.
url_csv = 'https://vincentarelbundock.github.io/Rdatasets/csv/boot/amis.csv'

# Default read: the file's unnamed first column appears as 'Unnamed: 0'.
df = pd.read_csv(url_csv)
df  # 8437 rows x 5 columns: Unnamed: 0, speed, period, warning, pair

# index_col=0 uses that first column as the row index instead.
df = pd.read_csv(url_csv, index_col=0)
df.head()

# Any column can serve as the index, referenced by name.
df = pd.read_csv(url_csv, index_col='speed')
df  # 8437 rows x 4 columns, indexed by speed

# Same idea on another dataset, indexing by the 'idNum' column.
csv_url = 'http://vincentarelbundock.github.io/Rdatasets/csv/carData/MplsStops.csv'
df = pd.read_csv(csv_url, index_col='idNum')
df.iloc[:, 0:6].head()  # first six columns: Unnamed: 0, date, problem, MDC, ...
# The original cell opened with six quotes in a row ('''''' parses as an
# EMPTY string followed by bare text) and died with the SyntaxError shown in
# the transcript.  The notes are preserved here as a valid raw triple-quoted
# string bound to a module-level constant.
CSV_READING_NOTES = r'''读取CSV和缺失值
如果我们的CSV文件中缺少数据存在缺失数据,我们可以使用参数na_values。 在下面的示例中有一些单元格的字符串为“Not Available”。
df = pd.read_csv('Simdata/MissingData.csv', index_col=0,
na_values="Not Available")
df.head()
跳过行读取CSV
例如,我们如何跳过文件中的前三行,如下所示:
Pandas read_csv跳过示例:
df = pd.read_csv('Simdata/skiprow.csv', index_col=0, skiprows=3)
df.head()
我们可以使用header参数获得与上面相同的结果(即data = pd.read_csv('Simdata / skiprow.csv',header = 3))。
如何使用Pandas读取某些行
如果我们不想读取CSV文件中的每一行,我们可以使用参数nrows。 在下面的下一个示例中,我们读取了CSV文件的前8行。
df = pd.read_csv(url_csv, nrows=8)
df
将多个文件加载到Dataframe
如果我们有来自许多来源的数据,如果要同时分析来自不同CSV文件的数据,我们可能希望将它们全部加载到一个数据帧中。在接下来的示例中,我们将使用Pandas read_csv来读取多个文件。
首先,我们将使用Python os和fnmatch在“SimData”目录中列出文件类型为CSV的“Day”字样的所有文件。接下来,我们使用Python列表理解将CSV文件加载到数据帧中(存储在列表中,请参阅类型(dfs)输出)。
import os, fnmatch
csv_files = fnmatch.filter(os.listdir('./SimData'), '*Day*.csv')
dfs = [pd.read_csv('SimData/' + os.sep + csv_file)
for csv_file in csv_files]
type(dfs)
# Output: list
最后,我们使用方法concat来连接列表中的数据帧。 在示例文件中有一个名为“Day”的列,因此每天(即CSV文件)都是唯一的。
df = pd.concat(dfs, sort=False)
df.Day.unique()
我们要使用的第二种方法有点简单. 如果我们比较两种方法(os + fnmatch与glob),我们可以看到在我们不必放置路径。 这是因为glob将拥有我们文件的完整路径。 便利!
import glob
csv_files = glob.glob('SimData/*Day*.csv')
dfs = [pd.read_csv(csv_file) for csv_file in csv_files]
df = pd.concat(dfs, sort=False)
如果我们在每个CSV文件中没有列,确定它是哪个数据集(例如,来自不同日期的数据),我们可以在每个数据框的新列中应用文件名:
import glob
csv_files = glob.glob('SimData/*Day*.csv')
dfs = []
for csv_file in csv_files:
temp_df = pd.read_csv(csv_file)
temp_df['DataF'] = csv_file.split('\\')[1]
dfs.append(temp_df)
'''
# Demo: writing DataFrames out to CSV.
df = pd.DataFrame({'Names': ['Andreas', 'George', 'Steve',
                             'Sarah', 'Joanna', 'Hanna'],
                   'Age': [21, 22, 20, 19, 18, 23]})
df.head()
df.to_csv('G:/ceshishuju/NamesAndAges.csv')

'''如何将多个数据帧读取到一个csv文件中
如果我们有许多数据帧,并且我们想将它们全部导出到同一个csv文件中。
这是为了创建两个新的列,命名为group和row num。重要的部分是group,它将标识不同的数据帧。在代码示例的最后一行中,我们使用pandas将数据帧写入csv。
'''
df1 = pd.DataFrame({'Names': ['Andreas', 'George', 'Steve',
                              'Sarah', 'Joanna', 'Hanna'],
                    'Age': [21, 22, 20, 19, 18, 23]})
df2 = pd.DataFrame({'Names': ['Pete', 'Jordan', 'Gustaf',
                              'Sophie', 'Sally', 'Simone'],
                    'Age': [22, 21, 19, 19, 29, 21]})
df3 = pd.DataFrame({'Names': ['Ulrich', 'Donald', 'Jon',
                              'Jessica', 'Elisabeth', 'Diana'],
                    'Age': [21, 21, 20, 19, 19, 22]})
# keys= tags each source frame; names= labels the two resulting MultiIndex
# levels, which reset_index() then turns into ordinary columns.
df = pd.concat([df1, df2, df3], keys=['Group1', 'Group2', 'Group3'],
               names=['Group', 'Row Num']).reset_index()
df.to_csv('G:/ceshishuju/MultipleDfs.csv', index=False)
# Demo: inner vs. left joins, and what duplicate keys do to row counts.
df1 = pd.DataFrame({'key': ['a', 'b', 'b'], 'data1': range(3)})
df2 = pd.DataFrame({'key': ['a', 'b', 'c'], 'data2': range(3)})

# Inner join (the default): 'b' appears twice in df1, so it matches twice;
# 'c' exists only in df2 and is dropped.
pd.merge(df1, df2)
# Swapping the arguments changes column order, not the matched rows.
pd.merge(df2, df1)

# Left join: every df2 key survives; 'c' has no df1 match, so its data1
# becomes NaN (which forces the whole data1 column to float).
pd.merge(df2, df1, how='left')
# Demo: merging on multiple keys, and on differently-named key columns.
left = pd.DataFrame({
    'key1': ['foo', 'foo', 'bar'],
    'key2': ['one', 'two', 'one'],
    'lval': [1, 2, 3],
})
right = pd.DataFrame({
    'key1': ['foo', 'foo', 'bar', 'bar'],
    'key2': ['one', 'one', 'one', 'two'],
    'lval': [4, 5, 6, 7],
})

# Outer join on BOTH key columns: unmatched combinations get NaN, and the
# overlapping non-key column 'lval' is disambiguated as lval_x / lval_y.
pd.merge(left, right, on=['key1', 'key2'], how='outer')

# When the key columns are named differently on each side, point at them
# explicitly with left_on / right_on.  Joining on key1 alone multiplies
# matches: each 'foo' row of `left` pairs with both 'foo' rows of df3.
df3 = pd.DataFrame({'key3': ['foo', 'foo', 'bar', 'bar'],
                    'key4': ['one', 'one', 'one', 'two'],
                    'lval': [4, 5, 6, 7]})
pd.merge(left, df3, left_on='key1', right_on='key3')
说明：如果需要合并的两个数据框，一个以其中一列作为键，另一个以其 index 作为键，则使用 left_index=True 或 right_index=True 来声明该数据框的索引应被当作键值。基本语句为：merge(D1, D2, left_on='id', right_index=True)。
（注：原稿将这句说明直接输入到了解释器中，因而报出 SyntaxError——它是文字说明，不是代码。）
# Demo: concatenating frames whose columns only partially overlap.
df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])

# Columns are aligned by NAME, not position — df2 has no 'c', so its rows
# get NaN there.  Row labels are kept as-is, so the index repeats: 0,1,2,0,1.
pd.concat([df1, df2])

# ignore_index=True discards the original labels and renumbers rows 0..4.
pd.concat([df1, df2], ignore_index=True)
'''笔记
要选取所有数字类的列,请使用np.number或'number'
要选取字符串的列,必须使用‘object’
要选择日期时间,请使用np.datetime64,'datetime'或'datetime64'
要选取所有属性为‘类’的列,请使用“category”
'''
# Demo: selecting columns by dtype with select_dtypes (see notes above:
# 'number' for numerics, 'object' for strings, 'category' for categoricals).
df = pd.DataFrame({'a': [1, 2] * 3,
                   'b': [True, False] * 3,
                   'c': [1.0, 2.0] * 3,
                   'e': ['asian', 'white', 'black', 'white', 'asian', 'white'],
                   'd': ['low', 'low', 'low', 'median', 'high', 'high']})
df['d'] = df['d'].astype('category')

df.select_dtypes(include='bool')        # -> column b
df.select_dtypes(include=['float64'])   # -> column c
df.select_dtypes(include=['number'])    # -> columns a, c
df.select_dtypes(include=['category'])  # -> column d
df.select_dtypes(include=['integer'])   # -> column a
df.select_dtypes(include=['object'])    # -> column e (plain strings)
df.select_dtypes(exclude=['float64'])   # -> everything except c