本篇介绍以下几个知识点:
(1)利用pandas读取文件
(2)利用concat合并DataFrame
(3)利用append合并DataFrame
(4)利用append合并DataFrame与Series
(5)利用merge合并DataFrame
(6)merge处理重叠区域,handle overlapping
(1)利用pandas读取文件
1.1 没有指定文件的路径
import pandas as pd
import numpy as np
# read_csv parses delimited text, so it can load a .txt file as well as a
# CSV file exported from Excel. No directory is given here, only a file name.
csv_name = 'zuobiao.csv'
data = pd.read_csv(csv_name)
print(data)
1.2 指定文件的具体路径
import pandas as pd
import numpy as np
# The full location of the file is spelled out; note that the path is
# written with forward slashes.
poem_path = 'F:/Python/poem.txt'
data = pd.read_csv(poem_path)
print(data)
(2)利用pandas的concat合并DataFrame
2.1 concat的ignore_index参数
import numpy as np
import pandas as pd
# Three 3x4 frames with identical column labels, filled with 0.0, 1.0 and 2.0.
cols = ['a', 'b', 'c', 'd']
df1, df2, df3 = (
    pd.DataFrame(np.ones((3, 4)) * fill, columns=cols) for fill in (0, 1, 2)
)
print(df1, "\n", df2, '\n', df3)

# Concatenate.
frames = [df1, df2, df3]

# axis=0 stacks the frames on top of each other (axis=1 would place them side
# by side). Each frame keeps its own 0..2 row labels, so labels repeat.
res1 = pd.concat(frames, axis=0)
print(res1)

# ignore_index=True discards the original row labels and renumbers the
# result consecutively.
res2 = pd.concat(frames, axis=0, ignore_index=True)
print(res2)
2.2 concat的join参数
import numpy as np
import pandas as pd
# Two frames that only partly overlap: they share columns b, c, d and row
# labels 2, 3.
df1 = pd.DataFrame(
    np.zeros((3, 4)),
    index=[1, 2, 3],
    columns=['a', 'b', 'c', 'd'],
)
df2 = pd.DataFrame(
    np.ones((3, 4)),
    index=[2, 3, 4],
    columns=['b', 'c', 'd', 'e'],
)
print(df1)
print("\n"*2)
print(df2)

# join='inner' keeps only the columns present in both frames; the
# alternative is join='outer'. Rows are renumbered by ignore_index=True.
res = pd.concat(objs=[df1, df2], join='inner', ignore_index=True)
print(res)
2.3 concat的join_axes参数(注意:join_axes自pandas 1.0起已被移除,可改用reindex实现同样的效果)
import numpy as np
import pandas as pd
# Two frames that only partly overlap: they share columns b, c, d and row
# labels 2, 3.
df1 = pd.DataFrame(np.ones((3,4))*0,columns = ['a','b','c','d'],index = [1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1,columns = ['b','c','d','e'],index = [2,3,4])

# Side-by-side concatenation restricted to df1's row labels.
# The `join_axes` keyword was removed from pd.concat in pandas 1.0, so
# `join_axes=[df1.index]` now raises TypeError. Reindexing df2 to df1's index
# first gives the same result: rows 1, 2, 3 only, with NaN wherever df2 has
# no row for that label. Without the reindex, the default outer join would
# also keep row 4.
res = pd.concat([df1, df2.reindex(df1.index)], axis = 1)
print(df1)
print(df2)
print("\n"*2)
print(res)
(3)利用pandas的append合并DataFrame(注意:DataFrame.append在pandas 2.0中已被移除,可改用pd.concat)
import numpy as np
import pandas as pd
# Two frames that only partly overlap: they share columns b, c, d and row
# labels 2, 3.
df1 = pd.DataFrame(np.ones((3,4))*0,columns = ['a','b','c','d'],index = [1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1,columns = ['b','c','d','e'],index = [2,3,4])

# Attach df2's rows after df1's rows.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True is the documented replacement and yields
# the same frame: the union of the columns, NaN where a frame lacks a column,
# and rows renumbered from 0.
res = pd.concat([df1, df2], ignore_index = True)
print(df1)
print(df2)
print("\n"*2)
print(res)
append直接在第一个DataFrame的末尾追加第二个DataFrame的行
(4)利用append合并DataFrame与Series
import numpy as np
import pandas as pd
# A 3x4 frame of zeros and a Series whose labels match the frame's columns.
df1 = pd.DataFrame(np.ones((3,4))*0,columns = ['a','b','c','d'])
s1 = pd.Series([1,2,3,4],index = ['a','b','c','d'])

# Add the Series as one extra row.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. The
# replacement turns the Series into a one-row frame (to_frame() makes a single
# column, .T transposes it so the Series labels become column labels) and
# concatenates it below df1, renumbering the rows.
res = pd.concat([df1, s1.to_frame().T], ignore_index = True)
print(df1)
print(s1)
print("\n"*2)
print(res)
(5)利用merge合并DataFrame
# Both tables carry the same 'key' column; the left one adds A/B, the right
# one adds C/D.
left_data = {
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
}
right_data = {
    'key': ['K0', 'K1', 'K2', 'K3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
}
left = pd.DataFrame(left_data)
right = pd.DataFrame(right_data)

# Join the two tables row by row wherever their 'key' values match.
res = left.merge(right, on='key')

print(left)
print(right)
print("\n"*2)
print(res)
import numpy as np
import pandas as pd
# Same two tables as in the single-key example: a shared 'key' column plus
# A/B on the left and C/D on the right.
left = pd.DataFrame(
    data={
        'key': ['K0', 'K1', 'K2', 'K3'],
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
    }
)
right = pd.DataFrame(
    data={
        'key': ['K0', 'K1', 'K2', 'K3'],
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    }
)

# Merge on the shared 'key' column; how='inner' is pandas' default join type.
res = pd.merge(left=left, right=right, on='key', how='inner')

# Two-key variants, kept for reference. They need frames that have 'key1' and
# 'key2' columns, which the frames above do not have.
## res = pd.merge(left, right, on=['key1', 'key2'])  # intersection: only key pairs present in both
## print(res)
## res = pd.merge(left, right, on=['key1', 'key2'], how='outer')  # union of key pairs; gaps become NaN
## print(res)
## res = pd.merge(left, right, on=['key1', 'key2'], how='right')  # keep every key pair of `right`
## print(res)

print(left)
print(right)
print("\n"*2)
print(res)
merge的indicator参数
import numpy as np
import pandas as pd
# col1 holds 0, 1 on the left and 1, 2, 2 on the right, so only the value 1
# appears in both frames.
df1 = pd.DataFrame(data={'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame(data={'col1': [1, 2, 2], 'col_right': [2, 2, 2]})

# Outer merge keeps every col1 value from either side. A string passed as
# `indicator` names the extra column that records where each row came from;
# indicator=True is the other accepted form.
res = df1.merge(df2, how='outer', on='col1', indicator='df')

print(df1)
print(df2)
print("\n"*2)
print(res)
merged by index
import numpy as np
import pandas as pd
# Neither frame has a key column; the row labels K0..K2 play that role.
row_labels = ['K0', 'K1', 'K2']
left = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=row_labels,
)
right = pd.DataFrame(
    {'C': ['C0', 'C1', 'C2'], 'D': ['D0', 'D1', 'D2']},
    index=row_labels,
)

# left_index/right_index=True tell merge to match on the row labels of both
# frames. how='outer' keeps labels found in either frame; how='inner' is the
# alternative.
res = left.merge(right, how='outer', left_index=True, right_index=True)

print(left)
print(right)
print("\n"*2)
print(res)
(6)处理重叠区域,handle overlapping
import numpy as np
import pandas as pd
#handle overlapping
# Both frames have a key column 'k' and a second column that is also named
# 'age' in each, so the merged result needs a way to tell the two apart.
boys = pd.DataFrame(data={'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame(data={'k': ['K0', 'K1', 'K2'], 'age': [4, 5, 6]})

# 'k' is shared and appears once in the result. The clashing 'age' columns get
# the given suffixes appended: first for the left frame, second for the right.
# how='inner' is used here; 'outer' is the alternative.
res = boys.merge(girls, on='k', how='inner', suffixes=('_boy', '_girl'))

print(boys)
print(girls)
print("\n"*2)
print(res)
附加本次学习的所有原始代码:
'''
import pandas as pd
import numpy as np
data = pd.read_csv('zuobiao.csv')
print(data)
data.to_pickle('student.pickle')
#合并DataFrame,concatenaing
df1 = pd.DataFrame(np.ones((3,4))*0,columns = ['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns = ['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2,columns = ['a','b','c','d'])
#print(df1,"\n",df2,'\n',df3)
#上下合并
res = pd.concat([df1,df2,df3],axis = 0) #1是横向
##print(res)
##
##res2 = pd.concat([df1,df2,df3],axis = 0,ignore_index = True) #axis = 0是纵向(上下合并)
##print(res2)
# join,['inner','outer']
##df1 = pd.DataFrame(np.ones((3,4))*0,columns = ['a','b','c','d'],index = [1,2,3])
##df2 = pd.DataFrame(np.ones((3,4))*1,columns = ['b','c','d','e'],index = [2,3,4])
##res = pd.concat([df1,df2],join = 'inner',ignore_index = True) #outer
##print(res)
## join_axes
##df1 = pd.DataFrame(np.ones((3,4))*0,columns = ['a','b','c','d'],index = [1,2,3])
##df2 = pd.DataFrame(np.ones((3,4))*1,columns = ['b','c','d','e'],index = [2,3,4])
##res = pd.concat([df1,df2],axis = 1,join_axes = [df1.index])##不要join_axes
##print(res)
##append
df1 = pd.DataFrame(np.ones((3,4))*0,columns = ['a','b','c','d'],index = [1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1,columns = ['b','c','d','e'],index = [2,3,4])
res = df1.append(df2,ignore_index = True)
print(res)
##DataFrame加Series
df1 = pd.DataFrame(np.ones((3,4))*0,columns = ['a','b','c','d'])
s1 = pd.Series([1,2,3,4],index = ['a','b','c','d'])
res = df1.append(s1,ignore_index = True)
print(res)
'''
# 合并merge
import pandas as pd
import numpy as np
'''
left = pd.DataFrame({'key':['K0','K1','K2','K3'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key':['K0','K1','K2','K3'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
print(left)
print(right)
print("\n"*3)
res = pd.merge(left,right,on = 'key')
print(res)
'''
##有两个key
##left = pd.DataFrame({'key1':['K0','K1','K2','K3'],
## 'key2':['K0','K1','K0','K1'],
## 'A':['A0','A1','A2','A3'],
## 'B':['B0','B1','B2','B3']})
##
##right = pd.DataFrame({'key1':['K0','K1','K2','K3'],
## 'key2':['K0','K0','K0','K0'],
## 'C':['C0','C1','C2','C3'],
## 'D':['D0','D1','D2','D3']})
##
##print(left)
##print(right)
##print('\n'*3)
##res = pd.merge(left,right,on = ['key1','key2']) ##只考虑相同的值,交集
##print(res)
##res = pd.merge(left,right,on = ['key1','key2'],how = 'outer') ##基于两个的key,并集。不同的用nan
##print(res)
##res = pd.merge(left,right,on = ['key1','key2'],how = 'right') #基于right
##print(res)
###indicator参数
##df1 = pd.DataFrame({'col1':[0,1],'col_left':['a','b']})
##df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
##print(df1)
##print(df2)
##print('\n'*3)
##
##res = pd.merge(df1,df2,on = 'col1',how = 'outer',indicator = 'df')#indicator = True
##print(res )
#merged by index
##left = pd.DataFrame({'A':['A0','A1','A2'],
## 'B':['B0','B1','B2']},
## index = ['K0','K1','K2'])
##right = pd.DataFrame({'C':['C0','C1','C2'],
## 'D':['D0','D1','D2']},
## index = ['K0','K1','K2'])
##
##print(left)
##print(right)
##print("\n"*3)
##res = pd.merge(left,right,left_index = True,right_index = True,how = 'outer')
###res = pd.merge(left,right,left_index = True,right_index = True,how = 'inner')
##print(res)
#handle overlapping
# Two frames keyed by 'k' whose remaining column has the same name, 'age'.
shared_keys = ['K0', 'K1', 'K2']
boys = pd.DataFrame({'k': shared_keys, 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': shared_keys, 'age': [4, 5, 6]})

# Inner merge on 'k' ('outer' is the alternative); the two 'age' columns are
# told apart by the suffixes, left frame first.
merge_options = {'on': 'k', 'suffixes': ['_boy', '_girl'], 'how': 'inner'}
res = pd.merge(boys, girls, **merge_options)
print(res)
#join功能
#plt模块画图