版权声明:未经本人同意,禁止抄袭 https://blog.csdn.net/xiaolong_4_2/article/details/86509594
用python进行数据分析处理时,常用到的两个库就是numpy和pandas,以下是些常用的数据处理方法。
合并:
使用一个或多个键,将多行数据连接在一起。执行合并操作的函数为merge()。
import numpy as np
import pandas as pd
# Two small tables sharing the key column 'a'. Column 'b' exists in both,
# so after merging it shows up twice, as 'b_x' (left) and 'b_y' (right).
frame = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [2, 3, 4, 5], 'c': [6, 7, 8, 9]})
frame1 = pd.DataFrame({'a': [2, 4, 7, 9], 'b': [2, 0, 2, 33], 'f': [1, 1, 4, 5]})
# Inner join (the default): keep only rows whose 'a' occurs in both frames.
so = frame.merge(frame1, on='a')
print(so)
# Outer join: keep every 'a' from either frame; missing cells become NaN.
so1 = frame.merge(frame1, how='outer', on='a')
print(so1)
a b_x c b_y f
0 2 3 7 2 1
1 4 5 9 0 1
a b_x c b_y f
0 1 2.0 6.0 NaN NaN
1 2 3.0 7.0 2.0 1.0
2 3 4.0 8.0 NaN NaN
3 4 5.0 9.0 0.0 1.0
4 7 NaN NaN 2.0 4.0
5 9 NaN NaN 33.0 5.0
import numpy as np
import pandas as pd
# Same two tables as before, joined on the shared key column 'a'.
frame = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [2, 3, 4, 5], 'c': [6, 7, 8, 9]})
frame1 = pd.DataFrame({'a': [2, 4, 7, 9], 'b': [2, 0, 2, 33], 'f': [1, 1, 4, 5]})
# Left join: every row of `frame` is kept; unmatched right-hand cells are NaN.
so = frame.merge(frame1, how='left', on='a')
print(so)
# Right join: every row of `frame1` is kept; unmatched left-hand cells are NaN.
so1 = frame.merge(frame1, how='right', on='a')
print(so1)
a b_x c b_y f
0 1 2 6 NaN NaN
1 2 3 7 2.0 1.0
2 3 4 8 NaN NaN
3 4 5 9 0.0 1.0
a b_x c b_y f
0 2 3.0 7.0 2 1
1 4 5.0 9.0 0 1
2 7 NaN NaN 2 4
3 9 NaN NaN 33 5
import numpy as np
import pandas as pd
frame = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [2, 3, 4, 5], 'c': [6, 7, 8, 9]})
frame1 = pd.DataFrame({'a': [2, 4, 7, 9], 'b': [2, 0, 2, 33], 'f': [1, 1, 4, 5]})
# Merge on several keys at once: rows match only when BOTH 'a' and 'b' agree.
# No (a, b) pair is shared here, so the outer join keeps all 8 rows separately.
join_keys = ['a', 'b']
so = frame.merge(frame1, how='outer', on=join_keys)
print(so)
a b c f
0 1 2 6.0 NaN
1 2 3 7.0 NaN
2 3 4 8.0 NaN
3 4 5 9.0 NaN
4 2 2 NaN 1.0
5 4 0 NaN 1.0
6 7 2 NaN 4.0
7 9 33 NaN 5.0
import numpy as np
import pandas as pd
# The two frames have no column names in common, so join() needs no suffixes.
frame = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [2, 3, 4, 5], 'c': [6, 7, 8, 9]})
frame1 = pd.DataFrame({'a1': [2, 4, 7, 9], 'b1': [2, 0, 2, 33], 'f': [1, 1, 4, 5]})
# join() aligns rows by index label rather than by column values
# ('left' is the default join type, spelled out here).
so = frame.join(frame1, how='left')
print(so)
a b c a1 b1 f
0 1 2 6 2 2 1
1 2 3 7 4 0 1
2 3 4 8 7 2 4
3 4 5 9 9 33 5
拼接:
将数据结构按照x轴或y轴进行拼接。拼接所用的函数为concat()。
import numpy as np
import pandas as pd
frame = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [2, 3, 4, 5], 'c': [6, 7, 8, 9]})
frame1 = pd.DataFrame({'a1': [2, 4, 7, 9], 'b1': [2, 0, 2, 33], 'f': [1, 1, 4, 5]})
# Two random Series with disjoint index labels (1-4 and 5-8).
se1 = pd.Series(np.random.rand(4), index=[1, 2, 3, 4])
se2 = pd.Series(np.random.rand(4), index=[5, 6, 7, 8])
# Concatenate along the rows: the result is one longer Series.
so = pd.concat([se1, se2], axis='index')
print(so)
# Concatenate along the columns: the result is a DataFrame with one column
# per input Series; labels missing from a Series are filled with NaN.
so1 = pd.concat([se1, se2], axis='columns')
print(so1)
1 0.760366
2 0.946837
3 0.762230
4 0.817649
5 0.550010
6 0.370044
7 0.348296
8 0.862112
dtype: float64
0 1
1 0.760366 NaN
2 0.946837 NaN
3 0.762230 NaN
4 0.817649 NaN
5 NaN 0.550010
6 NaN 0.370044
7 NaN 0.348296
8 NaN 0.862112
import numpy as np
import pandas as pd
frame = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [2, 3, 4, 5], 'c': [6, 7, 8, 9]})
frame1 = pd.DataFrame({'a1': [2, 4, 7, 9], 'b1': [2, 0, 2, 33], 'f': [1, 1, 4, 5]})
se1 = pd.Series(np.random.rand(4), index=[1, 2, 3, 4])
se2 = pd.Series(np.random.rand(4), index=[5, 6, 7, 8])
# Row-wise concat with keys: each key becomes the outer level of a
# hierarchical index, tagging which input Series a value came from.
so = pd.concat([se1, se2], keys=[1, 2], axis='index')
print(so)
# Column-wise concat with keys: the keys become the column names.
so1 = pd.concat([se1, se2], keys=[1, 2], axis='columns')
print(so1)
1 1 0.936198
2 0.448478
3 0.269417
4 0.011118
2 5 0.163403
6 0.714117
7 0.052092
8 0.869981
dtype: float64
1 2
1 0.936198 NaN
2 0.448478 NaN
3 0.269417 NaN
4 0.011118 NaN
5 NaN 0.163403
6 NaN 0.714117
7 NaN 0.052092
8 NaN 0.869981
DataFrame对象拼接的方法与Series对象相同。
组合:
对两个Series对象(索引完全相同或部分相同)进行组合。组合所用的函数为combine_first()。
import numpy as np
import pandas as pd
frame = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [2, 3, 4, 5], 'c': [6, 7, 8, 9]})
frame1 = pd.DataFrame({'a1': [2, 4, 7, 9], 'b1': [2, 0, 2, 33], 'f': [1, 1, 4, 5]})
# Two Series whose indexes overlap only partially (labels 2 and 3 are shared).
se1 = pd.Series(np.random.rand(4), index=[1, 2, 3, 4])
print(se1)
se2 = pd.Series(np.random.rand(4), index=[5, 2, 7, 3])
print(se2)
# combine_first(): values of se1 win wherever it has a label; se2 only
# contributes the labels se1 lacks (5 and 7). The result index is the union.
so = se1.combine_first(other=se2)
print(so)
1 0.124557
2 0.701502
3 0.839564
4 0.760734
dtype: float64
5 0.604877
2 0.012177
7 0.132469
3 0.896560
dtype: float64
1 0.124557
2 0.701502
3 0.839564
4 0.760734
5 0.604877
7 0.132469
dtype: float64
import numpy as np
import pandas as pd
frame = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [2, 3, 4, 5], 'c': [6, 7, 8, 9]})
frame1 = pd.DataFrame({'a1': [2, 4, 7, 9], 'b1': [2, 0, 2, 33], 'f': [1, 1, 4, 5]})
se1 = pd.Series(np.random.rand(4), index=[1, 2, 3, 4])
print(se1)
se2 = pd.Series(np.random.rand(4), index=[5, 2, 7, 3])
print(se2)
# Partial combination: take the first two rows of se1 and the first three
# rows of se2 (slices are positional, not label-based), then combine them.
head_of_se1 = se1.iloc[:2]
head_of_se2 = se2.iloc[:3]
so = head_of_se1.combine_first(head_of_se2)
print(so)
1 0.016991
2 0.774237
3 0.856929
4 0.847897
dtype: float64
5 0.255458
2 0.302640
7 0.790559
3 0.112020
dtype: float64
1 0.016991
2 0.774237
5 0.255458
7 0.790559
dtype: float64
轴向旋转:
按行或列重新调整元素。轴向旋转所用的函数为stack()、unstack()(轴向旋转)、pivot()(长格式的DataFrame转化为宽格式)。
import numpy as np
import pandas as pd
frame = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [2, 3, 4, 5], 'c': [6, 7, 8, 9]})
print(frame)
frame1 = pd.DataFrame({'a1': [2, 4, 7, 9], 'b1': [2, 0, 2, 33], 'f': [1, 1, 4, 5]})
# stack(): pivot the columns into an inner index level, turning the
# DataFrame into a Series with a two-level (row label, column name) index.
se1 = frame.stack()
print(se1)
# unstack(): the inverse; the inner level goes back to being the columns.
so = se1.unstack()
print(so)
# unstack(level=0): move the OUTER level (the original row labels) to the
# columns instead, which yields the transposed table.
so1 = se1.unstack(level=0)
print(so1)
a b c
0 1 2 6
1 2 3 7
2 3 4 8
3 4 5 9
0 a 1
b 2
c 6
1 a 2
b 3
c 7
2 a 3
b 4
c 8
3 a 4
b 5
c 9
dtype: int64
a b c
0 1 2 6
1 2 3 7
2 3 4 8
3 4 5 9
0 1 2 3
a 1 2 3 4
b 2 3 4 5
c 6 7 8 9
import numpy as np
import pandas as pd
# Long-format data: one row per (color, name) pair with its value.
data = {
    'color': ['write', 'write', 'write', 'blank', 'blank', 'blank', 'red', 'red', 'red'],
    'name': ['w', 'e', 'f', 'e', 'f', 'w', 'f', 'e', 'w'],
    'value': [1, 2, 2, 6, 9, 2, 4, 7, 7],
}
frame = pd.DataFrame(data)
print(frame)
# Convert the long-format DataFrame to wide format: one row per color and
# one column per name. The arguments must be passed by keyword; pandas 2.x
# no longer accepts them positionally.
ge = frame.pivot(index='color', columns='name')
print(ge)
color name value
0 write w 1
1 write e 2
2 write f 2
3 blank e 6
4 blank f 9
5 blank w 2
6 red f 4
7 red e 7
8 red w 7
value
name e f w
color
blank 6 9 2
red 7 4 7
write 2 2 1
删除列:
import numpy as np
import pandas as pd
data = {
    'color': ['write', 'write', 'write', 'blank', 'blank', 'blank', 'red', 'red', 'red'],
    'name': ['w', 'e', 'f', 'e', 'f', 'w', 'f', 'e', 'w'],
    'value': [1, 2, 2, 6, 9, 2, 4, 7, 7],
}
frame = pd.DataFrame(data)
print(frame)
# Remove the 'color' column from the frame in place.
frame.drop(columns=['color'], inplace=True)
print(frame)
color name value
0 write w 1
1 write e 2
2 write f 2
3 blank e 6
4 blank f 9
5 blank w 2
6 red f 4
7 red e 7
8 red w 7
name value
0 w 1
1 e 2
2 f 2
3 e 6
4 f 9
5 w 2
6 f 4
7 e 7
8 w 7
二:数据转换
在调整了数据的形式和结构之后,就需要对数据进一步处理(对无效、重复、缺失数据进行替换或删除等处理)。--duplicated()检测(过滤出)重复元素、drop_duplicates()删除重复元素。
过滤和删除重复元素:
import numpy as np
import pandas as pd
data = {
    'color': ['write', 'write', 'write', 'blank', 'blank', 'blank', 'red', 'red', 'red'],
    'value': [1, 2, 2, 6, 9, 2, 4, 7, 7],
}
frame = pd.DataFrame(data)
print(frame)
# duplicated() flags every row that repeats an earlier row; using the flags
# as a boolean mask selects just those repeated rows.
is_repeat = frame.duplicated()
frame1 = frame.loc[is_repeat]
print(frame1)
# drop_duplicates() does the opposite: it keeps the first occurrence of each
# row and discards the repeats.
frame2 = frame.drop_duplicates()
print(frame2)
color value
0 write 1
1 write 2
2 write 2
3 blank 6
4 blank 9
5 blank 2
6 red 4
7 red 7
8 red 7
color value
2 write 2
8 red 7
color value
0 write 1
1 write 2
3 blank 6
4 blank 9
5 blank 2
6 red 4
7 red 7
映射:
1、替换数据结构的元素(replace())
import numpy as np
import pandas as pd
data = {
    'color': ['write', 'write', 'write', 'blank', 'red', 'red'],
    'value': [1, 2, 2, 6, 9, 7],
}
frame = pd.DataFrame(data)
print(frame)
# Replace elements via an {old: new} mapping; it is applied to every column,
# so 'write' becomes 'blue' and the number 1 becomes 99.
data1 = {'write': 'blue', 1: 99}
frame1 = frame.replace(to_replace=data1)
print(frame1)
color value
0 write 1
1 write 2
2 write 2
3 blank 6
4 red 9
5 red 7
color value
0 blue 99
1 blue 2
2 blue 2
3 blank 6
4 red 9
5 red 7
import numpy as np
import pandas as pd
# A Series containing two missing values.
se1 = pd.Series([1, np.nan, 3, 4, np.nan])
print(se1)
# Replace every NaN with 0; se1 itself is left unchanged.
se2 = se1.replace(to_replace=np.nan, value=0)
print(se2)
0 1.0
1 NaN
2 3.0
3 4.0
4 NaN
dtype: float64
0 1.0
1 0.0
2 3.0
3 4.0
4 0.0
dtype: float64
2、从另一个数据结构中获取数据,并赋值到目标数据结构的列中:
import numpy as np
import pandas as pd
data = {
    'color': ['write', 'write', 'write', 'blank', 'red', 'red'],
    'value': [1, 2, 2, 6, 9, 7],
}
frame = pd.DataFrame(data)
print(frame)
# Look up each row's color in a separate mapping and store the result in a
# new 'price' column of the frame.
price = {'write': 22, 'blank': 99, 'red': 77}
colors = frame['color']
frame['price'] = colors.map(price)
print(frame)
color value
0 write 1
1 write 2
2 write 2
3 blank 6
4 red 9
5 red 7
color value price
0 write 1 22
1 write 2 22
2 write 2 22
3 blank 6 99
4 red 9 77
5 red 7 77
3、重命名轴标签:
import numpy as np
import pandas as pd
data = {
    'color': ['write', 'write', 'write', 'blank', 'red', 'red'],
    'value': [1, 2, 2, 6, 9, 7],
}
frame = pd.DataFrame(data)
print(frame)
# Rename the row labels through an {old: new} mapping.
reindex = {0: 'zero', 1: 'first', 2: 'second', 3: 'third', 4: 'forth', 5: 'fifth'}
frame1 = frame.rename(index=reindex)
print(frame1)
# Rename one row label and one column name in a single call; labels not
# mentioned in the mappings stay as they are.
frame2 = frame.rename(columns={'value': 'value1'}, index={1: 'first'})
print(frame2)
color value
0 write 1
1 write 2
2 write 2
3 blank 6
4 red 9
5 red 7
color value
zero write 1
first write 2
second write 2
third blank 6
forth red 9
fifth red 7
color value1
0 write 1
first write 2
2 write 2
3 blank 6
4 red 9
5 red 7
离散化和面元划分:
import numpy as np
import pandas as pd
result = [1, 2, 3, 5, 6, 7, 8, 9, 0, 3, 34, 5, 4, 2, 31, 22, 11, 66]
# Discretize: assign every value to one of the half-open bins
# (0, 25], (25, 50], (50, 75], (75, 100]. The left edge is exclusive, so the
# value 0 falls into no bin and becomes NaN.
ca = pd.cut(result, bins=[0, 25, 50, 75, 100])
print(ca)
# codes gives, for each element of result, the position of the bin it
# belongs to (-1 for an element that falls into no bin). This replaces the
# `labels` attribute, which no longer exists on Categorical.
print(ca.codes)
# Number of elements that landed in each bin, listed in bin order.
cat = ca.value_counts()
print(cat)
[(0, 25], (0, 25], (0, 25], (0, 25], (0, 25], ..., (0, 25], (25, 50], (0, 25], (0, 25], (50, 75]]
Length: 18
Categories (4, interval[int64]): [(0, 25] < (25, 50] < (50, 75] < (75, 100]]
[ 0 0 0 0 0 0 0 0 -1 0 1 0 0 0 1 0 0 2]
(0, 25] 14
(25, 50] 2
(50, 75] 1
(75, 100] 0
dtype: int64
import numpy as np
import pandas as pd
result = [1, 2, 3, 5, 6, 7, 8, 9, 0, 3, 34, 5, 4, 2, 31, 22, 11, 66]
# Discretize into the same four bins, but give each bin a readable name
# instead of showing it as an interval.
bin_names = ['first', 'second', 'third', 'forth']
bin_edges = [0, 25, 50, 75, 100]
ca = pd.cut(result, bin_edges, labels=bin_names)
print(ca)
[first, first, first, first, first, ..., first, second, first, first, third]
Length: 18
Categories (4, object): [first < second < third < forth]
import numpy as np
import pandas as pd
result = [1, 2, 3, 5, 6, 7, 8, 9, 0, 3, 34, 5, 4, 2, 31, 22, 11, 66]
# Quantile-based discretization: split the values into 5 bins whose edges
# are taken from the sample quantiles rather than fixed by hand.
ca = pd.qcut(result, q=5)
print(ca)
[(-0.001, 2.4], (-0.001, 2.4], (2.4, 4.8], (4.8, 7.2], (4.8, 7.2], ..., (-0.001, 2.4], (17.6, 66.0], (17.6, 66.0], (7.2, 17.6], (17.6, 66.0]]
Length: 18
Categories (5, interval[float64]): [(-0.001, 2.4] < (2.4, 4.8] < (4.8, 7.2] < (7.2, 17.6] <
(17.6, 66.0]]
排序和随机取样:
import numpy as np
import pandas as pd
data = {'a': [1, 2, 3, 4, 5], 'b': [4, 5, 6, 7, 8], 'c': [6, 7, 8, 9, 0]}
frame = pd.DataFrame(data)
print(frame)
# Pick rows by position, in the given order (rows 2, 1 and 4).
row_order = [2, 1, 4]
frame1 = frame.take(row_order, axis=0)
print(frame1)
a b c
0 1 4 6
1 2 5 7
2 3 6 8
3 4 7 9
4 5 8 0
a b c
2 3 6 8
1 2 5 7
4 5 8 0
import numpy as np
import pandas as pd
data = {'a': [1, 2, 3, 4, 5], 'b': [4, 5, 6, 7, 8], 'c': [6, 7, 8, 9, 0]}
frame = pd.DataFrame(data)
print(frame)
# Draw three random row positions in [0, len(frame)) and take those rows.
# randint draws independently, so the same row may be picked more than once.
a = np.random.randint(low=0, high=frame.shape[0], size=3)
frame1 = frame.take(a)
print(frame1)
a b c
0 1 4 6
1 2 5 7
2 3 6 8
3 4 7 9
4 5 8 0
a b c
2 3 6 8
1 2 5 7
4 5 8 0
三:数据聚合(GroupBy)
对数据集进行分组(按行或按列),对每一组的数据进行函数处理,把不同组得到的结果合并成一个新对象。
import numpy as np
import pandas as pd
data = {
    'color': ['write', 'write', 'green', 'green', 'yellow'],
    'item': ['ben', 'an', 'qq', 'qq', 'ee'],
    'b': [4, 5, 6, 7, 8],
    'c': [6, 7, 8, 9, 0],
}
frame = pd.DataFrame(data)
print(frame)
# Group the rows by color and select column b for processing.
res = frame.groupby('color')['b']
# Average of b within each color group.
print(res.mean())
b c color item
0 4 6 write ben
1 5 7 write an
2 6 8 green qq
3 7 9 green qq
4 8 0 yellow ee
color
green 6.5
write 4.5
yellow 8.0
Name: b, dtype: float64
等级分组:
import numpy as np
import pandas as pd
data = {
    'color': ['write', 'write', 'green', 'green', 'yellow'],
    'item': ['ben', 'an', 'qq', 'qq', 'ee'],
    'b': [4, 5, 6, 7, 8],
    'c': [6, 7, 8, 9, 0],
}
frame = pd.DataFrame(data)
print(frame)
# Hierarchical grouping: group by color first and by item within each color,
# then select column b for processing.
res = frame.groupby(['color', 'item'])['b']
# Sum of b within each (color, item) group.
print(res.sum())
b c color item
0 4 6 write ben
1 5 7 write an
2 6 8 green qq
3 7 9 green qq
4 8 0 yellow ee
color item
green qq 13
write an 5
ben 4
yellow ee 8
Name: b, dtype: int64
链式转换(DataFrame之间存在生成关系):
import numpy as np
import pandas as pd
data = {
    'color': ['write', 'write', 'green', 'green', 'yellow'],
    'item': ['ben', 'an', 'qq', 'qq', 'ee'],
    'b': [4, 5, 6, 7, 8],
    'c': [6, 7, 8, 9, 0],
}
frame = pd.DataFrame(data)
# Hierarchical grouping of the whole frame by its color and item columns.
group_keys = [frame[col] for col in ('color', 'item')]
res = frame.groupby(group_keys)
# When results are produced by a chain of transformations, prefixing the
# column names marks them as derived while keeping the link to the source
# columns recognizable.
print(res.sum().add_prefix('add12'))
add12b add12c
color item
green qq 13 17
write an 5 7
ben 4 6
yellow ee 8 0
分组函数:
import numpy as np
import pandas as pd
data = {
    'color': ['write', 'write', 'green', 'green', 'yellow'],
    'item': ['ben', 'an', 'qq', 'qq', 'ee'],
    'b': [4, 5, 6, 7, 8],
    'c': [6, 7, 8, 9, 0],
}
frame = pd.DataFrame(data)
# Group the rows by color and select column b for processing.
res = frame.groupby('color')['b']
# Built-in group function: the quantile of each group (0.5, i.e. the median,
# is the default and is spelled out here).
print(res.quantile(q=0.5))
# agg() applies several aggregations at once and returns one column per
# aggregation: here the mean and the standard deviation of each group.
res1 = res.aggregate(['mean', 'std'])
print(res1)
color
green 6.5
write 4.5
yellow 8.0
Name: b, dtype: float64
mean std
color
green 6.5 0.707107
write 4.5 0.707107
yellow 8.0 NaN
高级聚合函数:
import numpy as np
import pandas as pd
data = {
    'color': ['write', 'write', 'green', 'green', 'yellow'],
    'item': ['ben', 'an', 'qq', 'qq', 'ee'],
    'b': [4, 5, 6, 7, 8],
    'c': [6, 7, 8, 9, 0],
}
frame = pd.DataFrame(data)
# transform(): compute the sum of b within each color group and broadcast
# that group total back onto every original row, so the result has one value
# per row of frame. add_prefix() then prefixes each row label with 'add_'.
# The aggregation is named by the string 'sum' rather than passed as np.sum,
# which newer pandas versions warn about.
res = frame['b'].groupby(frame['color'])
res1 = res.transform('sum').add_prefix('add_')
print(res1)
# apply(): run an arbitrary function on each (color, item) group; here it
# returns the largest b of the group.
res2 = frame['b'].groupby([frame['color'], frame['item']])
res3 = res2.apply(lambda x: x.max())
print(res3)
add_0 9
add_1 9
add_2 13
add_3 13
add_4 8
Name: b, dtype: int64
color item
green qq 7
write an 5
ben 4
yellow ee 8
Name: b, dtype: int64