2.1 pandas安装
Pandas可以通过Anaconda提供的conda命令来安装,安装命令如下:
conda install pandas
Pandas也可以通过PyPi的pip命令安装:
pip install pandas
2.2 pandas介绍
当需要处理表格数据(比如电子表格或数据库中的数据)时,Pandas是一个比较适合的工具。
Pandas将帮助你探索(explore)、清洗(clean)和处理(process)你的数据。在Pandas中,表格数据被称作DataFrame。
在Pandas中,基础的统计运算(mean(平均值)、median(中位数)、min(最小值)、max(最大值)、count(计数)等)都比较容易计算。
2.4.3
创建Series
# -*- coding: UTF-8 -*-
# These two packages are conventionally imported under the aliases np and pd.
import numpy as np
import pandas as pd
# Create a Series by passing in a list of values. The list contains np.nan,
# and the printed result shows dtype float64 with a default 0..5 integer index.
s = pd.Series([1,3,5,np.nan,6,8])
print(s)
输出结果为:
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
创建DataFrame
import numpy as np
import pandas as pd
# Build a DatetimeIndex of 6 periods starting at 2013-01-01.
# The printed index shows freq='D', i.e. the step is one day (not one month).
dates = pd.date_range('20130101',periods=6)
print(dates)
# Print a randomly generated array with 6 rows and 4 columns.
# NOTE: this is a separate random draw from the one used for df below,
# so the values printed here differ from the values printed for df.
print(np.random.randn(6,4))
# Build a DataFrame from a fresh random 6x4 array, using dates as the row index
# and the letters A, B, C, D as the column labels.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)
运行结果为:
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
[[-0.07608087 0.58558893 -0.21592845 0.12784177]
[ 0.247535 0.53836748 -0.70284328 1.54837594]
[ 1.64819574 -0.50206321 -0.35793019 0.68934274]
[ 0.22650068 0.12538524 0.40732857 -1.06068599]
[-2.23717262 0.33738822 -0.50604412 1.12995536]
[-0.08299078 -0.63831866 0.01021688 -0.01220394]]
A B C D
2013-01-01 0.869725 0.839124 -0.762421 -0.093006
2013-01-02 0.868084 -0.204707 -0.328201 -0.608614
2013-01-03 0.388475 0.954867 1.766084 -0.675314
2013-01-04 0.813794 -0.603895 -1.658760 -0.630126
2013-01-05 -0.929438 0.136639 0.621816 0.379015
2013-01-06 -1.339556 -0.729281 -0.036169 0.924692
再如:
import numpy as np
import pandas as pd
# Build a DataFrame from a dict: each key becomes a column label and each value
# supplies that column's data. The printed result has 4 rows (index 0..3);
# the scalar values for A, B and F are repeated on every row.
df2 = pd.DataFrame({
# Scalar float -> printed dtype float64.
'A':1.,
# Single timestamp -> printed dtype datetime64[ns].
'B':pd.Timestamp('20130102'),
# Series of 1s indexed 0..3 with an explicit float32 dtype.
'C':pd.Series(1,index=list(range(4)),dtype='float32'),
# NumPy array of four 3s with an explicit int32 dtype.
'D':np.array([3] * 4,dtype='int32'),
# Categorical column -> printed dtype category.
'E':pd.Categorical(["test","train","test","train"]),
# Scalar string -> printed dtype object.
'F':'foo'
})
print(df2)
print("-------------")
# Show the dtype of each column; the columns have different dtypes.
print(df2.dtypes)
输出结果为:
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
-------------
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
查看DataFrame中的数据 head、tail、 index、columns
import numpy as np
import pandas as pd
# Build a DatetimeIndex of 6 periods starting at 2013-01-01.
# The printed index shows freq='D', i.e. the step is one day (not one month).
dates = pd.date_range('20130101',periods=6)
print(dates)
# (Disabled) print a standalone random 6x4 array.
# print(np.random.randn(6,4))
# Build a DataFrame from a random 6x4 array, using dates as the row index
# and the letters A, B, C, D as the column labels.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)
# Show the first 3 rows of the DataFrame (head() shows 5 rows when no argument is passed).
print(df.head(3))
# Show the last 4 rows of the DataFrame (tail() shows the last 5 rows when no argument is passed).
print(df.tail(4))
# Show the row index of the DataFrame.
print(df.index)
# Show the column labels of the DataFrame.
print(df.columns)
输出结果:
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
A B C D
2013-01-01 -0.727105 -0.133146 1.562698 0.643809
2013-01-02 -0.607909 -1.682421 0.840893 0.884477
2013-01-03 1.291623 0.112634 -1.300335 -2.282469
2013-01-04 -1.133477 0.079521 1.591085 1.968505
2013-01-05 0.253467 1.530087 -2.272846 1.320857
2013-01-06 -0.460437 -1.982561 0.231264 -1.100951
A B C D
2013-01-01 -0.727105 -0.133146 1.562698 0.643809
2013-01-02 -0.607909 -1.682421 0.840893 0.884477
2013-01-03 1.291623 0.112634 -1.300335 -2.282469
A B C D
2013-01-03 1.291623 0.112634 -1.300335 -2.282469
2013-01-04 -1.133477 0.079521 1.591085 1.968505
2013-01-05 0.253467 1.530087 -2.272846 1.320857
2013-01-06 -0.460437 -1.982561 0.231264 -1.100951
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
Index(['A', 'B', 'C', 'D'], dtype='object')
to_numpy()
DataFrame.to_numpy()函数将DataFrame中的值转化为NumPy的数组。NumPy数组整体只有一个数据类型(dtype),而DataFrame的每一列可以有各自的数据类型;因此如果DataFrame中各列的数据类型不一样,pandas需要先找到一个能容纳所有列的公共类型(可能最终是object)再进行转换,这个转化将比较慢。如果各列的类型都是一样的,这样的转化比较快。
案例如下:
import numpy as np
import pandas as pd
# Build a DatetimeIndex of 6 periods starting at 2013-01-01.
# The printed index shows freq='D', i.e. the step is one day (not one month).
dates = pd.date_range('20130101',periods=6)
print(dates)
# (Disabled) print a standalone random 6x4 array.
# print(np.random.randn(6,4))
# Build a DataFrame from a random 6x4 array, using dates as the row index
# and the letters A, B, C, D as the column labels.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)
print("-------------------------")
# Convert the DataFrame's values to a NumPy array; as the printed result shows,
# only the values are kept, not the index or the column labels.
print(df.to_numpy())
输出结果:
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
A B C D
2013-01-01 -1.386452 1.248885 -0.571603 0.561053
2013-01-02 0.000677 2.068503 -0.716411 -1.608811
2013-01-03 -0.992540 0.145069 0.499039 -0.226581
2013-01-04 0.641727 -0.575535 -0.986220 0.511012
2013-01-05 0.958642 -0.151898 0.028765 1.911871
2013-01-06 1.416480 1.535678 -0.708689 -0.146528
-------------------------
[[-1.38645213e+00 1.24888452e+00 -5.71602643e-01 5.61052513e-01]
[ 6.77031253e-04 2.06850329e+00 -7.16410621e-01 -1.60881096e+00]
[-9.92539619e-01 1.45069051e-01 4.99039121e-01 -2.26580685e-01]
[ 6.41726963e-01 -5.75534913e-01 -9.86220354e-01 5.11012037e-01]
[ 9.58642437e-01 -1.51897711e-01 2.87646756e-02 1.91187122e+00]
[ 1.41647960e+00 1.53567763e+00 -7.08688613e-01 -1.46527696e-01]]
describe()快速统计汇总
import numpy as np
import pandas as pd
# Build a DatetimeIndex of 6 periods starting at 2013-01-01.
# The printed index shows freq='D', i.e. the step is one day (not one month).
dates = pd.date_range('20130101',periods=6)
print(dates)
# (Disabled) print a standalone random 6x4 array.
# print(np.random.randn(6,4))
# Build a DataFrame from a random 6x4 array, using dates as the row index
# and the letters A, B, C, D as the column labels.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)
print("-------------------------")
# Print a quick per-column statistical summary; the printed rows are
# count, mean, std, min, 25%, 50%, 75% and max.
print(df.describe())
输出结果:
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
A B C D
2013-01-01 -0.122144 0.502784 0.864083 -0.623890
2013-01-02 0.734360 2.029852 1.143485 0.229144
2013-01-03 -0.961763 0.685285 -0.769449 -1.356750
2013-01-04 -0.208984 1.350035 -1.097327 1.212215
2013-01-05 0.528868 -3.289768 -0.645706 1.026945
2013-01-06 0.490666 -0.653767 0.628475 -0.654120
-------------------------
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.076834 0.104070 0.020593 -0.027743
std 0.633704 1.889460 0.965375 1.022727
min -0.961763 -3.289768 -1.097327 -1.356750
25% -0.187274 -0.364630 -0.738513 -0.646562
50% 0.184261 0.594034 -0.008615 -0.197373
75% 0.519317 1.183847 0.805181 0.827495
max 0.734360 2.029852 1.143485 1.212215
转置数据(T)
import numpy as np
import pandas as pd
# Build a DatetimeIndex of 6 periods starting at 2013-01-01.
# With the default frequency the step is one day (not one month).
dates = pd.date_range('20130101',periods=6)
# (Disabled) print a standalone random 6x4 array.
# print(np.random.randn(6,4))
# Build a DataFrame from a random 6x4 array, using dates as the row index
# and the letters A, B, C, D as the column labels.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)
print("--------置换的结果是:将原来的index和Column置换了----------")
# Transpose the DataFrame: the original index becomes the column labels
# and the original column labels become the index.
print(df.T)
输出结果:
A B C D
2013-01-01 -0.483555 0.700367 2.140891 -0.735908
2013-01-02 -0.148087 1.923155 1.288311 -0.214712
2013-01-03 1.691412 1.468032 1.202893 0.741419
2013-01-04 -1.368299 -0.068072 -0.277387 0.012199
2013-01-05 0.859380 -0.869234 -0.163565 0.640557
2013-01-06 -1.156005 -0.311887 -0.015274 0.374591
--------置换的结果是:将原来的index和Column置换了----------
2013-01-01 2013-01-02 2013-01-03 2013-01-04 2013-01-05 2013-01-06
A -0.483555 -0.148087 1.691412 -1.368299 0.859380 -1.156005
B 0.700367 1.923155 1.468032 -0.068072 -0.869234 -0.311887
C 2.140891 1.288311 1.202893 -0.277387 -0.163565 -0.015274
D -0.735908 -0.214712 0.741419 0.012199 0.640557 0.374591
按照轴的标签(行索引或列名)排序sort_index
import numpy as np
import pandas as pd
# Build a DatetimeIndex of 6 periods starting at 2013-01-01.
# With the default frequency the step is one day (not one month).
dates = pd.date_range('20130101',periods=6)
# (Disabled) print a standalone random 6x4 array.
# print(np.random.randn(6,4))
# Build a DataFrame from a random 6x4 array, using dates as the row index
# and the letters A, B, C, D as the column labels.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)
print("--------结果是:发现编程了D,C,B,A列了,axis=1列序号值排序,axis=0行序号值排序----------")
# Sort by the labels along axis=1 (the column labels) in descending order,
# so the columns are printed in the order D, C, B, A. The row order is unchanged.
print(df.sort_index(axis=1,ascending=False))
运行结果:
A B C D
2013-01-01 -0.449586 -1.359781 -0.831226 -0.347369
2013-01-02 0.744838 -0.244150 0.360123 -0.296774
2013-01-03 -1.079490 -0.919209 -0.229262 -0.780102
2013-01-04 0.848343 -1.657268 0.077846 0.184712
2013-01-05 2.230455 -0.073798 -0.393167 -2.292176
2013-01-06 0.153200 0.881303 -1.247231 0.689450
--------结果是:发现编程了D,C,B,A列了,axis=1列序号值排序,axis=0行序号值排序----------
D C B A
2013-01-01 -0.347369 -0.831226 -1.359781 -0.449586
2013-01-02 -0.296774 0.360123 -0.244150 0.744838
2013-01-03 -0.780102 -0.229262 -0.919209 -1.079490
2013-01-04 0.184712 0.077846 -1.657268 0.848343
2013-01-05 -2.292176 -0.393167 -0.073798 2.230455
2013-01-06 0.689450 -1.247231 0.881303 0.153200
按照指定列的值进行排序sort_values
import numpy as np
import pandas as pd
# Build a DatetimeIndex of 6 periods starting at 2013-01-01.
# With the default frequency the step is one day (not one month).
dates = pd.date_range('20130101',periods=6)
# (Disabled) print a standalone random 6x4 array.
# print(np.random.randn(6,4))
# Build a DataFrame from a random 6x4 array, using dates as the row index
# and the letters A, B, C, D as the column labels.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)
print("--------结果是:B这一列的值降序排列了----------")
# Reorder the rows by the values in column B, largest first; each row keeps
# its original date label, so the printed index is no longer chronological.
print(df.sort_values(by='B',ascending=False))
输出结果:
A B C D
2013-01-01 -0.372298 1.488387 0.397128 -1.079578
2013-01-02 0.186005 -0.140236 -0.635494 0.259721
2013-01-03 -2.666026 1.843873 -1.106027 0.004454
2013-01-04 0.797870 -0.244366 -0.700616 -1.094778
2013-01-05 -2.361092 -0.272000 -1.099560 1.518242
2013-01-06 -0.294348 0.616753 2.184161 -1.132596
--------结果是:B这一列的值降序排列了----------
A B C D
2013-01-03 -2.666026 1.843873 -1.106027 0.004454
2013-01-01 -0.372298 1.488387 0.397128 -1.079578
2013-01-06 -0.294348 0.616753 2.184161 -1.132596
2013-01-02 0.186005 -0.140236 -0.635494 0.259721
2013-01-04 0.797870 -0.244366 -0.700616 -1.094778
2013-01-05 -2.361092 -0.272000 -1.099560 1.518242