pandas 是基于 NumPy 构建的库,也是 Python 中非常常用的数据处理库。下面依次介绍它的三个核心概念:
- Series
- DataFrame
- Index
1. Series
>>> import pandas as pd
>>> import numpy as np
>>> s = pd.Series([7,'beijing',2.1,2,'happy'])
>>> s
0 7
1 beijing
2 2.1
3 2
4 happy
dtype: object
>>> s = pd.Series([7,'beijing',2.1,2,'happy'],index = ['A','B','C','D','E']) #创建时通过index参数指定自定义的index
>>> s
A 7
B beijing
C 2.1
D 2
E happy
dtype: object
>>> type(s)
<class 'pandas.core.series.Series'>
>>> cities = {'beijing':55000,'shanghai':60000,'shenzhen':40000,'guangzhou':25000}
>>> cities
{'beijing': 55000, 'shanghai': 60000, 'shenzhen': 40000, 'guangzhou': 25000}
>>> apts = pd.Series(cities) #用dict来定义一个Series:dict的key成为index,value成为数据(Series可以看作一组带标签的key-value对)
>>> apts
beijing 55000
guangzhou 25000
shanghai 60000
shenzhen 40000
dtype: int64
>>> apts[['beijing','shenzhen']]
beijing 55000
shenzhen 40000
dtype: int64
>>> apts[apts<50000]
guangzhou 25000
shenzhen 40000
dtype: int64
>>> 'beijing' in apts #查看某个标签是否在Series的index中(in检查的是index,而不是value)
True
>>> apts[apts.isnull()] #查看value为null的元素
Series([], dtype: int64)
>>> apts[apts.notnull()] #查看value非null的元素
beijing 55000
guangzhou 25000
shanghai 60000
shenzhen 40000
dtype: int64
2. DataFrame
一个DataFrame就是一张表格:Series表示的是带标签的一维数组,DataFrame则是带行、列标签的二维表,它的每一列都是一个Series。
注意:下面示例中使用的 .ix 已在 pandas 1.0 中被移除,在新版本中请改用 .loc(按标签)或 .iloc(按位置)。
>>> import pandas as pd
>>> import numpy as np
>>> data = {'cities':['beijing','shanghai','guangzhou','shenzhen'],'years':[2014,2015,2016,2017],'population':[1000,2000,3000,4000]}
>>> type(pd.DataFrame(data))
<class 'pandas.core.frame.DataFrame'>
>>> pd.DataFrame(data) #DataFrame的初始化
      cities  population  years
0    beijing        1000   2014
1   shanghai        2000   2015
2  guangzhou        3000   2016
3   shenzhen        4000   2017
>>> pd.DataFrame(data,columns=['years','cities','population'])#指定列的顺序
   years     cities  population
0   2014    beijing        1000
1   2015   shanghai        2000
2   2016  guangzhou        3000
3   2017   shenzhen        4000
>>> pd.DataFrame(data,columns=['years','cities','population'],index=['one','two','three','foue'])#指定行的名称(index)
       years     cities  population
one     2014    beijing        1000
two     2015   shanghai        2000
three   2016  guangzhou        3000
foue    2017   shenzhen        4000
>>> frame2 = pd.DataFrame(data,columns=['years','cities','population'],index=['one','two','three','foue'])
>>> frame2
       years     cities  population
one     2014    beijing        1000
two     2015   shanghai        2000
three   2016  guangzhou        3000
foue    2017   shenzhen        4000
>>> frame2['cities']#取出某一列
one        beijing
two       shanghai
three    guangzhou
foue      shenzhen
Name: cities, dtype: object
>>> frame2.cities
one        beijing
two       shanghai
three    guangzhou
foue      shenzhen
Name: cities, dtype: object
>>> frame2.ix['three']#按标签取出某一行(新版pandas中请用 frame2.loc['three'])
years              2016
cities        guangzhou
population         3000
Name: three, dtype: object
>>> frame2.ix[2]#按位置取出某一行(新版pandas中请用 frame2.iloc[2])
years              2016
cities        guangzhou
population         3000
Name: three, dtype: object
>>> frame2['cities']['one'] = 'zhuhai'#修改某个元素(链式赋值会触发下面的警告,推荐写成 frame2.loc['one','cities'] = 'zhuhai')

Warning (from warnings module):
  File "__main__", line 1
SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
>>> frame2
       years     cities  population
one     2014     zhuhai        1000
two     2015   shanghai        2000
three   2016  guangzhou        3000
foue    2017   shenzhen        4000
>>> frame2['population'] = 1#修改一整列
>>> frame2
       years     cities  population
one     2014     zhuhai           1
two     2015   shanghai           1
three   2016  guangzhou           1
foue    2017   shenzhen           1
>>> frame2.ix['five'] = 1#给不存在的行标签赋值,会新增一行
>>> frame2
       years     cities  population
one     2014     zhuhai           1
two     2015   shanghai           1
three   2016  guangzhou           1
foue    2017   shenzhen           1
five       1          1           1
>>> frame2.ix['one'] = 2#修改一整行
>>> frame2
       years     cities  population
one        2          2           2
two     2015   shanghai           1
three   2016  guangzhou           1
foue    2017   shenzhen           1
five       1          1           1
>>> frame2.years = np.arange(5)#用numpy数组(np.arange)来修改一列
>>> frame2
       years     cities  population
one        0          2           2
two        1   shanghai           1
three      2  guangzhou           1
foue       3   shenzhen           1
five       4          1           1
>>> val = pd.Series([200,300,500],index=['two','three','five'])
>>> frame2['population'] = val#用一个Series来修改某一列(按index对齐,Series中没有的行变为NaN)
>>> frame2
       years     cities  population
one        0          2         NaN
two        1   shanghai       200.0
three      2  guangzhou       300.0
foue       3   shenzhen         NaN
five       4          1       500.0
>>> frame2.columns
Index(['years', 'cities', 'population'], dtype='object')
>>> frame2.index
Index(['one', 'two', 'three', 'foue', 'five'], dtype='object')
>>> frame2.T#转置
            one       two      three      foue  five
years         0         1          2         3     4
cities        2  shanghai  guangzhou  shenzhen     1
population  NaN       200        300       NaN   500
>>> frame2['cities'][1:2]#用切片来取元素
two    shanghai
Name: cities, dtype: object
>>>
3. Index
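下面是与index(索引)相关的一些常用操作:按位置/按标签取值与切片、reindex、drop,以及多级索引(MultiIndex)的stack/unstack。
注意:示例中的 .ix 已在 pandas 1.0 中被移除,在新版本中请改用 .loc(按标签)或 .iloc(按位置)。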
>>> import pandas as pd
>>> import numpy as np
>>> obj = pd.Series(range(3))
>>> obj
0    0
1    1
2    2
dtype: int64
>>> obj = pd.Series(range(3),index=['a','b','c'])
>>> obj
a    0
b    1
c    2
dtype: int64
>>> obj[[0,2]]#按位置取多个元素
a    0
c    2
dtype: int64
>>> obj[0:2]#按位置切片,不包含末端
a    0
b    1
dtype: int64
>>> obj['a':'c']#按标签切片,包含末端'c'
a    0
b    1
c    2
dtype: int64
>>> obj['a':'c'] = 3
>>> obj
a    3
b    3
c    3
dtype: int64
>>> frame = pd.DataFrame(np.arange(9).reshape(3,3),index = ['a','b','c'],columns=['beijing','shanghai','guangzhou'])
>>> frame
   beijing  shanghai  guangzhou
a        0         1          2
b        3         4          5
c        6         7          8
>>> frame.ix['a':'c']
   beijing  shanghai  guangzhou
a        0         1          2
b        3         4          5
c        6         7          8
>>> frame.ix[['a','c'],['beijing','guangzhou']]
   beijing  guangzhou
a        0          2
c        6          8
>>> frame.ix[:,'beijing':'guangzhou']
   beijing  shanghai  guangzhou
a        0         1          2
b        3         4          5
c        6         7          8
>>> frame.reindex(['e','f','g','h'])#新标签在原index中不存在,对应的行全部为NaN
   beijing  shanghai  guangzhou
e      NaN       NaN        NaN
f      NaN       NaN        NaN
g      NaN       NaN        NaN
h      NaN       NaN        NaN
>>> frame
   beijing  shanghai  guangzhou
a        0         1          2
b        3         4          5
c        6         7          8
>>> frame.drop('a')#drop返回一个新对象,原来的frame不变
   beijing  shanghai  guangzhou
b        3         4          5
c        6         7          8
>>> frame
   beijing  shanghai  guangzhou
a        0         1          2
b        3         4          5
c        6         7          8
>>> frame = frame.drop('a')
>>> frame
   beijing  shanghai  guangzhou
b        3         4          5
c        6         7          8
>>> data = pd.Series(np.random.randn(10),index=[['a','a','a','b','b','c','c','c','d','d'],[1,2,3,1,2,1,2,3,1,2]])
>>> data
a  1   -0.060544
   2   -1.680403
   3    0.408582
b  1    1.001766
   2    1.320155
c  1   -1.125726
   2    1.508404
   3    0.640139
d  1    0.824988
   2    0.148888
dtype: float64
>>> data.index
MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 2, 2, 2, 3, 3], [0, 1, 2, 0, 1, 0, 1, 2, 0, 1]])
>>> data['b':'d']
b  1    1.001766
   2    1.320155
c  1   -1.125726
   2    1.508404
   3    0.640139
d  1    0.824988
   2    0.148888
dtype: float64
>>> data[1:4]
a  2   -1.680403
   3    0.408582
b  1    1.001766
dtype: float64
>>> data.unstack()
          1         2         3
a -0.060544 -1.680403  0.408582
b  1.001766  1.320155       NaN
c -1.125726  1.508404  0.640139
d  0.824988  0.148888       NaN
>>> data.unstack().stack()
a  1   -0.060544
   2   -1.680403
   3    0.408582
b  1    1.001766
   2    1.320155
c  1   -1.125726
   2    1.508404
   3    0.640139
d  1    0.824988
   2    0.148888
dtype: float64
>>>