知道pandas主要的数据结构以后就可以实际操作学习了,其实学习主要还是学习如何使用Series和Dataframe的属性和方法使用规则,多使用,多看官方文档就差不多了,下面开始学习pandas:
Series的创建
使用Python的列表创建Series:
import numpy as np
import pandas as pd
# 使用list创建
s1 = pd.Series([1,2,3,4]) # 可以发现索引index默认从0开始进行自动索引
s1
0 1
1 2
2 3
3 4
dtype: int64
# 值属性,可以方便查看Series的值
s1.values
array([1, 2, 3, 4], dtype=int64)
# 索引index属性,返回的是索引从开始到结束和间隔的值
s1.index
RangeIndex(start=0, stop=4, step=1)
# 使用numpy的数组进行创建
s2 = pd.Series(np.arange(10))
s2
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
dtype: int32
# 通过字典进行创建
s3 = pd.Series({'1':1, '2':2, '3':3})
s3
1 1
2 2
3 3
dtype: int64
s3.values
array([1, 2, 3], dtype=int64)
s3.index
Index(['1', '2', '3'], dtype='object')
# 手动赋值索引
s4 = pd.Series([1,2,3,4], index=['A','B','C','D'])
s4
A 1
B 2
C 3
D 4
dtype: int64
s4.values
array([1, 2, 3, 4], dtype=int64)
s4.index
Index(['A', 'B', 'C', 'D'], dtype='object')
s4['A'] # 根据索引取值
1
s4[s4>1] # 根据值得范围取值
B 2
C 3
D 4
dtype: int64
s4.to_dict() # 把Series转换为字典输出,也就是说可以通过字典创建Series,也可以通过Series转换为字典
{'A': 1, 'B': 2, 'C': 3, 'D': 4}
s5 = pd.Series(s4.to_dict()) # 来回转
s5
A 1
B 2
C 3
D 4
dtype: int64
index_1 = ['A','B','C','D','E'] # 可单独把索引写出,再赋值给Series,同时多增加一个索引
s6 = pd.Series(s5, index=index_1)
s6 # 多增加的索引的值为NAN
A 1.0
B 2.0
C 3.0
D 4.0
E NaN
dtype: float64
pd.isnull(s6) # 根据pd.isnall()判断Series的元素是否有空值,如果有返回Ture,反之False
A False
B False
C False
D False
E True
dtype: bool
pd.notnull(s6) # 类似的操作
A True
B True
C True
D True
E False
dtype: bool
s6.name = 'demo' # 给Series赋予名字
s6
A 1.0
B 2.0
C 3.0
D 4.0
E NaN
Name: demo, dtype: float64
s6.index.name = 'demo_index' # 给索引起个名字
s6
demo_index
A 1.0
B 2.0
C 3.0
D 4.0
E NaN
Name: demo, dtype: float64
s6.index
Index(['A', 'B', 'C', 'D', 'E'], dtype='object', name='demo_index')
pandas的Dataframe学习
pandas的Dataframe学习
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import webbrowser
link = 'https://www.tiobe.com//tiobe-index/'
webbrowser.open(link)
True
df = pd.read_clipboard() # 读取粘贴板的内容
df
Sep 2018 Sep.1 2017 Change Programming Language Ratings Change.1
0 0 1 1 NaN Java 17.436% +4.75% NaN NaN
1 1 2 2 NaN C 15.447% +8.06% NaN NaN
2 2 3 5 change Python 7.653% +4.67% NaN NaN
3 3 4 3 change C++ 7.394% +1.83% NaN NaN
4 4 5 8 change Visual Basic .NET 5.308% +3.33%
5 5 6 4 change C# 3.295% -1.48% NaN NaN
6 6 7 6 change PHP 2.775% +0.57% NaN NaN
7 7 8 7 change JavaScript 2.131% +0.11% NaN NaN
8 8 9 - change SQL 2.062% +2.06% NaN NaN
9 9 10 18 change Objective-C 1.509% +0.00% NaN NaN
type(df)
pandas.core.frame.DataFrame
df.columns # 读取索引,该操作作用很大,可以读取行的索引,以此可以通过该索引进行访问
Index(['Sep', '2018', 'Sep.1', '2017', 'Change', 'Programming', 'Language',
'Ratings', 'Change.1'],
dtype='object')
df.Ratings # 通过标签读取该列内容
0 17.436%
1 15.447%
2 7.653%
3 7.394%
4 5.308%
5 3.295%
6 2.775%
7 2.131%
8 2.062%
9 1.509%
Name: Ratings, dtype: object
# 从原先的dataframe中提取想要的数据,并赋给新的dataframe,该操作很有用
df_new = DataFrame(df, columns=['Sep 2019','Sep 2018', 'Change', 'Programming Language'])
# 同时增加新的标签,并自动赋值nan
df_new
Sep 2019 Sep 2018 Change Programming Language
0 NaN 1 NaN Java
1 NaN 2 NaN C
2 NaN 3 change Python
3 NaN 4 change C++
4 NaN 5 change Visual Basic .NET
5 NaN 6 change C#
6 NaN 7 change PHP
7 NaN 8 change JavaScript
8 NaN 9 change SQL
9 NaN 10 change Objective-C
df_new['Sep 2019'] = range(10) # 给新的列标签赋值
type(df['Sep 2018'])
pandas.core.series.Series
df_new
Sep 2019 Sep 2018 Change Programming Language
0 0 1 NaN Java
1 1 2 NaN C
2 2 3 change Python
3 3 4 change C++
4 4 5 change Visual Basic .NET
5 5 6 change C#
6 6 7 change PHP
7 7 8 change JavaScript
8 8 9 change SQL
9 9 10 change Objective-C
pandas_DataFrame常用读取各种格式的数据操作
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import webbrowser
link = 'https://pandas.pydata.org/pandas-docs/version/0.20/io.html'
webbrowser.open(link)
True
df1 = pd.read_clipboard() # 读取粘贴板的内容
df1
Format Type Data Description Reader Writer
0 text CSV read_csv to_csv
1 text JSON read_json to_json
2 text HTML read_html to_html
3 text Local clipboard read_clipboard to_clipboard
4 binary MS Excel read_excel to_excel
5 binary HDF5 Format read_hdf to_hdf
6 binary Feather Format read_feather to_feather
7 binary Msgpack read_msgpack to_msgpack
8 binary Stata read_stata to_stata
9 binary SAS read_sas None
10 binary Python Pickle Format read_pickle to_pickle
11 SQL SQL read_sql to_sql
12 SQL Google Big Query read_gbq to_gbq
df1.to_clipboard()
df1.to_csv('df1.csv',index=False) # 把内容写到名为df1.csv的文件中,把索引序号去除
!type df1.csv # window安装的jupyter使用该操作查看文件内容
Format Type,Data Description,Reader,Writer
text,CSV,read_csv,to_csv
text,JSON,read_json,to_json
text,HTML,read_html,to_html
text,Local clipboard,read_clipboard,to_clipboard
binary,MS Excel,read_excel,to_excel
binary,HDF5 Format,read_hdf,to_hdf
binary,Feather Format,read_feather,to_feather
binary,Msgpack,read_msgpack,to_msgpack
binary,Stata,read_stata,to_stata
binary,SAS,read_sas,
binary,Python Pickle Format,read_pickle,to_pickle
SQL,SQL,read_sql,to_sql
SQL,Google Big Query,read_gbq,to_gbq
df2 = pd.read_csv('df1.csv') # 在通过pd.read_csv读出来
df2
Format Type Data Description Reader Writer
0 text CSV read_csv to_csv
1 text JSON read_json to_json
2 text HTML read_html to_html
3 text Local clipboard read_clipboard to_clipboard
4 binary MS Excel read_excel to_excel
5 binary HDF5 Format read_hdf to_hdf
6 binary Feather Format read_feather to_feather
7 binary Msgpack read_msgpack to_msgpack
8 binary Stata read_stata to_stata
9 binary SAS read_sas NaN
10 binary Python Pickle Format read_pickle to_pickle
11 SQL SQL read_sql to_sql
12 SQL Google Big Query read_gbq to_gbq
df1.to_json()
'{"Format Type":{"0":"text","1":"text","2":"text","3":"text","4":"binary","5":"binary","6":"binary","7":"binary","8":"binary","9":"binary","10":"binary","11":"SQL","12":"SQL"},"Data Description":{"0":"CSV","1":"JSON","2":"HTML","3":"Local clipboard","4":"MS Excel","5":"HDF5 Format","6":"Feather Format","7":"Msgpack","8":"Stata","9":"SAS","10":"Python Pickle Format","11":"SQL","12":"Google Big Query"},"Reader":{"0":"read_csv","1":"read_json","2":"read_html","3":"read_clipboard","4":"read_excel","5":"read_hdf","6":"read_feather","7":"read_msgpack","8":"read_stata","9":"read_sas","10":"read_pickle","11":"read_sql","12":"read_gbq"},"Writer":{"0":"to_csv","1":"to_json","2":"to_html","3":"to_clipboard","4":"to_excel","5":"to_hdf","6":"to_feather","7":"to_msgpack","8":"to_stata","9":null,"10":"to_pickle","11":"to_sql","12":"to_gbq"}}'
pd.read_json(df1.to_json())
Format Type Data Description Reader Writer
0 text CSV read_csv to_csv
1 text JSON read_json to_json
10 binary Python Pickle Format read_pickle to_pickle
11 SQL SQL read_sql to_sql
12 SQL Google Big Query read_gbq to_gbq
2 text HTML read_html to_html
3 text Local clipboard read_clipboard to_clipboard
4 binary MS Excel read_excel to_excel
5 binary HDF5 Format read_hdf to_hdf
6 binary Feather Format read_feather to_feather
7 binary Msgpack read_msgpack to_msgpack
8 binary Stata read_stata to_stata
9 binary SAS read_sas None
df1.to_html('df1.html')
!dir
驱动器 C 中的卷是 OS
卷的序列号是 E28F-E81C
C:\Users\Admin\Documents\jupyter notebook\数据挖掘学习 的目录
2018/10/01 16:58 <DIR> .
2018/10/01 16:58 <DIR> ..
2018/10/01 16:34 <DIR> .ipynb_checkpoints
2018/10/01 16:47 508 df1.csv
2018/10/01 16:58 2,061 df1.html
2018/09/30 16:22 8,896 numpy_array.ipynb
2018/09/30 17:03 7,648 numpy_Array的input和output.ipynb
2018/09/30 21:33 154,056 numpy_example.ipynb
2018/10/01 13:11 20,844 numpy_example1.ipynb
2018/09/30 16:58 570 numpy_two_file.npz
2018/09/30 16:52 168 numpy_wb_array.npy
2018/09/30 16:31 13,759 numpy_数组与矩阵的运算.ipynb
2018/10/01 16:01 16,335 pandas_Dataframe.ipynb
2018/10/01 16:57 19,694 pandas_DataFrame_IO.ipynb
2018/10/01 15:42 9,681 pandas_series.ipynb
2018/09/30 16:45 195 x.pkl
2018/10/01 16:34 10,325 深入理解Series和DataFrame.ipynb
14 个文件 264,740 字节
3 个目录 37,539,487,744 可用字节
df1.to_excel('df1.xlsx',index=False)
还有很多其他的格式的读写操作,这里不一一列举了
Series和Dataframe的排序
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
# Series的排序
s1 = Series(np.random.randn(10))
s1
0 -0.424930
1 -0.279580
2 -0.763344
3 -0.018883
4 -0.217693
5 -1.367801
6 -1.402714
7 -1.125427
8 -0.560325
9 0.037897
dtype: float64
s1.values
array([-0.42493031, -0.27958011, -0.76334433, -0.01888319, -0.21769267,
-1.36780083, -1.40271395, -1.1254272 , -0.56032516, 0.03789725])
s1.index
RangeIndex(start=0, stop=10, step=1)
s2 = s1.sort_values(ascending=False) # 通过值排序,默认是升序
s2
9 0.037897
3 -0.018883
4 -0.217693
1 -0.279580
0 -0.424930
8 -0.560325
2 -0.763344
7 -1.125427
5 -1.367801
6 -1.402714
dtype: float64
s2.sort_index(ascending=False) # 通过index排序,默认升序
9 0.037897
8 -0.560325
7 -1.125427
6 -1.402714
5 -1.367801
4 -0.217693
3 -0.018883
2 -0.763344
1 -0.279580
0 -0.424930
dtype: float64
# DataFrame的排序
df1 = DataFrame(np.random.randn(40).reshape(8,5),columns=['A','B','C','D','E'])
df1
A B C D E
0 -0.971041 -1.006900 1.824719 0.362461 0.010793
1 -1.202866 -0.520920 0.603463 -2.222645 0.085995
2 0.803201 1.664502 0.866629 -3.014892 -0.026222
3 -2.737011 -0.844403 -0.565923 0.956517 2.366085
4 0.183618 -0.731790 1.449652 0.440694 0.749678
5 -0.852958 -2.110449 -0.872135 2.816473 0.402317
6 -0.064767 -0.461569 -1.090588 -1.441483 0.029359
7 -0.620219 -0.350775 -0.190072 -1.475141 -1.147963
# 可以通colums的值进行排序也可以按照index排序
df1['A'].sort_values() # 只调整一列series,无法调整根据要求调整整个dataframe
3 -2.737011
1 -1.202866
0 -0.971041
5 -0.852958
7 -0.620219
6 -0.064767
4 0.183618
2 0.803201
Name: A, dtype: float64
df2 = df1.sort_values('A') # 这次可以整体排序
df2
A B C D E
3 -2.737011 -0.844403 -0.565923 0.956517 2.366085
1 -1.202866 -0.520920 0.603463 -2.222645 0.085995
0 -0.971041 -1.006900 1.824719 0.362461 0.010793
5 -0.852958 -2.110449 -0.872135 2.816473 0.402317
7 -0.620219 -0.350775 -0.190072 -1.475141 -1.147963
6 -0.064767 -0.461569 -1.090588 -1.441483 0.029359
4 0.183618 -0.731790 1.449652 0.440694 0.749678
2 0.803201 1.664502 0.866629 -3.014892 -0.026222
df2.sort_index() # 应该和df1相同
A B C D E
0 -0.971041 -1.006900 1.824719 0.362461 0.010793
1 -1.202866 -0.520920 0.603463 -2.222645 0.085995
2 0.803201 1.664502 0.866629 -3.014892 -0.026222
3 -2.737011 -0.844403 -0.565923 0.956517 2.366085
4 0.183618 -0.731790 1.449652 0.440694 0.749678
5 -0.852958 -2.110449 -0.872135 2.816473 0.402317
6 -0.064767 -0.461569 -1.090588 -1.441483 0.029359
7 -0.620219 -0.350775 -0.190072 -1.475141 -1.147963
df1
A B C D E
0 -0.971041 -1.006900 1.824719 0.362461 0.010793
1 -1.202866 -0.520920 0.603463 -2.222645 0.085995
2 0.803201 1.664502 0.866629 -3.014892 -0.026222
3 -2.737011 -0.844403 -0.565923 0.956517 2.366085
4 0.183618 -0.731790 1.449652 0.440694 0.749678
5 -0.852958 -2.110449 -0.872135 2.816473 0.402317
6 -0.064767 -0.461569 -1.090588 -1.441483 0.029359
7 -0.620219 -0.350775 -0.190072 -1.475141 -1.147963