导入numpy和pandas
import numpy as np
import pandas as pd
pandas创建列表
1.1写入固定值创建一维列表
# Create a one-dimensional Series from literal values; np.nan marks a
# missing entry, which makes the whole Series float64.
s = pd.Series(data=[1, 3, 6, np.nan, 44, 1])
print(s)
1.2连续的时间序列
# Six consecutive daily timestamps starting on 2016-01-01.
dates = pd.date_range(start='20160101', periods=6)
print(dates)
1.3按照默认的从0开始的数字创建矩阵
# Reshape the integers 0..11 into 3 rows x 4 columns.  No labels are given,
# so pandas numbers both the rows and the columns from 0.
df1 = pd.DataFrame(data=np.arange(12).reshape(3, 4))
print(df1)
1.4按照给定的行列名称生成矩阵
# A 6x4 frame of standard-normal random values with explicit labels:
# rows are the six dates, columns are the letters a-d.
dates = pd.date_range(start='20160101', periods=6)
a = np.array(list('abcd'))
df = pd.DataFrame(data=np.random.randn(6, 4), index=dates, columns=a)
print(df)
1.5按照字典的方式给每一列单独赋值
# Build a frame column by column from a mapping.  The scalar entries
# ('A', 'B', 'F') are repeated for every row, the row labels come from the
# Series in 'C', and each column keeps its own dtype.
df2 = pd.DataFrame(dict(
    A=1.,
    B=pd.Timestamp('20130102'),
    C=pd.Series(1, index=list(range(4)), dtype='float32'),
    D=np.array([3] * 4, dtype='int32'),
    E=pd.Categorical(["test", "train", "test", "train"]),
    F='foo',
))
print(df2)
pandas矩阵操作
2.1矩阵转置
# Swap rows and columns (same result as the .T shorthand).
print(df2.transpose())
2.2矩阵按行列索引排序
# Sort by labels rather than values: axis=1 orders the column names,
# axis=0 orders the row index.  Both descending results are printed first,
# then both ascending ones.
for ascending in (False, True):
    for axis in (1, 0):
        print(df2.sort_index(axis=axis, ascending=ascending))
2.3矩阵按行或列数值排序
# Order the rows by the values in column 'E'.
print(df2.sort_values('E'))
pandas输出矩阵信息
3.1打印每一列的数据类型
# One dtype per column.
column_types = df2.dtypes
print(column_types)
3.2打印行索引(index)
# The row labels of the frame.
row_labels = df2.index
print(row_labels)
3.3打印列名(columns)
# The column labels of the frame.
column_labels = df2.columns
print(column_labels)
3.4打印描述
# Summary statistics of df2.  describe() only returns the summary, so it
# must be printed to be visible when this runs as a script.
print(df2.describe())
选择数据进行输出
4.1带行列索引输出矩阵
# Full textual rendering: values together with row and column labels.
print(str(df2))
4.2只打印矩阵的值
# Only the underlying array of values, without any labels.
raw_values = df2.values
print(raw_values)
4.3按索引打印一整行或一整列数据
# The selection and assignment examples from here on address columns
# 'A'..'D' and dates in January 2013, but the `df` built earlier has columns
# 'a'..'d' and 2016 dates, so `df['A']` would raise KeyError.  Rebuild `df`
# with the labels these examples expect; the sequential values 0..23 make
# the later comparisons (e.g. df.A > 8) select a predictable subset.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
# Bracket access and attribute access both return column 'A' as a Series.
print(df['A'])
print(df.A)
4.4按索引进行切片打印
# Row slicing with plain brackets: an integer slice is positional, while a
# string slice selects by date label and includes both endpoints.
print(df[:3])
day_range = slice('20130102', '20130104')
print(df[day_range])
4.5按标签进行选择或切片打印
# Label-based selection with .loc: one row, two columns for all rows, and
# finally those two columns for that one row.
day = '20130102'
wanted = ['A', 'B']
print(df.loc[day])
print(df.loc[:, wanted])
print(df.loc[day, wanted])
4.6按位置进行选择或切片
# Purely positional selection with .iloc: a single cell, a rectangular
# slice, and a list of row positions combined with a column slice.
middle_cols = slice(1, 3)
print(df.iloc[3, 1])
print(df.iloc[3:5, middle_cols])
print(df.iloc[[1, 3, 5], middle_cols])
4.7按标签和位置进行混合选择数据
# Mixed selection: the first three rows by position, columns 'A' and 'C' by
# label.  The `.ix` indexer that used to allow this was removed in
# pandas 1.0, so turn the positional row slice into labels via df.index
# and select with `.loc`.
print(df.loc[df.index[:3], ['A', 'C']])
4.8按逻辑进行选择数据
# Boolean filtering: keep only the rows whose 'A' value is greater than 8.
print(df[df['A'] > 8])
pandas设置值
5.1位置直接赋值
# Overwrite a single cell addressed by (row position, column position).
cell_position = (2, 2)
df.iloc[cell_position] = 1111
5.2按标签直接赋值
# Overwrite a single cell addressed by (row label, column label).
cell_label = ('20130101', 'B')
df.loc[cell_label] = 2222
5.3按逻辑进行一整行赋值
# Set every column to 0 in each row whose 'A' value is greater than 4.
df[df['A'] > 4] = 0
5.4按逻辑进行单个元素直接赋值
# Zero out only the 'A' values greater than 4.  A single .loc call with a
# row mask and a column label writes into df itself.  The chained form
# `df.A[mask] = 0` assigns through an intermediate Series and is not
# guaranteed to reach df.
df.loc[df.A > 4, 'A'] = 0
pandas添加新数据
6.1添加一列全部为空
# Add column 'F' with NaN in every row.
missing_value = np.nan
df['F'] = missing_value
6.2添加有数据的一列
# Add column 'E' from a Series indexed by six dates starting 2013-01-01;
# pandas matches the values to df's rows by index label.
e_index = pd.date_range(start='20130101', periods=6)
df['E'] = pd.Series(list(range(1, 7)), index=e_index)
pandas处理丢失数据
7.1丢弃空值
# Show the rows that remain after dropping every row (axis=0) that contains
# at least one NaN (how='any').  dropna returns a new frame and leaves df
# untouched, so the result has to be printed to be seen in a script.
print(df.dropna(axis=0, how='any'))
7.2将空值填充为指定的值
# fillna returns a copy with every NaN replaced by 0 and does not modify df.
# Print the copy, because a bare call discards it when run as a script.
print(df.fillna(value=0))
7.3判断空数据
# Boolean frame of the same shape as df: True wherever a value is missing.
# Printed so the mask is visible when run as a script.
print(df.isnull())
7.4判断数据中是不是有缺失数据
# Is at least one value missing anywhere in df?  Reducing over the underlying
# array yields one bool, independent of how np.any handles a DataFrame.
# The original comparison `== True` was redundant and its result was dropped.
print(df.isnull().values.any())
7.5判断数据中是不是全是缺失数据
# Is every value in df missing?  Reducing over the underlying array yields
# one bool, independent of how np.all handles a DataFrame.
# The original comparison `== True` was redundant and its result was dropped.
print(df.isnull().values.all())
pandas数据导入和导出
8.1数据导入
# Load the CSV file from the current working directory into a DataFrame.
csv_path = 'student.csv'
data = pd.read_csv(csv_path)
8.2数据保存
# Save the loaded frame as a pickle file next to the script.
pickle_path = 'student.pickle'
data.to_pickle(pickle_path)
pandas按索引合并(concat)
9.1纵向合并pandas数据
# Three 3x4 frames filled with 0, 1 and 2, stacked on top of each other.
# axis=0 keeps each frame's own row labels, so 0..2 appears three times.
df1, df2, df3 = (
    pd.DataFrame(np.ones((3, 4)) * fill, columns=['a', 'b', 'c', 'd'])
    for fill in (0, 1, 2)
)
res = pd.concat([df1, df2, df3], axis=0)
print(res)
9.2横向合并pandas
# Place the three frames side by side; the column names a-d repeat.
res = pd.concat(objs=[df1, df2, df3], axis='columns')
print(res)
9.3纵向合并重新定义索引
# Stack the frames vertically and renumber the rows from 0 instead of
# keeping each frame's own labels.
res = pd.concat(objs=[df1, df2, df3], axis='index', ignore_index=True)
print(res)
9.4横向合并重新定义索引
# Place the frames side by side and renumber the columns from 0 instead of
# keeping the repeated a-d names.
res = pd.concat(objs=[df1, df2, df3], axis='columns', ignore_index=True)
print(res)
9.5纵向外合并
# Stack two frames whose columns only partly overlap.  join='outer' keeps
# the union of the columns (a..e) and fills the gaps with NaN; sort=True
# orders the columns alphabetically.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
# Keep the original row labels (2 and 3 appear twice).  Each result is
# printed before `res` is reused, otherwise the first one is never seen.
res = pd.concat([df1, df2], axis=0, join='outer', sort=True)
print(res)
# Same data with the rows renumbered 0..5.
res = pd.concat([df1, df2], axis=0, join='outer', ignore_index=True, sort=True)
print(res)
9.6纵向内合并
# join='inner' keeps only the columns that both frames have in common.
# Each result is printed before `res` is reused, otherwise the first one is
# overwritten without ever being shown.
res = pd.concat([df1, df2], axis=0, join='inner', sort=True)
print(res)
# Same columns with the rows renumbered from 0.
res = pd.concat([df1, df2], axis=0, join='inner', ignore_index=True)
print(res)
9.7参照其中一个数据进行合并
# Concatenate side by side but keep only df1's row labels.
# The `join_axes` argument was removed from pd.concat in pandas 1.0.
# Reindexing the outer-joined result on df1.index keeps exactly df1's rows
# (1, 2, 3) and leaves NaN where df2 has no matching row.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
res = pd.concat([df1, df2], axis=1).reindex(df1.index)
print(res)
9.8用append合并数据
# Append rows to df1.  DataFrame.append was removed in pandas 2.0, so every
# variant below goes through pd.concat instead.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
# Append one frame.
res = pd.concat([df1, df2], ignore_index=True)
print(res)
# Append several frames at once.
res = pd.concat([df1, df2, df3], ignore_index=True)
print(res)
# Append a Series as one new row: turn it into a single-row frame first so
# that its index labels line up with df1's columns.
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)
print(res)
pandas 按索引和数值合并(merge)
10.1按照单个列索引进行合并
import pandas as pd
# Two tables that share a 'key' column with identical keys K0..K3.
left = pd.DataFrame({
    'key': ['K' + str(i) for i in range(4)],
    'A': ['A' + str(i) for i in range(4)],
    'B': ['B' + str(i) for i in range(4)],
})
right = pd.DataFrame({
    'key': ['K' + str(i) for i in range(4)],
    'C': ['C' + str(i) for i in range(4)],
    'D': ['D' + str(i) for i in range(4)],
})
print(left)
print(right)
# Join the rows whose 'key' values match.
res = left.merge(right, on='key')
print(res)
合并结果包含 A、B、key、C、D 这五列(列的显示顺序取决于 pandas 版本)。
10.2按照多个列索引进行合并
import pandas as pd
# Two tables keyed by the pair (key1, key2); only some pairs occur in both.
left = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
print(left)
print(right)
# Run the same two-key merge once per join type and print each result;
# `res` ends up holding the right join.
for how in ('inner', 'outer', 'left', 'right'):
    res = pd.merge(left, right, on=['key1', 'key2'], how=how)
    print(res)
10.3按照列索引进行合并并显示合并记录
import pandas as pd
# Outer merge on 'col1' with an extra column recording where each row came
# from (left_only / right_only / both).
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})
print(df1)
print(df2)
# indicator=True uses the default column name; a string names it explicitly.
for indicator in (True, 'indicator_column'):
    res = pd.merge(df1, df2, on='col1', how='outer', indicator=indicator)
    print(res)
10.4按照行索引合并
import pandas as pd
# Two tables whose row labels, rather than a column, identify matching rows.
left = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    {'C': ['C0', 'C2', 'C3'], 'D': ['D0', 'D2', 'D3']},
    index=['K0', 'K2', 'K3'],
)
print(left)
print(right)
# Merge on the row index of both sides: first keeping every label (outer),
# then only the labels present in both (inner).
for how in ('outer', 'inner'):
    res = pd.merge(left, right, left_index=True, right_index=True, how=how)
    print(res)
10.5用suffixes给两个表中同名的列加上后缀加以区分
import pandas as pd
# Both tables have an 'age' column; the suffixes tell the two apart in the
# merged result (age_boy / age_girl).
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
res = boys.merge(girls, on='k', how='outer', suffixes=['_boy', '_girl'])
print(res)
画图
11.1 引入库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
11.2画线性图
# Random walk of 1000 steps: the cumulative sum of standard-normal draws,
# indexed 0..999 and drawn as a single line.
data = pd.Series(np.random.randn(1000), index=np.arange(1000)).cumsum()
data.plot()
plt.show()
11.3一张图画多个线性图
# Four random walks, one per column A-D, plotted together with a single
# DataFrame.plot call.
data = pd.DataFrame(
    data=np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=['A', 'B', 'C', 'D'],
).cumsum()
data.plot()
plt.show()
11.4画单张散点图
# Scatter column 'A' against column 'B'; the return value of the call is
# kept in `ax`.
ax = data.plot.scatter(
    x='A',
    y='B',
    label='Class1',
    color='DarkBlue',
)
11.5将下面这组数据画在上一个散点图的同一个坐标轴(ax)上
# Overlay two scatter series: the result of the first call is passed to the
# second call through `ax=`, so both series land in one plot.
ab = data.plot.scatter(x='A', y='B', label='Class1', color='DarkBlue')
data.plot.scatter(ax=ab, x='A', y='C', label='Class2', color='LightGreen')
plt.show()