# coding: utf-8
# In[2]:
import pandas as pd
import numpy as np
# In[3]:
# pandas basics: a Series is a labelled one-dimensional array.
# np.nan marks a missing value.
s = pd.Series(data=[1, 3, 6, np.nan, 44, 1])
# In[4]:
s
# In[5]:
# Ways to create a DataFrame
# 1. Pass the data together with an explicit index and explicit column labels.
dates = pd.date_range(start='20160101', periods=6)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)
df
# In[6]:
# 2. Pass a bare numpy matrix; rows and columns then get the default
#    integer labels 0, 1, 2, ...
df1 = pd.DataFrame(data=np.arange(12).reshape(3, 4))
df1
# In[7]:
# 3. Build from a dict: every key becomes one column. Scalar entries are
#    repeated for every row of the frame.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    'D': np.full(4, 3, dtype='int32'),
    'E': pd.Categorical(["test", "train"] * 2),
    'F': 'foo',
})
df2
# In[8]:
# Some basic attributes of a DataFrame
df2.dtypes
# In[9]:
df2.index
# In[10]:
df2.columns
# In[11]:
df2.values
# In[13]:
df2.describe()
# In[14]:
df2.T
# In[15]:
# Sorting by labels versus sorting by values.
# axis=1 sorts the column labels; ascending=False puts them in reverse order.
df2.sort_index(axis=1, ascending=False)
# In[18]:
# Sort the rows by value, in ascending order of column 'E'.
df2.sort_values('E')
# In[19]:
# Key points:
# - numpy addresses rows and columns by position (0, 1, 2, 3, ...), whereas
#   pandas behaves more like a dict with labelled access.
# - A Series is a labelled sequence; a DataFrame is like a big dict of columns.
# - Three ways to create a DataFrame: 1. data plus index and columns,
#   2. a bare numpy matrix, 3. a dict.
# - dtypes, index, columns, values, describe(), df2.T
# - sort_index(axis=1, ascending=False), df2.sort_values(by='B')
# In[20]:
# Selecting data
# In[23]:
dates = pd.date_range(start='20130101', periods=6)
df = pd.DataFrame(
    np.arange(24).reshape(6, 4),
    index=dates,
    columns=list('ABCD'),
)
df
# In[24]:
# One column, selected by its label.
df['A']
# In[25]:
# The first three rows, selected by position.
df[:3]
# In[26]:
# A range of rows selected by index label.
df['20130102':'20130104']
# In[27]:
# loc is the usual way to select by label (key).
df.loc['20130102']
df.loc[:, ['A', 'B']]
# In[33]:
# iloc selects by integer position.
df.iloc[:3, :2]
# df.iloc[3:5,1:3]
# In[34]:
df.iloc[[1, 3, 4], :2]
# In[36]:
# Boolean mask: keep the rows whose 'A' value is greater than 8.
df[df['A'] > 8]
# In[37]:
# Key points: loc, iloc, df[df.A>8]
# In[38]:
# Setting values in a DataFrame
# In[42]:
dates = pd.date_range("20200131", periods=6)
df = pd.DataFrame(np.arange(24).reshape(6,4), index=dates, columns=list("ABCD"))
df
# In[44]:
# Assign one cell by integer position (row 1, column 2).
df.iloc[1,2] = 200
df
# In[46]:
# Assign one cell by row label and column label.
df.loc["20200202", "A"] = 100
df
# In[47]:
# Set B to 0 on every row where A > 4.
# A single .loc call with a row mask and a column label writes into df itself.
# The chained form df.B[mask] = 0 assigns through an intermediate object: it
# raises SettingWithCopyWarning and does not modify df when pandas
# Copy-on-Write is active.
df.loc[df.A > 4, "B"] = 0
df
# In[48]:
# Add a new column in which every value is NaN.
df["F"] = np.nan
df
# In[52]:
# Add a new column from a Series; its values are matched to rows by index label.
df["E"] = pd.Series([1,2,3,4,5,6], index=pd.date_range('20200131',periods=6))
df
# In[ ]:
# Key points: every way of selecting values can also be used to assign them;
# new columns can be added from a scalar or from a Series.
# In[53]:
# Handling missing data with pandas
# In[55]:
dates = pd.date_range("20201031", periods=6)
df = pd.DataFrame(np.arange(24).reshape(6,4), index=dates, columns=list("ABCD"))
# NaN cannot be stored in an integer column. Cast the two columns that are
# about to receive a NaN to float explicitly, so the assignments below do not
# rely on pandas upcasting the column implicitly (newer pandas versions warn
# about that implicit upcast).
df = df.astype({"B": float, "C": float})
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan
df
# In[57]:
df.dropna(
    axis=0,     # 0: operate on rows; 1: operate on columns
    how='any'   # 'any': drop if any NaN is present; 'all': drop only if every value is NaN
)
# In[58]:
# Replace every NaN with 0 (returns a new frame; df itself is not modified).
df.fillna(value=0)
# In[59]:
# Boolean frame of the same shape: True where the value is missing.
df.isnull()
# In[60]:
# True if at least one cell anywhere in the frame is missing.
# Reducing the underlying boolean array gives a single bool directly, with no
# need to compare the result against True.
df.isnull().values.any()
# In[61]:
# Key points: dropna, fillna, how, isnull
# In[62]:
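# Importing and exporting data: readers (pd.read_*) and writers (DataFrame.to_*) follow the same naming pattern per format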
# data = pd.read_csv('student.csv')
# data.to_pickle('student.pickle')
# In[63]:
# pandas concat: three frames with identical columns, filled with 0, 1 and 2.
df1 = pd.DataFrame(np.zeros((3, 4)), columns=list('abcd'))
df2 = pd.DataFrame(np.ones((3, 4)), columns=list('abcd'))
df3 = pd.DataFrame(np.full((3, 4), 2.0), columns=list('abcd'))
# In[65]:
# axis=0 stacks the frames on top of each other; each frame keeps its own
# row labels, so the labels 0..2 repeat in the result.
res = pd.concat([df1, df2, df3], axis=0)
res
# In[66]:
# ignore_index=True discards the original row labels and renumbers the rows.
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
res
# In[68]:
# Two frames whose columns and row labels only partly overlap.
df1 = pd.DataFrame(np.zeros((3, 4)), columns=list('abcd'), index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)), columns=list('bcde'), index=[2, 3, 4])
# join='outer' keeps the union of the columns; cells with no data become NaN.
res = pd.concat([df1, df2], join='outer', sort=True, axis=0)
res
# In[70]:
# join='inner' keeps only the columns that exist in both frames.
res = pd.concat([df1, df2], join='inner', axis=0)
res
# In[72]:
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'], index=[2,3,4])
# Side-by-side concat restricted to the row labels of df1.
# The join_axes keyword of pd.concat was removed in pandas 1.0 and now raises
# TypeError; reindexing the concatenated result on df1.index gives the same
# frame (rows 1, 2, 3 only, NaN where df2 has no such row).
res = pd.concat([df1, df2], axis=1).reindex(df1.index)
res
# In[73]:
# Without the reindex the rows are the union of both indexes (1, 2, 3, 4).
res = pd.concat([df1, df2], axis=1)
res
# In[75]:
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
s1 = pd.Series([1,2,3,4], index=['a','b','c','d'])
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# Stack df2 below df1 and renumber the rows.
res = pd.concat([df1, df2], ignore_index=True)
res
# In[76]:
# Stack several frames in one call.
res = pd.concat([df1, df2, df3], ignore_index=True)
res
# In[78]:
# Add a Series as one extra row: turn it into a one-row frame first, so that
# its index labels (a, b, c, d) become column labels matching df1.
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)
res
# In[79]:
# Key points: concat, axis, ignore_index, join. The join_axes keyword and
# DataFrame.append no longer exist in current pandas; use reindex and
# pd.concat instead.
# In[80]:
# pandas merge: join two frames on a shared key column.
left = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
# In[81]:
left
# In[82]:
right
# In[84]:
# Rows are matched on equal values of the 'key' column.
res = left.merge(right, on="key")
res
# In[85]:
# Two frames that share a composite key (key1, key2).
left = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
# In[88]:
# Merge on both key columns, once for each of the four join types
# ['left', 'right', 'outer', 'inner'].
# 'inner': only the key pairs present in both frames.
res = left.merge(right, how="inner", on=["key1", "key2"])
res
# In[89]:
# 'outer': every key pair from either frame.
res = left.merge(right, how='outer', on=['key1', 'key2'])
res
# In[90]:
# 'left': every key pair of the left frame.
res = left.merge(right, how='left', on=['key1', 'key2'])
res
# In[92]:
# 'right': every key pair of the right frame.
res = left.merge(right, how='right', on=['key1', 'key2'])
res
# In[93]:
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})
# In[95]:
# indicator=True adds a final '_merge' column that records where each row
# came from: left_only, right_only or both.
res = df1.merge(df2, how='outer', on='col1', indicator=True)
res
# In[96]:
# Two frames keyed by their row index instead of by a column.
left = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    {'C': ['C0', 'C2', 'C3'], 'D': ['D0', 'D2', 'D3']},
    index=['K0', 'K2', 'K3'],
)
# In[99]:
# Merge on the index of both frames; how='outer' keeps every index label
# that appears in either one.
res = left.merge(right, how='outer', left_index=True, right_index=True)
res
# In[100]:
# Both frames have an 'age' column in addition to the key 'k'.
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
# In[101]:
# suffixes renames the clashing non-key columns so both survive the merge.
res = boys.merge(girls, how='inner', on='k', suffixes=('_boy', '_girl'))
res
# In[102]:
# Key points: merge, on, how (left, right, inner, outer), suffixes, indicator
# In[104]:
# Plotting with pandas
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
# In[105]:
# Line plot of a Series: the running total of 1000 random steps.
data = pd.Series(np.random.randn(1000),index=np.arange(1000))
# cumsum() returns a new object and leaves `data` untouched, so the result
# has to be assigned back; otherwise the plot shows the raw noise instead of
# the cumulative sum.
data = data.cumsum()
data.plot()
plt.show()
# In[109]:
# Line plot of a DataFrame: one line per column, each a running total.
data = pd.DataFrame(
    np.random.randn(1000,4),
    index=np.arange(1000),
    columns=list("ABCD")
)
# Same as above: keep the cumulative sum rather than discarding it.
data = data.cumsum()
data.plot()
plt.show()
# In[108]:
# Two scatter series on the same axes: the first call returns the Axes object,
# and the second call draws onto it through ax=ax.
ax = data.plot.scatter(x='A',y='B',color='DarkBlue',label='Class1')
data.plot.scatter(x='A',y='C',color='LightGreen',label='Class2',ax=ax)
plt.show()
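# Introduction to pandas, based on the Morvan (莫烦) Python tutorial
# You may also like
# Reposted from blog.csdn.net/DropJing/article/details/104127597
# Today's recommendations
# Weekly ranking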