pandas 入门(基于莫烦 Python 教程)


# coding: utf-8

# In[2]:


import pandas as pd
import numpy as np


# In[3]:


# Series basics: a one-dimensional labelled array.  No index is given, so
# pandas labels the entries 0..n-1; the NaN entry makes the dtype float64.
s = pd.Series(data=[1, 3, 6, np.nan, 44, 1])


# In[4]:


# Display the Series (notebook cell output).
s


# In[5]:


# Ways to create a DataFrame
# 1. Give the data together with an explicit index and explicit columns.
dates = pd.date_range(start='20160101', periods=6)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=["a", "b", "c", "d"],
)
df


# In[6]:


# 2. Build directly from a NumPy matrix; the row and column labels then
#    default to 0, 1, 2, ...
df1 = pd.DataFrame(data=np.arange(12).reshape(3, 4))
df1


# In[7]:


# 3. Build from a dict: every key becomes one column.  The scalar entries
#    are broadcast to the length of the index supplied by column 'C'.
df2 = pd.DataFrame(
    {
        'A': 1.,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(["test", "train"] * 2),
        'F': 'foo',
    }
)
df2


# In[8]:


# Some basic DataFrame attributes: per-column dtypes ...
df2.dtypes


# In[9]:


# ... the row labels ...
df2.index


# In[10]:


# ... the column labels ...
df2.columns


# In[11]:


# ... and the underlying values as a NumPy array.
df2.values


# In[13]:


# Summary statistics of the numeric columns.
df2.describe()


# In[14]:


# Transpose: rows become columns and vice versa.
df2.T


# In[15]:


# Sorting by labels versus sorting by values.
# axis=1 sorts by the column labels; ascending=False gives descending order.
df2.sort_index(ascending=False, axis=1)


# In[18]:


# Sort the rows by the values in column 'E', in ascending order.
df2.sort_values('E')


# In[19]:


# Key points: NumPy arrays are addressed purely by position (0, 1, 2, ...),
# whereas pandas objects are addressed by labels, much like a dict.
# Series is a labelled sequence; DataFrame is a big dict of columns.
# Three ways to build a DataFrame: 1. data + index + columns,
# 2. directly from a NumPy matrix, 3. from a dict.
# dtypes index columns values describe() df2.T
# sort_index(axis=1, ascending=False) df2.sort_values(by='B')


# In[20]:


# Selecting data


# In[23]:


dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(
    np.arange(24).reshape(6, 4),
    index=dates,
    columns=list('ABCD'),
)
df


# In[24]:


# A single column, selected by label.
df['A']


# In[25]:


# The first three rows, selected by positional slice.
df[:3]


# In[26]:


# Rows selected by a label slice on the date index (both ends included).
df['20130102':'20130104']


# In[27]:


# loc is the usual label-based selector.
df.loc['20130102']
df.loc[:, ['A', 'B']]


# In[33]:


# iloc selects by integer position.
df.iloc[:3, :2]
# df.iloc[3:5,1:3]


# In[34]:


# Positions may also be given as an explicit list of rows.
df.iloc[[1, 3, 4], :2]


# In[36]:


# Boolean mask: keep the rows whose 'A' value exceeds 8.
df[df['A'] > 8]


# In[37]:


# Key points: loc, iloc, df[df.A>8]


# In[38]:


# Setting values in a DataFrame


# In[42]:


dates = pd.date_range("20200131", periods=6)
df = pd.DataFrame(np.arange(24).reshape(6,4), index=dates, columns=list("ABCD"))
df


# In[44]:


# Positional assignment: row 1, column 2 (column "C").
df.iloc[1,2] = 200
df


# In[46]:


# Label-based assignment: the row for 2020-02-02, column "A".
df.loc["20200202", "A"] = 100
df


# In[47]:


# Set B to 0 wherever A > 4.  A single .loc call selects the rows and the
# column at once.  The chained form `df.B[mask] = 0` assigns through an
# intermediate object: it triggers SettingWithCopyWarning and does not
# modify `df` at all when pandas copy-on-write is enabled.
df.loc[df.A > 4, "B"] = 0
df


# In[48]:


# Add a new column by assigning a scalar; it is broadcast to every row.
df["F"] = np.nan
df


# In[52]:


# Add a new column from a Series; values are aligned on the index labels.
df["E"] = pd.Series([1,2,3,4,5,6], index=pd.date_range('20200131',periods=6))
df


# In[ ]:


# Key points: any way of selecting values can also be used as an assignment
# target; new columns can be added from a scalar or from an aligned Series.


# In[53]:


# Handling missing data in pandas


# In[55]:


dates = pd.date_range("20201031", periods=6)
df = pd.DataFrame(np.arange(24).reshape(6,4), index=dates, columns=list("ABCD"))
# NaN can only be stored in a float column.  Cast the two columns that will
# receive NaN explicitly rather than relying on pandas to upcast int -> float
# silently during assignment (recent pandas releases warn about that
# implicit upcast).
df = df.astype({"B": "float64", "C": "float64"})
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan
df


# In[57]:


df.dropna(
    axis=0,     # 0: operate on rows; 1: operate on columns
    how='any'   # 'any': drop if any NaN is present; 'all': drop only if every value is NaN
    )


# In[58]:


# Replace every NaN with 0 (returns a new frame; df itself is unchanged).
df.fillna(value=0)


# In[59]:


# Element-wise mask: True where a value is missing.
df.isnull()


# In[60]:


# Is there at least one missing value anywhere in the frame?  Reduce the
# boolean mask directly; comparing the result with `== True` is redundant.
df.isnull().values.any()


# In[61]:


# Key points: dropna, fillna, how, isnull


# In[62]:


#导入导出数据  from to 格式统一
# data = pd.read_csv('student.csv')
# data.to_pickle('student.pickle')


# In[63]:


# pandas concat: three 3x4 frames filled with 0.0, 1.0 and 2.0.
df1 = pd.DataFrame(np.zeros((3, 4)), columns=list('abcd'))
df2 = pd.DataFrame(np.ones((3, 4)), columns=list('abcd'))
df3 = pd.DataFrame(np.full((3, 4), 2.0), columns=list('abcd'))


# In[65]:


# axis=0 stacks the frames on top of each other; each frame keeps its own
# row labels, so the resulting index repeats 0, 1, 2.
res = pd.concat([df1, df2, df3], axis=0)
res


# In[66]:


# ignore_index=True discards the original row labels and renumbers 0..n-1.
res = pd.concat([df1, df2, df3], ignore_index=True, axis=0)
res


# In[68]:


# Two frames whose columns and row labels only partly overlap.
df1 = pd.DataFrame(np.zeros((3, 4)), index=[1, 2, 3], columns=list('abcd'))
df2 = pd.DataFrame(np.ones((3, 4)), index=[2, 3, 4], columns=list('bcde'))
# join='outer' keeps the union of the columns; cells with no data become NaN.
res = pd.concat([df1, df2], join='outer', axis=0, sort=True)
res


# In[70]:


# join='inner' keeps only the columns present in both frames.
res = pd.concat([df1, df2], join='inner', axis=0)
res


# In[72]:


df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'], index=[2,3,4])
# Side-by-side concat restricted to the rows of `df1.index`.  The
# `join_axes` argument of pd.concat no longer exists in current pandas;
# reindexing the concatenated result onto df1's index gives the same frame
# (row 4 is dropped, df2's columns are NaN for row 1).
res = pd.concat([df1, df2], axis=1).reindex(df1.index)
res


# In[73]:


# Without the restriction, the result covers the union of both indexes.
res = pd.concat([df1, df2], axis=1)
res


# In[75]:


df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
s1 = pd.Series([1,2,3,4], index=['a','b','c','d'])
# DataFrame.append no longer exists in current pandas; pd.concat is the
# replacement.  Append df2 below df1 and renumber the rows.
res = pd.concat([df1, df2], ignore_index=True)
res


# In[76]:


# Append several frames at once.
res = pd.concat([df1, df2, df3], ignore_index=True)
res


# In[78]:


# Append a Series as one extra row: turn it into a one-row frame first so
# that its labels (a, b, c, d) line up with the columns of df1.
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)
res


# In[79]:


# Key points: concat, axis, ignore_index, join, reindex (replaces the old
# join_axes argument), and concat in place of the old DataFrame.append


# In[80]:


# pandas merge: two frames that share a single 'key' column.
left = pd.DataFrame(
    {
        'key': ['K0', 'K1', 'K2', 'K3'],
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
    }
)
right = pd.DataFrame(
    {
        'key': ['K0', 'K1', 'K2', 'K3'],
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    }
)


# In[81]:


left


# In[82]:


right


# In[84]:


# Join the two frames on the shared 'key' column.
res = left.merge(right, on="key")
res


# In[85]:


left = pd.DataFrame(
    {
        'key1': ['K0', 'K0', 'K1', 'K2'],
        'key2': ['K0', 'K1', 'K0', 'K1'],
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
    }
)
right = pd.DataFrame(
    {
        'key1': ['K0', 'K1', 'K1', 'K2'],
        'key2': ['K0', 'K0', 'K0', 'K0'],
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    }
)


# In[88]:


# Merge on the key1 and key2 columns together, once for each of the four
# join types: 'inner', 'outer', 'left', 'right'.
# inner: only key pairs present in both frames.
res = left.merge(right, how="inner", on=["key1", "key2"])
res


# In[89]:


# outer: every key pair from either frame.
res = left.merge(right, how='outer', on=['key1', 'key2'])
res


# In[90]:


# left: every row of `left`, matched where possible.
res = left.merge(right, how='left', on=['key1', 'key2'])
res


# In[92]:


# right: every row of `right`, matched where possible.
res = left.merge(right, how='right', on=['key1', 'key2'])
res


# In[93]:


df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})


# In[95]:


# indicator=True adds a final '_merge' column that records where each row
# came from: left_only, right_only or both.
res = df1.merge(df2, how='outer', on='col1', indicator=True)
res


# In[96]:


left = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    {'C': ['C0', 'C2', 'C3'], 'D': ['D0', 'D2', 'D3']},
    index=['K0', 'K2', 'K3'],
)


# In[99]:


# Merge on the row labels of both frames instead of on a column;
# how='outer' keeps every label found in either index.
res = left.merge(right, how='outer', left_index=True, right_index=True)
res


# In[100]:


boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})


# In[101]:


# Both frames have an 'age' column; the suffixes tell the two apart in the
# merged result (age_boy / age_girl).
res = boys.merge(girls, how='inner', on='k', suffixes=('_boy', '_girl'))
res


# In[102]:


# Key points: merge, on, how (left / right / inner / outer), suffixes, indicator


# In[104]:


# pandas plotting
import matplotlib.pyplot as plt

# The `%matplotlib inline` magic only exists inside IPython/Jupyter, where
# get_ipython() is available as a builtin.  When this exported notebook is
# run as a plain Python script the name is undefined, so skip the magic
# instead of failing with NameError.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass


# In[105]:


# Line plot of a Series: the cumulative sum of 1000 random samples.
data = pd.Series(np.random.randn(1000),index=np.arange(1000))
# cumsum() returns a new object, so the result must be assigned back;
# otherwise the plot shows the raw noise instead of the cumulative sum.
data = data.cumsum()
data.plot()
plt.show()


# In[109]:


# Line plot of a DataFrame: one line per column.
data = pd.DataFrame(
    np.random.randn(1000,4),
    index=np.arange(1000),
    columns=list("ABCD")
    )
# As above: keep the cumulative sum rather than discarding it.
data = data.cumsum()
data.plot()
plt.show()


# In[108]:


# Two scatter series on the same axes: the first call returns the Axes,
# and passing it as ax= to the second call draws onto the same figure.
ax = data.plot.scatter(x='A',y='B',color='DarkBlue',label='Class1')
data.plot.scatter(x='A',y='C',color='LightGreen',label='Class2',ax=ax)
plt.show()


发布了17 篇原创文章 · 获赞 0 · 访问量 295

猜你喜欢

转载自blog.csdn.net/DropJing/article/details/104127597