本文链接： https://blog.csdn.net/jjsjsjjdj/article/details/102769687

实训1

读取并查看P2P网络贷款数据主表的基本信息

import numpy as np
import pandas as pd


#1.基本信息
def basic_information(detail):
    """Print an overview of a DataFrame.

    Emits, in order: the column index, the number of dimensions, the
    shape, and the output of ``detail.info()``, followed by blank lines.

    Args:
        detail: pandas DataFrame to summarise.
    """
    headline_items = (
        ("1.属性列表为:", detail.columns),
        ("2.数据的维度为：", detail.ndim),
        ("3.数据矩阵的格式", detail.shape),
    )
    for title, value in headline_items:
        print(title, value)

    print("4.数据的具体信息")
    detail.info()  # info() writes its report straight to stdout
    print("\n")
   
    
#2.剔除整列为空或者取值相同的列

#-----------------------------------方法1---------------------------------------------------------------
def dropNullStd(data):
    """Drop, in place, columns that are entirely null or numerically constant.

    A column is removed when it has no non-null values (any dtype), or when
    it is numeric and its sample standard deviation is exactly 0.  The number
    of removed columns and the resulting shape are printed.

    Args:
        data: pandas DataFrame; it is modified in place.

    Returns:
        None.
    """
    beforelen = data.shape[1]

    # count() covers every dtype, so all-null non-numeric columns are found
    # too (describe() would only report numeric columns by default).
    non_null_counts = data.count()
    empty_cols = non_null_counts.index[non_null_counts == 0]
    data.drop(columns=empty_cols, inplace=True)

    # Restrict to numeric columns so a frame without any numeric column
    # yields an empty result instead of a KeyError on "std".
    column_std = data.select_dtypes(include="number").std()
    constant_cols = column_std.index[column_std == 0]
    data.drop(columns=constant_cols, inplace=True)

    afterlen = data.shape[1]  # shape[0] is the row count, shape[1] the column count

    print('\n剔除的列的数目为', beforelen - afterlen)
    print("剔除数据的形状为", data.shape)


#对drop参数的理解 
#1.colisNull.index[i]表示要删除的行标签或列标签（此处为列名）
#2.axis=0表示操作对象是行，axis=1表示操作对象是列
#3.inplace=True表示直接在原表上操作，False表示重新创建一个新表进行操作

#--------------------------方法2--------------------------------------------------------    
def dropNullStd2(master):
    """Drop, in place, columns that are entirely null or numerically constant.

    Each column is inspected on its own: it is marked for removal when it
    has no non-null values, or when its ``describe()`` summary reports a
    standard deviation of exactly 0.  The marked column names and the
    resulting shape are printed.

    Args:
        master: pandas DataFrame; it is modified in place.

    Returns:
        None.
    """
    xlist = []
    for col in master.columns:
        column = master[col]

        # Entirely empty column, regardless of dtype.
        if column.count() == 0:
            xlist.append(col)
            continue

        # Non-numeric summaries have no "std" entry; get() then yields None,
        # which never equals 0, so those columns are kept.
        if column.describe().get("std") == 0:
            xlist.append(col)

    print("要删除的列为:", xlist)

    master.drop(columns=xlist, inplace=True)

    print("剔除数据的形状为", master.shape)
    
#----------------------------------------------------------------------    
    
# Exercise 1 driver: load the master table, print its overview and the
# descriptive statistics of two columns, then prune unusable columns.
filepath1="G:/大数据实验数据库/4.Python数据分析与应用/第4章/实训数据/Training_Master.csv"

data = pd.read_csv(filepath1, encoding="GBK", sep=",")
basic_information(data)

described = data[["Idx", "UserInfo_1"]].describe()
print("5.数据idx和UserInfo_1两列的描述性统计:\n", described)

# Method 2 is used here; dropNullStd(data) (method 1) is the alternative.
dropNullStd2(data)

在这里插入图片描述

实训2

提取用户信息更新表和登录信息表的时间信息

# Input CSV locations for exercise 2 (hard-coded absolute paths).
# filepath2 is read into ``LogInfo`` and filepath3 into ``Userupdate`` below.
filepath2="G:/大数据实验数据库/4.Python数据分析与应用/第4章/实训数据/Training_LogInfo.csv"
filepath3="G:/大数据实验数据库/4.Python数据分析与应用/第4章/实训数据/Training_Userupdate.csv"

#1.将字符串形式的时间信息（Listinginfo1列）转换为datetime类型
def fun1(LogInfo):
    """Convert the ``Listinginfo1`` column to datetime, in place.

    The column's dtype is printed before and after the conversion.

    Args:
        LogInfo: pandas DataFrame containing a ``Listinginfo1`` column.
    """
    column = "Listinginfo1"

    dtype_before = LogInfo[column].dtypes
    print("进行转换前,Listinginfo1的类型为：", dtype_before)

    converted = pd.to_datetime(LogInfo[column])
    LogInfo[column] = converted

    dtype_after = LogInfo[column].dtypes
    print("进行转换后,Listinginfo1的类型为：", dtype_after)
    print("\n")

#2.用year、month、week方法提取用户时间信息
def fun2(LogInfo):
    """Print calendar components of the first five ``Listinginfo1`` values.

    For each of the first five entries the year, month, day, week number,
    weekday number and weekday name are extracted and printed as lists.

    Args:
        LogInfo: pandas DataFrame whose ``Listinginfo1`` column already holds
            datetime values (e.g. after ``pd.to_datetime``).
    """
    # Only the first five values are ever printed, so only those are examined.
    sample = list(LogInfo["Listinginfo1"].head(5))

    year = [ts.year for ts in sample]
    month = [ts.month for ts in sample]
    day = [ts.day for ts in sample]
    week = [ts.week for ts in sample]
    weekday = [ts.weekday() for ts in sample]
    # Timestamp.weekday_name was removed in pandas 1.0; day_name() replaces it.
    weekname = [ts.day_name() for ts in sample]

    print("登录数据表，前5条数据的年信息为", year)
    print("登录数据表，前5条数据的月信息为", month)
    print("登录数据表，前5条数据的日信息为", day)
    print("登录数据表，前5条数据的周信息为", week)
    print("登录数据表，前5条数据的星期信息为", weekday)
    print("登录数据表，前5条数据的星期名信息为", weekname)
    print("\n")

# Exercise 2 driver: load both tables, convert their Listinginfo1 columns
# to datetime and print the extracted calendar components.
LogInfo = pd.read_csv(filepath2, encoding="GBK", sep=",")
fun1(LogInfo)
fun2(LogInfo)

Userupdate = pd.read_csv(filepath3, encoding="GBK", sep=",")
fun1(Userupdate)
fun2(Userupdate)


# Difference between the two Listinginfo1 columns (Series subtraction).
timeDelta = Userupdate["Listinginfo1"] - LogInfo["Listinginfo1"]

# Express the first five differences in days, hours and minutes.
first_deltas = timeDelta[:5].values
for label, unit in (("以天为单位", "D"), ("以小时为单位", "h"), ("以分钟为单位", "m")):
    print(label, first_deltas / np.timedelta64(1, unit))

在这里插入图片描述

大数据技术（第4节实验课-----pandas处理表格信息）

实训1

读取并查看P2P网络贷款数据主表的基本信息

实训2

提取用户信息更新表和登录信息表的时间信息

猜你喜欢