版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/X_dmword/article/details/88652814
1、数据的导入查看
import pandas as pd

# Open the CSV explicitly as UTF-8. The context manager closes the handle as
# soon as pandas has finished reading from it (the original never closed it).
with open("E:/test.csv", encoding="utf-8") as f:
    # nrows=5 makes pandas parse only the first five data rows.
    content = pd.read_csv(f, nrows=5)

# Same output as print(content[0:5]): the slice is half-open, so it selects
# the rows at positions 0-4.
print(content)
import pandas as pd

# CSV variant, kept for reference: parse only the first two rows.
# content = pd.read_csv("E:/test.csv", nrows=2)
# print(content)

# NOTE(review): the original author observed that nrows=3 had no effect here
# and the whole sheet was returned. That presumably reflects an old pandas
# release; read_excel documents an nrows parameter in current versions.
# Confirm against the installed pandas.
data = pd.read_excel("E:/test_data.xlsx", nrows=3)

# A version-independent way to show the first three rows is print(data[:3]).
# A bare data[3] does not work: it is a column lookup, not a row slice.
print(data)
import pandas as pd

# read_excel also accepts sheet_name=..., given as sheet names or 0-based
# sheet positions (a list of them loads several sheets at once).
da = pd.read_excel('E:/test_data.xlsx')

# Selection recipes. The original used DataFrame.ix, which is no longer
# available in current pandas; .iloc (positional) and .loc (label-based)
# replace it.
#
#   da.head()                        -> first 5 rows by default
#   da.iloc[[1, 2], [0, 1]]          -> 2nd-3rd rows, 1st-2nd columns, labels
#                                       kept; explicit positions go in lists
#   da.iloc[[1, 2], [0, 1]].values   -> same cells as a bare array, no labels
#   da.iloc[0:2, 1:4]                -> contiguous ranges need no nested
#                                       lists (iloc slices are half-open)
#   da.iloc[1, 2]                    -> single cell: 2nd row, 3rd column

# All rows, only the named columns.
data = da.loc[:, ['school', 'sex']]

# Every value of the sheet as an array; this rebinds `data`, so the column
# subset above is not what gets printed.
data = da.values
print("获取到所有的值:\n{0}".format(data))  # formatted output
2、数据抽样
## 1 简单随机抽样
import pandas as pd
import numpy as np

data = pd.read_excel('E:/test_data.xlsx')

# Seed NumPy's global generator before sampling. This relies on
# DataFrame.sample falling back to that global state when no random_state is
# passed -- NOTE(review): confirm for the installed pandas version.
np.random.seed(seed=2)

# Simple random sampling with replacement: once by absolute row count and
# once by fraction of the table (1 %).
d1 = data.sample(n=10, replace=True)
d2 = data.sample(frac=0.01, replace=True)
print(d1, '\n', d2)
## 2 分层抽样
import pandas as pd
import numpy as np

data = pd.read_excel('E:/test_data.xlsx')
# read_excel already returns a DataFrame, so this conversion is optional.
df = pd.DataFrame(data)
print(df)

# Show which row labels fall into each "sex" group.
ds = data.groupby("sex")
print(ds.groups)

# Sampling fraction to draw from each stratum, keyed by the group label.
typicalFracDict = {
    "F": 0.01,
    "M": 0.01,
}


def typicalSampling(group, typicalFracDict):
    """Sample one stratum at the fraction configured for its label.

    Args:
        group: The sub-frame handed over by ``groupby(...).apply``; its
            ``name`` attribute is read as the group label.
        typicalFracDict: Mapping from group label to the fraction of that
            group's rows to draw.

    Returns:
        The result of ``group.sample(frac=...)`` for this group.

    Raises:
        KeyError: If the group's label has no entry in ``typicalFracDict``.
    """
    name = group.name
    frac = typicalFracDict[name]
    return group.sample(frac=frac)


# Stratified sampling: apply the per-group sampler to every "sex" group;
# group_keys=True keeps the group label in the result's index.
result = df.groupby("sex", group_keys=True).apply(typicalSampling, typicalFracDict)
print(result)
## 3 系统抽样
## 先了解向量形式转为数组(数据框)
# The author tried several approaches but could not get the frame printed
# without its index; DataFrame.to_string(index=False) is one way to do that.
import pandas as pd

data = pd.read_excel('E:/test_data.xlsx')  # loaded but not used below

da = [[1, 2, 3], [4, 5, 6]]

# Alternatives for turning the nested list into tabular form:
#   pd.DataFrame(da).values   -> bare array: data only, no index or column
#                                labels
#   pd.DataFrame(da)          -> default labels, starting at 0
#   pd.DataFrame(da, columns=['a', 'b', 'c'], index=['row1', 'row2'])
#                             -> custom column and row labels
daa = pd.DataFrame(da)
print(daa)

# Section marker carried over from the article: systematic sampling.
import random
import pandas as pd


def loadDataSet(fileName):
    """Read a tab-delimited text file into a list of rows.

    Each line is stripped of surrounding whitespace and split on tab
    characters. Fields are kept as strings; no numeric conversion is done.

    Args:
        fileName: Path of the text file to read.

    Returns:
        A list containing one list of string fields per line of the file.
    """
    # The context manager closes the file even if reading fails part-way;
    # the original left the handle open.
    with open(fileName) as fr:
        return [line.strip().split('\t') for line in fr]


def RandomSampling(dataMat, number):
    """Draw ``number`` distinct items from ``dataMat`` without replacement.

    Args:
        dataMat: Sequence to sample from.
        number: How many items to draw.

    Returns:
        The sampled items as a list, or None (after printing a notice) when
        ``random.sample`` rejects the request with ValueError, e.g. because
        more items were requested than are available.
    """
    try:
        return random.sample(dataMat, number)
    except ValueError:
        # Only the "bad sample size" case is handled; other errors propagate
        # instead of being hidden behind a misleading message.
        print('sample larger than population')
        return None


def RepetitionRandomSampling(dataMat, number):
    """Draw ``number`` items from ``dataMat`` with replacement.

    Args:
        dataMat: Non-empty sequence to sample from.
        number: How many items to draw.

    Returns:
        A list of ``number`` items; the same item may appear more than once.
    """
    last = len(dataMat) - 1
    return [dataMat[random.randint(0, last)] for _ in range(number)]


def SystematicSampling(dataMat, number):
    """Take every k-th item of ``dataMat``, where k = len(dataMat) // number.

    Items at positions 0, k, 2k, ... are taken until ``number`` items have
    been collected. When no positive step exists (``number`` exceeds the
    population size, or ``number`` is not positive) the request is handed to
    ``RandomSampling`` instead.

    Args:
        dataMat: Sequence to sample from.
        number: How many items to draw.

    Returns:
        A list of ``number`` items, or whatever ``RandomSampling`` returns
        for the fallback case (possibly None).
    """
    # Integer division avoids float rounding on large inputs; guarding
    # number <= 0 avoids the ZeroDivisionError the original raised for 0.
    k = len(dataMat) // number if number > 0 else 0
    if k > 0:
        return [dataMat[i * k] for i in range(number)]
    return RandomSampling(dataMat, number)


if __name__ == '__main__':
    # Per the original note, tt.txt is test.xlsx converted to plain text.
    dataMat = loadDataSet('E:/tt.txt')
    # The other samplers can be tried the same way, e.g.
    # RandomSampling(dataMat, 7) or RepetitionRandomSampling(dataMat, 4).
    result = SystematicSampling(dataMat, 30)  # sample size 30
    result = pd.DataFrame(result)
    print(result)