http://sklearn.apachecn.org/#/
类型 | 获取方式 |
---|---|
自带的小数据集 | sklearn.datasets.load_... |
在线下载的数据集 | sklearn.datasets.fetch_... |
计算机生成的数据集 | sklearn.datasets.make_... |
svmlight/libsvm格式的数据集 | sklearn.datasets.load_svmlight_file(...) |
mldata.org在线下载数据集 | sklearn.datasets.fetch_mldata(...)(已在 scikit-learn 0.22 中移除,新版本请改用 sklearn.datasets.fetch_openml(...)) |
自带的小数据集:
- 鸢尾花数据集:
load_iris()
可用于分类和聚类
- 乳腺癌数据集:
load_breast_cancer()
可用于分类
- 手写数字数据集:
load_digits()
可用于分类
- 糖尿病数据集:
load_diabetes()
可用于回归
- 波士顿房价数据集:
load_boston()
可用于回归
- 体能训练数据集:
load_linnerud()
可用于回归
- 图像数据集:
load_sample_image(name)
计算机生成的数据集:
- make_blobs
可用于聚类和分类
- make_classification
可用于分类
- make_circles
可用于分类
- make_moons
可用于分类
- make_multilabel_classification
可用于多标签分类
- make_regression
可用于回归
自带的小数据集
# Import the libraries and load the small datasets bundled with scikit-learn.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

# NOTE(review): load_boston is understood to have been removed in
# scikit-learn 1.2 -- confirm against the installed version before running.
iris = datasets.load_iris()      # iris flowers, for classification
digits = datasets.load_digits()  # handwritten digits, for classification
boston = datasets.load_boston()  # Boston house prices, for regression

# Inspect the fields of the iris dataset; the loader's dict-like result
# exposes each key as an attribute as well.
iris.data
iris.target
iris.feature_names
iris.target_names
计算机生成的数据集
# 生成数据 make_blobs
import pandas as pd
%matplotlib inline
import matplotlib
from sklearn.datasets.samples_generator import make_blobs
center=[[1,1],[-1,-1],[1,-1]]
cluster_std=0.3
X,labels=make_blobs(n_samples=200,centers=center,n_features=2, cluster_std=cluster_std,random_state = 0)
print('X.shape',X.shape)
print("labels",set(labels))
df = pd.DataFrame(np.c_[X,labels],columns = ['feature1','feature2','labels'])
df['labels'] = df['labels'].astype('i2')
#mycolormap = matplotlib.colors.ListedColormap(['red','cyan','m agenta'], N=3)
#matplotlib常用colormap:'jet','rainbow','hsv'
df.plot.scatter('feature1','feature2', s = 100, c = list(df['labels']),
cmap = 'rainbow',colorbar = False, alpha = 0.8,title = 'dataset by make_blobs')
# Generate a two-class dataset with make_classification and plot it.
# Import from the public sklearn.datasets namespace: the
# sklearn.datasets.samples_generator module was deprecated in scikit-learn 0.22
# and removed in 0.24, so the old import path fails on current releases.
from sklearn.datasets import make_classification

X, labels = make_classification(n_samples=300, n_features=2, n_classes=2,
                                n_redundant=0, n_informative=2,
                                random_state=0, n_clusters_per_class=2)
# Add a reproducible uniform offset in [0, 2) to every coordinate.
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)

# Put features and labels side by side in one frame for plotting.
df = pd.DataFrame(np.c_[X, labels], columns=['feature1', 'feature2', 'labels'])
# np.c_ yields a single float array, so cast the label column back to int16.
df['labels'] = df['labels'].astype('i2')

# Alternative: build a custom colormap instead of a named one, e.g.
#   matplotlib.colors.ListedColormap(['red', 'cyan', 'magenta'], N=3)
# Commonly used matplotlib colormaps: 'jet', 'rainbow', 'hsv'
df.plot.scatter('feature1', 'feature2', s=100, c=list(df['labels']),
                cmap='rainbow', colorbar=False, alpha=0.8,
                title='dataset by make_classification')
# Generate a two-class concentric-circles dataset with make_circles and plot it.
# Import from the public sklearn.datasets namespace: the
# sklearn.datasets.samples_generator module was deprecated in scikit-learn 0.22
# and removed in 0.24, so the old import path fails on current releases.
from sklearn.datasets import make_circles

X, labels = make_circles(n_samples=200, noise=0.2, factor=0.2, random_state=1)
print("X.shape:",X.shape)
print("labels:",set(labels))

# Put features and labels side by side in one frame for plotting.
df = pd.DataFrame(np.c_[X, labels], columns=['feature1', 'feature2', 'labels'])
# np.c_ yields a single float array, so cast the label column back to int16.
df['labels'] = df['labels'].astype('i2')
df.plot.scatter('feature1', 'feature2', s=100, c=list(df['labels']),
                cmap='rainbow', colorbar=False, alpha=0.8,
                title='dataset by make_circles')
# Generate a one-feature regression dataset with make_regression and plot it.
# Import from the public sklearn.datasets namespace: the
# sklearn.datasets.samples_generator module was deprecated in scikit-learn 0.22
# and removed in 0.24, so the old import path fails on current releases.
from sklearn.datasets import make_regression

# coef=True makes the call return a third value, bound here to `coef`.
# random_state=None leaves the generator unseeded, so the data is expected to
# differ from run to run.
X, Y, coef = make_regression(n_samples=100, n_features=1, n_informative=1,
                             n_targets=1, bias=5, effective_rank=None,
                             tail_strength=0, noise=10,
                             shuffle=True, coef=True, random_state=None)
df = pd.DataFrame(np.c_[X, Y], columns=['x', 'y'])
# Single-letter colour codes: red 'r', green 'g', blue 'b', cyan 'c', magenta 'm'
df.plot('x', 'y', kind='scatter', s=50, c='m', edgecolor='k')