1、sklearn.datasets中的乳腺癌数据集
# Imports for this section, grouped up front.
# NOTE: matplotlib.pyplot replaces the discouraged matplotlib.pylab interface;
# every plt.* call used below exists in pyplot with the same behavior.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load the breast-cancer dataset bundled with scikit-learn and inspect it.
cancer = load_breast_cancer()
print(cancer.keys())
# dict_keys(['data', 'target_names', 'feature_names', 'DESCR', 'filename', 'target'])
print(cancer.data.shape)  # (569, 30)

# Count the samples in each class by pairing every class name with its
# frequency in the target vector.
class_counts = dict(zip(cancer.target_names, np.bincount(cancer.target)))
print("sample counts per class: \n{}".format(class_counts))
# Of the 569 data points, 212 are labelled malignant and 357 benign.
# sample counts per class:
# {'benign': 357, 'malignant': 212}

print(cancer.feature_names)
# ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
#  'mean smoothness' 'mean compactness' 'mean concavity'
#  'mean concave points' 'mean symmetry' 'mean fractal dimension'
#  'radius error' 'texture error' 'perimeter error' 'area error'
#  'smoothness error' 'compactness error' 'concavity error'
#  'concave points error' 'symmetry error' 'fractal dimension error'
#  'worst radius' 'worst texture' 'worst perimeter' 'worst area'
#  'worst smoothness' 'worst compactness' 'worst concavity'
#  'worst concave points' 'worst symmetry' 'worst fractal dimension']

# Stratify on the target so both splits keep the class proportions; the
# fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    stratify=cancer.target,
    random_state=66,
)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

# Fit one k-nearest-neighbors classifier per k in 1..10 and record its
# accuracy on both the training and the test split.
training_accuracy = []
test_accuracy = []
neighbors_settings = range(1, 11)
for n_neighbors in neighbors_settings:
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    training_accuracy.append(knn.score(X_train, y_train))
    test_accuracy.append(knn.score(X_test, y_test))

# Plot both accuracy curves against the number of neighbors.
plt.plot(neighbors_settings, training_accuracy, label='training accuracy')
plt.plot(neighbors_settings, test_accuracy, label='test_accuracy')
plt.ylabel('accuracy')
plt.xlabel('n_neighbors')
plt.legend()
2、boston房价
# Load the Boston housing data as an object exposing ``.data``.
#
# ``load_boston`` was deprecated in scikit-learn 1.0 and removed in 1.2, so a
# plain import fails on current installs. Use the loader when it exists and
# otherwise rebuild the same arrays from the original source, following the
# recipe given in the scikit-learn deprecation notice.
try:
    from sklearn.datasets import load_boston
except ImportError:
    import numpy as np
    import pandas as pd
    from sklearn.utils import Bunch

    # The source file has a 22-line header; each record then spans two
    # physical lines (11 values followed by 3), hence the even/odd slicing.
    # NOTE: this branch needs network access.
    _raw_df = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    boston = Bunch(
        data=np.hstack([_raw_df.values[::2, :], _raw_df.values[1::2, :2]]),
        target=_raw_df.values[1::2, 2],
        feature_names=np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ]),
    )
else:
    boston = load_boston()

print(boston.data.shape)  # (506, 13)