# -*- coding: utf-8 -*-
"""
Created on Wed Apr 25 17:07:55 2018

@author: eagle

Train a LogisticRegression and an SGDClassifier on the UCI
breast-cancer-wisconsin data set and print accuracy plus a
precision/recall/F1 report for each model.
"""
import io

import numpy as np
import pandas as pd
import requests
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# ``sklearn.cross_validation`` no longer exists in current scikit-learn;
# ``train_test_split`` lives in ``sklearn.model_selection``.  Fall back to the
# old module only for installations that predate ``model_selection``.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Column names for the raw data file (it ships without a header row).
column_names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

# Download the data set.  '?' marks a missing value in the file; declaring it
# via ``na_values`` turns it into NaN while parsing, so every feature column
# is read as a numeric dtype instead of being left as strings.
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',
    names=column_names,
    na_values='?',
)

# Drop every row that has a missing value in any column.
data = data.dropna(how='any')

# Prepare the train/test data: a random 25% of the rows is held out for
# testing and 75% is used for training.  Columns 1..9 are the features
# (the sample id in column 0 is skipped); column 10 is the target.
X_train, X_test, y_train, y_test = train_test_split(
    data[column_names[1:10]],
    data[column_names[10]],
    test_size=0.25,
    random_state=33,
)

# Class distribution of the training targets.
class_train = y_train.value_counts()
# Class distribution of the test targets.
class_test = y_test.value_counts()

# =============================================================================
# Training and prediction
# =============================================================================
# Standardize the features to zero mean and unit variance.  The scaler is
# fitted on the training split only and then applied to the test split.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Initialize the two classifiers.
# NOTE: SGDClassifier is created without ``random_state``, so its scores can
# differ from one run to the next; pass ``random_state=<int>`` if repeatable
# output is required.
lr = LogisticRegression()
sgdc = SGDClassifier()

# Fit LogisticRegression and predict on the test split.
lr.fit(X_train, y_train)
lr_y_predict = lr.predict(X_test)

# Fit SGDClassifier and predict on the test split.
sgdc.fit(X_train, y_train)
sgdc_y_predict = sgdc.predict(X_test)

# =============================================================================
# Performance analysis (accuracy, recall, precision, F1)
# =============================================================================
# Accuracy = correctly predicted samples / all samples.
print('Accuracy of LR Classifier:', lr.score(X_test, y_test))
# The report adds the other three metrics:
#   recall    = TP / (TP + FN)
#   precision = TP / (TP + FP)
#   F1        = 2 / (1 / recall + 1 / precision)
print(classification_report(y_test, lr_y_predict,
                            target_names=['Benign', 'Malignant']))

print('Accuracy of SGD Classifier:', sgdc.score(X_test, y_test))
# Same metrics (recall, precision, F1) for the SGD model.
print(classification_report(y_test, sgdc_y_predict,
                            target_names=['Benign', 'Malignant']))
# 以上代码实现了《Python机器学习及实践》一书中的示例。不过从运行结果看,使用随机梯度下降(SGDClassifier,未设置 random_state)时,每次运行的结果是有差别的。