import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import preprocessing
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.feature_selection import SelectFromModel,RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
# Load the breast-cancer Wisconsin dataset bundled with scikit-learn.
# X: feature matrix (numeric), y: binary class labels.
dataset = datasets.load_breast_cancer()
X,y = dataset.data,dataset.target
def preProcessing(X):
    """Standardize every column of X to zero mean and unit variance."""
    return preprocessing.StandardScaler().fit_transform(X)
# Scale the raw features once, up front; every later step uses the scaled matrix.
inputVec = preProcessing(X)
def selectFeatureSubset(inputVec,y):
    """Print two candidate feature subsets for the classifier.

    1. Tree-based importances: fit an ExtraTreesClassifier and keep the
       features SelectFromModel retains (importance above its default
       threshold).
    2. Recursive feature elimination (RFE) around logistic regression,
       keeping the 10 strongest features.

    Both subsets are printed as column indices of ``inputVec``; nothing is
    returned.
    """
    clf = ExtraTreesClassifier(n_estimators=15,criterion='gini',random_state=1)
    clf.fit(inputVec,y)
    print('feature score:\n',clf.feature_importances_)
    model = SelectFromModel(clf,prefit=True)
    # Ask the selector directly for the retained column indices instead of
    # matching row-0 float values against the transformed matrix, which
    # silently picks wrong columns whenever two features share a value in
    # that row.
    print('feature index:',list(model.get_support(indices=True)))
    # n_features_to_select is keyword-only in modern scikit-learn; passing
    # it positionally raises a TypeError.
    model = RFE(LogisticRegression(),n_features_to_select=10)
    model.fit(inputVec,y)
    print('feature index:',list(np.arange(inputVec.shape[1])[model.support_]))
# Print the two candidate feature subsets (side effect only; informs the
# hard-coded column list used below).
selectFeatureSubset(inputVec,y)
def selectModelBestParams(inputVec,y):
    """Grid-search C and penalty for logistic regression on the 10
    hand-picked feature columns and print the best combination found."""
    x = inputVec[:,[7,10,13,15,20,21,22,23,26,27]]
    params = {
        'C':[0.01,0.1,1.0,10,100,1000],
        'penalty':['l1','l2']
    }
    # The default lbfgs solver supports only the l2 penalty, so every 'l1'
    # candidate in the grid would fail; liblinear handles both penalties.
    model = GridSearchCV(LogisticRegression(solver='liblinear'),params)
    model.fit(x,y)
    print('best params:\n',model.best_params_)
# Print the grid-search winner (side effect only; informs the C/penalty
# values hard-coded in trainModel).
selectModelBestParams(inputVec,y)
def trainModel(inputVec,y):
    """Fit logistic regression on the selected feature columns, print
    train/test metrics, and save a residual scatter plot to lr.png."""
    x = inputVec[:,[7,10,13,15,20,21,22,23,26,27]]
    x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=1)
    clf = LogisticRegression(C=1.0,penalty='l2')
    clf.fit(x_train,y_train)

    pred_train = clf.predict(x_train)
    print('训练集:\n')
    print('accuracy:',accuracy_score(y_train,pred_train))
    # accuracy = (162+283)/(162+283+3) = 0.978
    print(classification_report(y_train,pred_train,target_names=dataset.target_names))
    print(pd.DataFrame(confusion_matrix(y_train,pred_train)))
    # precision: 0 -> 162/(162+2)=0.99      1 -> 283/(283+8)=0.97
    # recall:    0 -> 162/(162+8)=0.95      1 -> 283/(283+2)=0.99
    # f1-score:  0 -> 2*162/(2*162+2+8)=0.97  1 -> 2*283/(2*283+2+8)=0.98

    pred_test = clf.predict(x_test)
    print('测试集:\n')
    print('accuracy:',accuracy_score(y_test,pred_test))
    # accuracy = (38+72)/(38+72+4) = 0.96
    print(classification_report(y_test,pred_test,target_names=dataset.target_names))
    print(pd.DataFrame(confusion_matrix(y_test,pred_test)))

    # One residual (actual - predicted) scatter panel per split.
    fig,ax = plt.subplots(1,2,figsize=(20,4))
    panels = [
        (ax[0],x_train,y_train,pred_train,'train set','lime',50,{}),
        (ax[1],x_test,y_test,pred_test,'test set','blue',30,{'loc':'center right'}),
    ]
    for axis,features,actual,predicted,title,colour,size,legend_kw in panels:
        axis.scatter(np.arange(features.shape[0]),actual-predicted,marker='.',color=colour,s=size)
        axis.set_title(title,fontsize=20,color='black',fontweight='bold')
        axis.grid(linestyle='--',color='gray')
        axis.legend(['predict-actual'],fontsize=16,facecolor='silver',**legend_kw)
    fig.savefig('lr.png')
    plt.show()
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import preprocessing
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.feature_selection import SelectFromModel,RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
# NOTE(review): everything from the second import block down duplicates the
# first half of this file — consider deleting one copy.
# Load the breast-cancer Wisconsin dataset bundled with scikit-learn.
dataset = datasets.load_breast_cancer()
X,y = dataset.data,dataset.target
def preProcessing(X):
    """Standardize every column of X to zero mean and unit variance."""
    return preprocessing.StandardScaler().fit_transform(X)
# Scale the raw features once, up front; every later step uses the scaled matrix.
inputVec = preProcessing(X)
def selectFeatureSubset(inputVec,y):
    """Print two candidate feature subsets for the classifier.

    1. Tree-based importances: fit an ExtraTreesClassifier and keep the
       features SelectFromModel retains (importance above its default
       threshold).
    2. Recursive feature elimination (RFE) around logistic regression,
       keeping the 10 strongest features.

    Both subsets are printed as column indices of ``inputVec``; nothing is
    returned.
    """
    clf = ExtraTreesClassifier(n_estimators=15,criterion='gini',random_state=1)
    clf.fit(inputVec,y)
    print('feature score:\n',clf.feature_importances_)
    model = SelectFromModel(clf,prefit=True)
    # Ask the selector directly for the retained column indices instead of
    # matching row-0 float values against the transformed matrix, which
    # silently picks wrong columns whenever two features share a value in
    # that row.
    print('feature index:',list(model.get_support(indices=True)))
    # n_features_to_select is keyword-only in modern scikit-learn; passing
    # it positionally raises a TypeError.
    model = RFE(LogisticRegression(),n_features_to_select=10)
    model.fit(inputVec,y)
    print('feature index:',list(np.arange(inputVec.shape[1])[model.support_]))
# Print the two candidate feature subsets (side effect only; informs the
# hard-coded column list used below).
selectFeatureSubset(inputVec,y)
def selectModelBestParams(inputVec,y):
    """Grid-search C and penalty for logistic regression on the 10
    hand-picked feature columns and print the best combination found."""
    x = inputVec[:,[7,10,13,15,20,21,22,23,26,27]]
    params = {
        'C':[0.01,0.1,1.0,10,100,1000],
        'penalty':['l1','l2']
    }
    # The default lbfgs solver supports only the l2 penalty, so every 'l1'
    # candidate in the grid would fail; liblinear handles both penalties.
    model = GridSearchCV(LogisticRegression(solver='liblinear'),params)
    model.fit(x,y)
    print('best params:\n',model.best_params_)
# Print the grid-search winner (side effect only; informs the C/penalty
# values hard-coded in trainModel).
selectModelBestParams(inputVec,y)
def trainModel(inputVec,y):
    """Fit logistic regression on the selected feature columns, print
    train/test metrics, and save a residual scatter plot to lr.png."""
    x = inputVec[:,[7,10,13,15,20,21,22,23,26,27]]
    x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=1)
    clf = LogisticRegression(C=1.0,penalty='l2')
    clf.fit(x_train,y_train)

    pred_train = clf.predict(x_train)
    print('训练集:\n')
    print('accuracy:',accuracy_score(y_train,pred_train))
    # accuracy = (162+283)/(162+283+3) = 0.978
    print(classification_report(y_train,pred_train,target_names=dataset.target_names))
    print(pd.DataFrame(confusion_matrix(y_train,pred_train)))
    # precision: 0 -> 162/(162+2)=0.99      1 -> 283/(283+8)=0.97
    # recall:    0 -> 162/(162+8)=0.95      1 -> 283/(283+2)=0.99
    # f1-score:  0 -> 2*162/(2*162+2+8)=0.97  1 -> 2*283/(2*283+2+8)=0.98

    pred_test = clf.predict(x_test)
    print('测试集:\n')
    print('accuracy:',accuracy_score(y_test,pred_test))
    # accuracy = (38+72)/(38+72+4) = 0.96
    print(classification_report(y_test,pred_test,target_names=dataset.target_names))
    print(pd.DataFrame(confusion_matrix(y_test,pred_test)))

    # One residual (actual - predicted) scatter panel per split.
    fig,ax = plt.subplots(1,2,figsize=(20,4))
    panels = [
        (ax[0],x_train,y_train,pred_train,'train set','lime',50,{}),
        (ax[1],x_test,y_test,pred_test,'test set','blue',30,{'loc':'center right'}),
    ]
    for axis,features,actual,predicted,title,colour,size,legend_kw in panels:
        axis.scatter(np.arange(features.shape[0]),actual-predicted,marker='.',color=colour,s=size)
        axis.set_title(title,fontsize=20,color='black',fontweight='bold')
        axis.grid(linestyle='--',color='gray')
        axis.legend(['predict-actual'],fontsize=16,facecolor='silver',**legend_kw)
    fig.savefig('lr.png')
    plt.show()
# Fit, evaluate, and plot the final model on the selected features.
trainModel(inputVec,y)