from numpy import *
def loadDataSet():
    """Parse the whitespace-separated file 'testSet.txt' into a data matrix
    and a label list.

    Each line holds two feature values and an integer class label.  A
    constant 1.0 is prepended to every sample so the bias term w0 can be
    learned together with the feature weights.

    Returns:
        (dataMat, labelMat): list of [1.0, x1, x2] rows, list of int labels.
    """
    dataMat = []
    labelMat = []
    # 'with' guarantees the handle is closed even if a line fails to parse
    # (the original opened the file and never closed it).
    with open('testSet.txt') as fr:
        for line in fr:
            lineArr = line.strip().split()
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat
def sigmoid(inX):
    """Logistic function 1 / (1 + e^-x); works on scalars and numpy arrays."""
    denom = 1.0 + exp(-inX)
    return 1.0 / denom
def gradAscent(dataMatIn, classLabels):
    """Batch gradient ascent for logistic regression.

    Args:
        dataMatIn: m x n list-of-lists of samples (first column is the 1.0
            bias term).
        classLabels: length-m sequence of 0/1 labels.

    Returns:
        n x 1 numpy matrix of fitted weights.
    """
    features = mat(dataMatIn)        # m x n matrix of inputs
    targets = mat(classLabels).T     # labels as an m x 1 column vector
    sampleCount, featureCount = shape(features)
    stepSize = 0.001                 # fixed learning rate
    maxCycles = 500                  # fixed number of full-batch updates
    weights = ones((featureCount, 1))
    # Each pass moves the weights along the gradient of the log-likelihood,
    # computed from the whole data set at once.
    for _ in range(maxCycles):
        predictions = sigmoid(features * weights)   # m x 1 column vector
        residual = targets - predictions
        weights = weights + stepSize * features.T * residual
    return weights
def plotBestFit(weights):
    """Scatter-plot the two classes of 'testSet.txt' and draw the decision
    boundary implied by the fitted logistic-regression weights.

    Args:
        weights: length-3 weight vector (w0 bias, w1, w2).  A numpy matrix
            as returned by gradAscent is flattened automatically.
    """
    import matplotlib.pyplot as plt  # bug fix: 'pylob' was a typo for 'pyplot'
    weights = asarray(weights).ravel()  # accept matrix, array or plain list
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    n = shape(dataArr)[0]  # number of samples
    xcord1 = []; ycord1 = []  # positive class
    xcord2 = []; ycord2 = []  # negative class
    for i in range(n):
        # Columns 1 and 2 are the two features (column 0 is the 1.0 bias).
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    # Decision boundary: w0 + w1*x + w2*y = 0  =>  y = (-w0 - w1*x) / w2
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
def stocGradAscent0(dataMatrix, classLabels):
    """One sweep of stochastic gradient ascent over the data set.

    Visits each of the m samples once, in order, and updates the weight
    vector from that single sample's prediction error.

    Args:
        dataMatrix: m x n numpy array of samples.
        classLabels: length-m sequence of 0/1 labels.

    Returns:
        length-n numpy array of fitted weights.
    """
    sampleCount, featureCount = shape(dataMatrix)
    learningRate = 0.01
    weights = ones(featureCount)
    for sampleIdx in range(sampleCount):
        # One sample at a time, so the prediction is a scalar, not a vector.
        prediction = sigmoid(sum(dataMatrix[sampleIdx] * weights))
        err = classLabels[sampleIdx] - prediction
        weights = weights + learningRate * err * dataMatrix[sampleIdx]
    return weights
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Improved stochastic gradient ascent.

    Improvements over stocGradAscent0:
      * numIter full passes over the data instead of one;
      * the step size shrinks as iterations progress (never reaching 0
        thanks to the +0.01 floor), which damps oscillation of the weights;
      * within each pass, samples are visited in random order, each one
        exactly once.

    Args:
        dataMatrix: m x n numpy array of samples.
        classLabels: length-m sequence of 0/1 labels.
        numIter: number of passes over the whole data set (default 150).

    Returns:
        length-n numpy array of fitted weights.
    """
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        # list() is required: in Python 3, range() does not support 'del'.
        dataIndex = list(range(m))
        for i in range(m):
            # Step size decays with the pass number and the position in the pass.
            alpha = 4 / (1.0 + j + i) + 0.01
            randIndex = int(random.uniform(0, len(dataIndex)))
            # Bug fix: index through dataIndex so every sample is used exactly
            # once per pass (as the 'del' below intends).  The original used
            # dataMatrix[randIndex] directly, which oversampled early rows and
            # never touched some later ones.
            chosen = dataIndex[randIndex]
            h = sigmoid(sum(dataMatrix[chosen] * weights))
            error = classLabels[chosen] - h
            weights = weights + alpha * error * dataMatrix[chosen]
            del dataIndex[randIndex]  # mark this sample as used for this pass
    return weights
def classifyVector(inX, weights):
    """Classify one sample with fitted regression weights.

    Args:
        inX: feature vector (numpy array) including the bias term.
        weights: weight vector of the same length.

    Returns:
        1.0 if the predicted probability exceeds 0.5, else 0.0.
    """
    probability = sigmoid(sum(inX * weights))
    return 1.0 if probability > 0.5 else 0.0
def colicTest():
    """Train on the horse-colic training file and report the error rate on
    the horse-colic test file.

    Each line of both files holds 21 tab-separated float features followed
    by the class label.

    Returns:
        float: fraction of misclassified test samples (also printed).
    """
    trainingSet = []
    trainingLabels = []
    # 'with' closes the files even on a parse error; the original leaked
    # both handles.
    with open('horseColicTraining.txt') as frTrain:
        for line in frTrain:
            features, label = _parseColicLine(line)
            trainingSet.append(features)
            trainingLabels.append(label)
    # Bug fix: the original line ended with a stray ':' — a syntax error.
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 500)
    errorCount = 0.0
    numTestVec = 0.0
    with open('horseColicTest.txt') as frTest:
        for line in frTest:
            numTestVec += 1.0
            features, label = _parseColicLine(line)
            if int(classifyVector(array(features), trainWeights)) != int(label):
                errorCount += 1
    errorRate = float(errorCount) / numTestVec
    print("the error rate of this test is: %f" % errorRate)
    return errorRate


def _parseColicLine(line):
    """Split one tab-separated record into (list of 21 floats, float label)."""
    currLine = line.strip().split('\t')
    lineArr = [float(currLine[i]) for i in range(21)]
    return lineArr, float(currLine[21])
def multiTest():
    """Run colicTest 10 times and print the average error rate (training is
    stochastic, so individual runs vary)."""
    numTests = 10
    errorSum = 0.0
    for _ in range(numTests):
        errorSum += colicTest()
    # print() call form works identically under Python 2 and 3 for a single
    # argument; the original Python-2 print statement is a syntax error in 3.
    print("after %d iterations the average error rate is: %f"
          % (numTests, errorSum / float(numTests)))
# Machine Learning in Action --- Logistic Regression
# Adapted from blog.csdn.net/carl95271/article/details/80765916