理论推导
参考:回归(二):Logistic回归
实例
iris分类
"""
Author:wucng
Time: 20200114
Summary: 逻辑回归对iris数据分类
源代码: https://github.com/wucng/MLAndDL
参考:https://www.jianshu.com/p/ba60f232e9da
"""
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np
import scipy,pickle,os,time
import pandas as pd
def loadData(dataPath: str) -> tuple:
    """Load the headerless iris CSV, encode species labels, min-max scale features.

    Args:
        dataPath: path to ``iris.data`` (comma separated, no header row).

    Returns:
        (X, y): the first 100 samples only (setosa vs. versicolor, so the
        task is binary) — X scaled per-column to [0, 1], y integer labels
        (0 = setosa, 1 = versicolor, 2 = virginica).
    """
    # header=None (not the long-removed header=-1) tells pandas the file has
    # no header row, so the explicit column names below are used.
    df = pd.read_csv(dataPath, sep=",", header=None,
                     names=["sepal_length", "sepal_width", "petal_length", "petal_width", "label"])
    df = df.fillna(0)
    # Map the three species names onto integer class ids.
    df.replace("Iris-setosa", 0, inplace=True)
    df.replace("Iris-versicolor", 1, inplace=True)
    df.replace("Iris-virginica", 2, inplace=True)
    X = df.drop("label", axis=1)
    y = df.label
    # Min-max normalize each feature column to [0, 1].
    X = (X - np.min(X, axis=0)) / (np.max(X, axis=0) - np.min(X, axis=0))
    return (X.to_numpy()[:100], y.to_numpy()[:100])
def sigmoid(x):
    """Logistic function sigma(x) = 1 / (1 + e^{-x}); maps reals into (0, 1)."""
    expo = np.exp(-x)
    return 1 / (1 + expo)
def sigmoid_inv(y):
    """Logit — the inverse of the sigmoid: log(y) - log(1 - y), y in (0, 1)."""
    log_p = np.log(y)
    log_q = np.log(1 - y)
    return log_p - log_q
class LogisticRegressionSelf(object):
    """Logistic regression fitted in closed form via the pseudo-inverse.

    Inverting the link function: g(Xw) = y  =>  Xw = g_inv(y)  =>
    w = pinv(X) @ g_inv(y), with g the sigmoid. Labels must therefore lie
    strictly inside (0, 1) — the caller remaps 0/1 to e.g. 0.2/0.8 first,
    otherwise sigmoid_inv produces +/-inf.
    """

    def __init__(self, save_file="model.npy"):
        # Cached-weights path; delete the file to force a refit.
        self.save_file = save_file

    def __fit(self, X, y):
        """Solve one mini-batch in closed form; returns weights incl. bias."""
        X = np.hstack((np.ones((len(X), 1)), X))  # prepend bias column
        return np.dot(np.linalg.pinv(X), sigmoid_inv(y))

    def fit(self, X, y, batch_size=32, epochs=1):
        """Average the per-batch closed-form solutions over all epochs."""
        if not os.path.exists(self.save_file):
            length = len(y)
            # Ceil division so the trailing partial batch is NOT dropped
            # (floor division also produced zero batches — and NaN weights
            # from np.mean([]) — whenever length < batch_size).
            m = (length + batch_size - 1) // batch_size
            last_w = []
            for epoch in range(epochs):
                w = []
                index = np.arange(0, length)
                np.random.seed(epoch)  # deterministic shuffle per epoch
                np.random.shuffle(index)
                new_X = X[index]
                new_y = y[index]
                for i in range(m):
                    start = i * batch_size
                    end = min((i + 1) * batch_size, length)  # clamp last batch
                    w.append(self.__fit(new_X[start:end], new_y[start:end]))
                last_w.append(np.mean(w, 0))
            np.save(self.save_file, np.mean(last_w, 0))
        self.w = np.load(self.save_file)

    def predict(self, X):
        """Return hard 0/1 predictions (threshold 0.5) as float32."""
        X = np.hstack((np.ones((len(X), 1)), X))
        return (sigmoid(np.dot(X, self.w)) > 0.5).astype(np.float32)

    def accuracy(self, y_true, y_pred):
        """Fraction of matching entries, rounded to 5 decimals."""
        return round(np.sum(y_pred == y_true) / len(y_true), 5)
class LogisticRegressionSelf2(object):
    """Logistic regression fitted by mini-batch gradient descent."""

    def __init__(self, save_file="model.ckpt"):
        # Pickled-checkpoint path holding {"w", "b"}; delete to force a refit.
        self.save_file = save_file

    def __fit(self, X, y, w, b, lr=1e-3):
        """One gradient step on a mini-batch; returns updated (w, b)."""
        diff = sigmoid(np.dot(X, w) + b) - y  # prediction error
        w -= lr * (1 / len(y)) * np.dot(np.transpose(X), diff)
        b -= lr * np.mean(diff)
        return w, b

    def fit(self, X, y, batch_size=32, epochs=5000, lr=5e-4):
        """Train (unless a checkpoint exists), then load w and b."""
        if not os.path.exists(self.save_file):
            length = len(y)
            # Ceil division: keep the trailing partial batch instead of
            # silently dropping those samples every epoch.
            m = (length + batch_size - 1) // batch_size
            w = np.random.random((len(X[0]), 1))
            b = np.random.random((1, 1))
            for epoch in range(epochs):
                index = np.arange(0, length)
                np.random.seed(epoch)  # deterministic shuffle per epoch
                np.random.shuffle(index)
                new_X = X[index]
                new_y = y[index]
                for i in range(m):
                    start = i * batch_size
                    end = min((i + 1) * batch_size, length)  # clamp last batch
                    w, b = self.__fit(new_X[start:end], new_y[start:end], w, b, lr)
            # 'with' guarantees the checkpoint handle is closed even on error
            # (the original leaked the file objects from bare open() calls).
            with open(self.save_file, "wb") as f:
                pickle.dump({"w": w, "b": b}, f)
        with open(self.save_file, "rb") as f:
            data = pickle.load(f)
        self.w = data["w"]
        self.b = data["b"]

    def predict(self, X):
        """Return hard 0/1 predictions (threshold 0.5) as float."""
        return (sigmoid(np.dot(X, self.w) + self.b) > 0.5).astype(float)

    def accuracy(self, y_true, y_pred):
        """Fraction of matching entries, rounded to 5 decimals."""
        return round(np.sum(y_pred == y_true) / len(y_true), 5)
if __name__ == "__main__":
    dataPath = "../../dataset/iris.data"

    # --- model 1: closed-form pseudo-inverse solution ---
    X, y = loadData(dataPath)
    # Cast to float FIRST: assigning 0.2/0.8 into an integer label array
    # silently truncates back to 0, and sigmoid_inv(0) = -inf would poison
    # the pseudo-inverse fit.
    y = y.astype(float)
    y[y == 0] = 0.2  # soften hard labels so the inverse sigmoid stays finite
    y[y == 1] = 0.8
    if len(y.shape) == 1:
        y = y[..., None]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)
    start = time.time()
    clf = LogisticRegressionSelf()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Threshold the softened test labels back to 0/1 before scoring.
    print("cost time:%.5f acc:%.5f" % (time.time() - start, clf.accuracy((y_test > 0.5).astype(float), y_pred)))

    # --- model 2: mini-batch gradient descent ---
    X, y = loadData(dataPath)
    if len(y.shape) == 1:
        y = y[..., None]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)
    start = time.time()
    clf = LogisticRegressionSelf2()
    clf.fit(X_train, y_train, batch_size=16, epochs=1000, lr=1e-3)
    y_pred = clf.predict(X_test)
    print("cost time:%.5f acc:%.5f" % (time.time() - start, clf.accuracy(y_test, y_pred)))

    # --- baseline: sklearn LogisticRegression ---
    start = time.time()
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("cost time:%.5f acc:%.5f" % (time.time() - start, accuracy_score(y_test, y_pred)))
titanic分类
"""
Author:wucng
Time: 20200114
Summary: 逻辑回归对titanic数据分类
源代码: https://github.com/wucng/MLAndDL
参考:https://www.jianshu.com/p/ba60f232e9da
"""
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np
import scipy,pickle,os,time
import pandas as pd
def loadData(dataPath: str) -> tuple:
    """Load the Titanic train CSV, encode categoricals, min-max scale features.

    Args:
        dataPath: path to the Kaggle Titanic ``train.csv`` (has a header row).

    Returns:
        (X, y): X — numeric feature matrix (Pclass, Sex, Age, SibSp, Parch,
        Fare, Embarked) scaled per-column to [0, 1]; y — the 0/1 Survived labels.
    """
    df = pd.read_csv(dataPath, sep=",")
    # Impute: median age; most common port ('S') for missing embarkation.
    df["Age"] = df["Age"].fillna(df["Age"].median())
    df['Embarked'] = df['Embarked'].fillna('S')
    # Encode sex as 0/1 (whole-cell replace, so names are untouched).
    df.replace("male", 0, inplace=True)
    df.replace("female", 1, inplace=True)
    # Encode embarkation port as 0/1/2.
    df.loc[df["Embarked"] == "S", "Embarked"] = 0
    df.loc[df["Embarked"] == "C", "Embarked"] = 1
    df.loc[df["Embarked"] == "Q", "Embarked"] = 2
    # .loc assignment leaves the column as object dtype; force it numeric so
    # the min-max normalization below operates on a purely numeric frame.
    df["Embarked"] = df["Embarked"].astype(int)
    X = df.drop(["PassengerId", "Survived", "Name", "Ticket", "Cabin"], axis=1)
    y = df.Survived
    # Min-max normalize every feature column to [0, 1].
    X = (X - np.min(X, axis=0)) / (np.max(X, axis=0) - np.min(X, axis=0))
    return (X.to_numpy(), y.to_numpy())
def sigmoid(x):
    """Standard logistic curve 1 / (1 + e^{-x}); output lies in (0, 1)."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def sigmoid_inv(y):
    """Inverse of the sigmoid (the logit): log-odds of probability y in (0, 1)."""
    log_odds_num = np.log(y)
    log_odds_den = np.log(1 - y)
    return log_odds_num - log_odds_den
class LogisticRegressionSelf(object):
    """Logistic regression fitted in closed form via the pseudo-inverse.

    Inverting the link function: g(Xw) = y  =>  Xw = g_inv(y)  =>
    w = pinv(X) @ g_inv(y), with g the sigmoid. Labels must lie strictly
    inside (0, 1) — the caller remaps 0/1 to e.g. 0.2/0.8 before fitting,
    otherwise sigmoid_inv produces +/-inf.
    """

    def __init__(self, save_file="model.npy"):
        # Cached-weights path; delete the file to force a refit.
        self.save_file = save_file

    def __fit(self, X, y):
        """Solve one mini-batch in closed form; returns weights incl. bias."""
        X = np.hstack((np.ones((len(X), 1)), X))  # prepend bias column
        return np.dot(np.linalg.pinv(X), sigmoid_inv(y))

    def fit(self, X, y, batch_size=32, epochs=1):
        """Average the per-batch closed-form solutions over all epochs."""
        if not os.path.exists(self.save_file):
            length = len(y)
            # Ceil division so the trailing partial batch is NOT dropped
            # (floor division also produced zero batches — and NaN weights
            # from np.mean([]) — whenever length < batch_size).
            m = (length + batch_size - 1) // batch_size
            last_w = []
            for epoch in range(epochs):
                w = []
                index = np.arange(0, length)
                np.random.seed(epoch)  # deterministic shuffle per epoch
                np.random.shuffle(index)
                new_X = X[index]
                new_y = y[index]
                for i in range(m):
                    start = i * batch_size
                    end = min((i + 1) * batch_size, length)  # clamp last batch
                    w.append(self.__fit(new_X[start:end], new_y[start:end]))
                last_w.append(np.mean(w, 0))
            np.save(self.save_file, np.mean(last_w, 0))
        self.w = np.load(self.save_file)

    def predict(self, X):
        """Return hard 0/1 predictions (threshold 0.5) as float32."""
        X = np.hstack((np.ones((len(X), 1)), X))
        return (sigmoid(np.dot(X, self.w)) > 0.5).astype(np.float32)

    def accuracy(self, y_true, y_pred):
        """Fraction of matching entries, rounded to 5 decimals."""
        return round(np.sum(y_pred == y_true) / len(y_true), 5)
class LogisticRegressionSelf2(object):
    """Logistic regression fitted by mini-batch gradient descent."""

    def __init__(self, save_file="model.ckpt"):
        # Pickled-checkpoint path holding {"w", "b"}; delete to force a refit.
        self.save_file = save_file

    def __fit(self, X, y, w, b, lr=1e-3):
        """One gradient step on a mini-batch; returns updated (w, b)."""
        diff = sigmoid(np.dot(X, w) + b) - y  # prediction error
        w -= lr * (1 / len(y)) * np.dot(np.transpose(X), diff)
        b -= lr * np.mean(diff)
        return w, b

    def fit(self, X, y, batch_size=32, epochs=5000, lr=5e-4):
        """Train (unless a checkpoint exists), then load w and b."""
        if not os.path.exists(self.save_file):
            length = len(y)
            # Ceil division: keep the trailing partial batch instead of
            # silently dropping those samples every epoch.
            m = (length + batch_size - 1) // batch_size
            w = np.random.random((len(X[0]), 1))
            b = np.random.random((1, 1))
            for epoch in range(epochs):
                index = np.arange(0, length)
                np.random.seed(epoch)  # deterministic shuffle per epoch
                np.random.shuffle(index)
                new_X = X[index]
                new_y = y[index]
                for i in range(m):
                    start = i * batch_size
                    end = min((i + 1) * batch_size, length)  # clamp last batch
                    w, b = self.__fit(new_X[start:end], new_y[start:end], w, b, lr)
            # 'with' guarantees the checkpoint handle is closed even on error
            # (the original leaked the file objects from bare open() calls).
            with open(self.save_file, "wb") as f:
                pickle.dump({"w": w, "b": b}, f)
        with open(self.save_file, "rb") as f:
            data = pickle.load(f)
        self.w = data["w"]
        self.b = data["b"]

    def predict(self, X):
        """Return hard 0/1 predictions (threshold 0.5) as float."""
        return (sigmoid(np.dot(X, self.w) + self.b) > 0.5).astype(float)

    def accuracy(self, y_true, y_pred):
        """Fraction of matching entries, rounded to 5 decimals."""
        return round(np.sum(y_pred == y_true) / len(y_true), 5)
if __name__ == "__main__":
    dataPath = "../../dataset/titannic/train.csv"

    # --- model 1: closed-form pseudo-inverse solution ---
    X, y = loadData(dataPath)
    # Cast to float FIRST: assigning 0.2/0.8 into the integer Survived array
    # silently truncates back to 0, and sigmoid_inv(0) = -inf would poison
    # the pseudo-inverse fit.
    y = y.astype(float)
    y[y == 0] = 0.2  # soften hard labels so the inverse sigmoid stays finite
    y[y == 1] = 0.8
    if len(y.shape) == 1:
        y = y[..., None]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=40)
    start = time.time()
    clf = LogisticRegressionSelf()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Threshold the softened test labels back to 0/1 before scoring.
    print("cost time:%.5f acc:%.5f" % (time.time() - start, clf.accuracy((y_test > 0.5).astype(float), y_pred)))

    # --- model 2: mini-batch gradient descent ---
    X, y = loadData(dataPath)
    if len(y.shape) == 1:
        y = y[..., None]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=40)
    start = time.time()
    clf = LogisticRegressionSelf2()
    clf.fit(X_train, y_train, batch_size=16, epochs=1000, lr=1e-3)
    y_pred = clf.predict(X_test)
    print("cost time:%.5f acc:%.5f" % (time.time() - start, clf.accuracy(y_test, y_pred)))

    # --- baseline: sklearn LogisticRegression ---
    start = time.time()
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("cost time:%.5f acc:%.5f" % (time.time() - start, accuracy_score(y_test, y_pred)))