版权声明:请多多关注博主哟~ https://blog.csdn.net/qq_37865996/article/details/87830474
逻辑回归算法常常用来估计某种事物的可能性,可以用来回归,也可以用来分类。
Logistic回归虽然名字叫“回归”,但却是一种分类学习方法。使用场景大概有两个:第一用来预测,第二寻找因变量的影响因素。逻辑回归(Logistic Regression, LR)又称为逻辑回归分析,是分类和预测算法中的一种。通过历史数据的表现对未来结果发生的概率进行预测。例如,我们可以将购买的概率设置为因变量,将用户的特征属性,例如性别、年龄、注册时间等设置为自变量。根据特征属性预测购买的概率。逻辑回归与回归分析有很多相似之处,在开始介绍逻辑回归之前我们先来看下回归分析。
回归分析用来描述自变量X和因变量Y之间的关系,或者说自变量X对因变量Y的影响程度,并对因变量Y进行预测。其中因变量是我们希望获得的结果,自变量是影响结果的潜在因素,自变量可以有一个,也可以有多个。一个自变量的叫做一元回归分析,超过一个自变量的叫做多元回归分析。
1.使用逻辑回归算法检测Java溢出攻击
# -*- coding:utf-8 -*-
import re
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import os
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn import linear_model, datasets
def load_one_flle(filename):
    """Return the first line of *filename* without its newline.

    Only the first line is read; any further lines in the file are ignored.
    An empty file yields an empty string.

    Args:
        filename: Path of the text file to read.

    Returns:
        The first line with leading/trailing ``'\\n'`` characters stripped.
    """
    with open(filename) as f:
        return f.readline().strip('\n')
# Load the normal (non-attack) samples of the ADFA-LD data set.
def load_adfa_training_files(rootdir):
    """Load every regular file directly inside *rootdir* as a normal sample.

    Sub-directories are skipped (the scan is not recursive). Each loaded file
    contributes the string returned by ``load_one_flle`` and the label ``0``.

    Args:
        rootdir: Directory that holds the training trace files.

    Returns:
        A tuple ``(x, y)`` where ``x`` is the list of loaded samples and ``y``
        is a list of ``0`` labels of the same length.
    """
    x = []
    y = []
    for entry in os.listdir(rootdir):
        path = os.path.join(rootdir, entry)
        if not os.path.isfile(path):
            continue
        x.append(load_one_flle(path))
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Load file(%s)" % path)
        y.append(0)
    return x, y
# Recursively enumerate the files below a directory.
def dirlist(path, allfile):
    """Collect the paths of all non-directory entries below *path*.

    Directories are descended into recursively; everything else is appended
    to *allfile* as ``os.path.join(parent, name)``.

    Args:
        path: Directory to scan.
        allfile: List that receives the discovered paths. It is mutated in
            place so that recursive calls share one accumulator.

    Returns:
        The same ``allfile`` list object that was passed in.
    """
    for filename in os.listdir(path):
        filepath = os.path.join(path, filename)
        if os.path.isdir(filepath):
            dirlist(filepath, allfile)
        else:
            allfile.append(filepath)
    return allfile
# Select the Java Meterpreter attack traces from the attack data set.
def load_adfa_java_files(rootdir):
    """Load the Java Meterpreter attack samples found below *rootdir*.

    A file is selected when its parent directory is named
    ``Java_Meterpreter_<digits>`` and its own name starts with
    ``UAD-Java-Meterpreter``. The filter is derived from each file's own
    path components rather than from a hard-coded absolute path, so the
    function works wherever the data set is stored.

    Args:
        rootdir: Root directory of the attack data; scanned recursively
            via ``dirlist``.

    Returns:
        A tuple ``(x, y)`` where ``x`` is the list of samples returned by
        ``load_one_flle`` and ``y`` is a list of ``1`` labels of the same
        length.
    """
    # Compiled once, outside the loop; '$' makes the match cover the whole name.
    attack_dir = re.compile(r"Java_Meterpreter_\d+$")
    x = []
    y = []
    for path in dirlist(rootdir, []):
        parent_name = os.path.basename(os.path.dirname(path))
        file_name = os.path.basename(path)
        if attack_dir.match(parent_name) and file_name.startswith("UAD-Java-Meterpreter"):
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("Load file(%s)" % path)
            x.append(load_one_flle(path))
            y.append(1)
    return x, y
if __name__ == '__main__':
    # Label 0: normal traces; label 1: Java Meterpreter attack traces.
    x1, y1 = load_adfa_training_files("/Users/zhanglipeng/Data/ADFA-LD/Training_Data_Master/")
    x2, y2 = load_adfa_java_files("/Users/zhanglipeng/Data/ADFA-LD/Attack_Data_Master/")
    x = x1 + x2
    y = y1 + y2

    # Turn every trace (one whitespace-separated string) into token-count
    # features.
    # NOTE(review): with its default token_pattern CountVectorizer only keeps
    # tokens of two or more characters, so single-character tokens in a trace
    # are presumably dropped here - confirm this is intended.
    vectorizer = CountVectorizer(min_df=1)
    x = vectorizer.fit_transform(x)
    x = x.toarray()

    # C is the inverse of the regularization strength, so a large value such
    # as 1e5 means very weak regularization.
    logreg = linear_model.LogisticRegression(C=1e5)

    # 10-fold cross-validation on all available cores; report the mean score.
    score = cross_val_score(logreg, x, y, n_jobs=-1, cv=10)
    print(np.mean(score))
正确率:0.9520895505516369,约为95%。
2.识别验证码
# -*- coding:utf-8 -*-
import re
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as pltimport
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
import os
from sklearn.naive_bayes import GaussianNB
from sklearn import linear_model, datasets
import pickle
import gzip
def load_data(path='/Users/zhanglipeng/Data/MNIST/mnist.pkl.gz'):
    """Load the three data splits stored in a gzip-compressed pickle.

    Args:
        path: Location of the ``.pkl.gz`` file. Defaults to the path used by
            the original script, so existing ``load_data()`` calls keep
            working.

    Returns:
        A tuple ``(training_data, valid_data, test_data)`` exactly as stored
        in the pickle.
    """
    # SECURITY: unpickling can execute arbitrary code - only load files that
    # come from a trusted source.
    # NOTE(review): if this file was pickled under Python 2 and is read under
    # Python 3, pickle.load() presumably needs encoding='latin1' - confirm
    # before porting.
    with gzip.open(path, 'rb') as fp:
        training_data, valid_data, test_data = pickle.load(fp)
    return training_data, valid_data, test_data
if __name__ == '__main__':
    # The validation split is loaded but not used by this script.
    training_data, valid_data, test_data = load_data()
    x1, y1 = training_data
    x2, y2 = test_data

    # C is the inverse of the regularization strength, so a large value such
    # as 1e5 means very weak regularization.
    logreg = linear_model.LogisticRegression(C=1e5)
    logreg.fit(x1, y1)

    # Report the accuracy of the model fitted above on the held-out test
    # split. The previous cross_val_score(logreg, x2, y2) call cloned the
    # estimator and refitted it on folds of the test split, so the model
    # trained on x1/y1 was never the one being evaluated.
    print(logreg.score(x2, y2))