1.数据搜集
加载KDD 99中的数据:
def load_kdd99(filename):
    """Load a KDD 99 data file into a list of rows.

    Every line is stripped of newline characters and split on commas, so
    each row is a list of strings; no type conversion is performed here.

    Args:
        filename: Path of the comma-separated data file to read.

    Returns:
        A list with one entry per line of the file, each entry being the
        list of comma-separated string fields of that line. An empty file
        yields an empty list.
    """
    with open(filename) as f:
        # Iterate the file object line by line instead of reading it whole.
        return [line.strip('\n').split(',') for line in f]
筛选标记为guess_passwd和normal并且是POP3协议的数据:
# Keep only rows whose field 2 is 'pop_3' and whose field 41 (the label) is
# either the password-guessing attack or normal traffic. Both label strings
# must carry the trailing '.', consistent with 'normal.' and with the inner
# comparison; without it the attack rows were never selected and y only ever
# received 0.
if (x1[41] in ['guess_passwd.', 'normal.']) and (x1[2] == 'pop_3'):
    # Binary target: 1 for a guess_passwd attack, 0 for normal traffic.
    if x1[41] == 'guess_passwd.':
        y.append(1)
    else:
        y.append(0)
2.特征化
# Reduce the row to 13 fields: field 0, fields 4-7 and fields 22-29.
# NOTE(review): per the KDD 99 feature list these are presumably duration,
# the basic byte/fragment counters and the time-based traffic statistics --
# confirm against the dataset description.
x1 = [x1[0]]+x1[4:8]+x1[22:30]
# Collect the reduced row (still a list of strings) for later conversion.
v.append(x1)
# Convert every selected row from strings to floats, appending one numeric
# feature vector to w per row of v (row order is preserved).
for row in v:
    w.append([float(field) for field in row])
3.训练样本
# Create a decision-tree classifier with the library's default settings;
# it is not fitted to any data at this point.
clf=tree.DecisionTreeClassifier()
4.效果验证
# Print the 10 per-fold scores of a 10-fold cross-validation of clf on
# (x, y); n_jobs=-1 asks scikit-learn to use all available CPU cores.
# The parenthesised single-argument form prints the same output on Python 2
# and is also valid syntax on Python 3.
# NOTE: the `cross_validation` module was removed in scikit-learn 0.20; on
# newer versions the same function lives in `sklearn.model_selection`.
print(cross_validation.cross_val_score(clf, x, y, n_jobs=-1, cv=10))
5.可视化决策树
# cross_val_score fits clones of the estimator, so clf itself is still
# unfitted here; train it on the full data set before exporting the tree.
clf = clf.fit(x, y)
# With out_file=None the DOT description is returned as a string.
dot_data = tree.export_graphviz(clf, out_file=None)
# Render the DOT source with pydotplus and write it out as a PDF file.
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("...")