Building Decision Tree and SVM Models (on a Financial Dataset)

Decision tree and SVM models built from a financial dataset.

# Import the required packages
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, f1_score
from matplotlib import pyplot as plt
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

# The file is not UTF-8 encoded; read it with gbk encoding, otherwise reading fails
data = pd.read_csv('./data.csv', index_col=0, encoding='gbk')


# Extract the y label column separately; the remaining 88 columns become X
y = data['status']
X = data.drop('status', axis=1)
# Shape of X and the class distribution of y
print('X.shape:', X.shape)
print('Distribution of y\n', y.value_counts())
X.shape: (4754, 88)
Distribution of y
 0    3561
1    1193
Name: status, dtype: int64
# First drop some obviously useless features: id_name, custid, trade_no, bank_card_no
X.drop(['id_name', 'custid', 'trade_no', 'bank_card_no'], axis=1, inplace=True)
print(X.shape)
(4754, 84)
# Select the numeric features
X_num = X.select_dtypes('number').copy()
print(X_num.shape)
type(X_num.mean())
(4754, 80)
pandas.core.series.Series
# Fill missing values with the column means
X_num.fillna(X_num.mean(), inplace=True)
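For reference, the same mean imputation can also be done with scikit-learn's SimpleImputer (available in recent versions), which stores the fitted means so they can be reapplied to new data later. A minimal sketch, not part of the original run:

# Sketch: mean imputation via SimpleImputer (equivalent to the fillna above)
from sklearn.impute import SimpleImputer

imp = SimpleImputer(strategy='mean')
X_num_imp = pd.DataFrame(imp.fit_transform(X_num),
                         columns=X_num.columns, index=X_num.index)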

# Examine the non-numeric variables
X_str = X.select_dtypes(exclude='number').copy()
X_str.describe()
        reg_preference_for_trad  source  latest_query_time  loans_latest_time
count   4752                     4754    4450               4457
unique  5                        1       207                232
top     一线城市                   xs      2018-04-14         2018-05-03
freq    3403                     4754    423                134
# Replace reg_preference with dummy variables; the other three string columns are left out of the final feature matrix
X_str['reg_preference_for_trad'] = X_str['reg_preference_for_trad'].fillna(X_str['reg_preference_for_trad'].mode()[0])
X_str_dummy = pd.get_dummies(X_str['reg_preference_for_trad'])
X_str_dummy.head()
#X_str.drop(['latest_query_time'],axis=1,inplace=True)
    一线城市  三线城市  二线城市  其他城市  境外
5      1        0        0        0       0
10     1        0        0        0       0
12     1        0        0        0       0
13     0        1        0        0       0
14     1        0        0        0       0
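A side note on the dummy encoding: pd.get_dummies builds columns from whatever categories happen to appear, so if the encoding were done after the train/test split the two sides could end up with different columns. scikit-learn's OneHotEncoder learns the category-to-column mapping explicitly and can reapply it to new data; a rough sketch (exact arguments depend on the scikit-learn version):

# Sketch: the same dummy encoding with OneHotEncoder, reusable on new data via
# transform(); categories unseen at fit time are simply ignored
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
reg_dummy = ohe.fit_transform(X_str[['reg_preference_for_trad']])  # (4754, 5) array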
X_cl = pd.concat([X_num, X_str_dummy], axis=1, sort=False)
X_cl.shape

(4754, 85)
# Split into training and test sets with a 70/30 ratio
random_state = 1115
X_train, X_test, y_train, y_test = train_test_split(X_cl, y, test_size=0.3, random_state=random_state)
print(X_train.shape)
print(X_test.shape)

(3327, 85)
(1427, 85)
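Given the class imbalance seen earlier (3561 negatives vs. 1193 positives), it may be worth stratifying the split so both sets keep the same class ratio; a minimal variation of the call above:

# Stratified variant of the same 70/30 split (keeps the 0/1 ratio in both sets)
X_train, X_test, y_train, y_test = train_test_split(
    X_cl, y, test_size=0.3, random_state=random_state, stratify=y)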
# SVC model. I don't understand why both SVC and LinearSVC predict an F1 score of 0.
# Is it just because the data was not normalized during preprocessing?
"""
svc = SVC(C=1.0, kernel='rbf', gamma=0.1)
svc.fit(X_train, y_train)

# LinearSVC model
Lin_SVC = LinearSVC()
Lin_SVC.fit(X_train, y_train)
"""
# Decision tree model

clf = DecisionTreeClassifier(max_depth=4)
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
# Evaluation
# F1 score and ROC AUC
"""
y_train_pred = svc.predict(X_train)
y_test_pred = svc.predict(X_test)
"""
"""
#lin_svc
y_train_pred = Lin_SVC.predict(X_train)
y_test_pred = Lin_SVC.predict(X_test)
"""
# Decision tree

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

print('F1 score:')
print('Training set: {:.4f}'.format(f1_score(y_train, y_train_pred)))
print('Test set: {:.4f}'.format(f1_score(y_test, y_test_pred)))
print('ROC AUC:')
print('Training set: {:.4f}'.format(roc_auc_score(y_train, y_train_pred)))
print('Test set: {:.4f}'.format(roc_auc_score(y_test, y_test_pred)))

F1 score:
Training set: 0.4083
Test set: 0.3992
ROC AUC:
Training set: 0.6227
Test set: 0.6166
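One caveat about the numbers above: roc_auc_score is being fed the hard 0/1 predictions. Passing the predicted probability of the positive class usually gives a more meaningful AUC; a small sketch for the same decision tree:

# ROC AUC from class-1 probabilities rather than hard labels
y_train_proba = clf.predict_proba(X_train)[:, 1]
y_test_proba = clf.predict_proba(X_test)[:, 1]
print('ROC AUC (probabilities):')
print('Training set: {:.4f}'.format(roc_auc_score(y_train, y_train_proba)))
print('Test set: {:.4f}'.format(roc_auc_score(y_test, y_test_proba)))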

Question:
I don't understand why the SVM and linear SVM predictions have an F1 score of 0. Is it only because the data was not standardized?
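Most likely, yes. The features here span very different scales, and without standardization an RBF-kernel SVC (and often LinearSVC as well) tends to collapse to predicting only the majority class, which makes the positive-class F1 exactly 0. A sketch of what scaling first could look like, reusing the same SVC parameters as above inside a Pipeline (illustrative, not tuned):

# Sketch: standardize the features before the SVM inside a single Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

svc_scaled = make_pipeline(StandardScaler(), SVC(C=1.0, kernel='rbf', gamma=0.1))
svc_scaled.fit(X_train, y_train)
print('Test F1 with scaling: {:.4f}'.format(f1_score(y_test, svc_scaled.predict(X_test))))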

Reposted from blog.csdn.net/qq_41205464/article/details/84169197