import random
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import tarfile
from six.moves import urllib
# frameworks for ML
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
# transformers for category variables
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# transformers for numerical variables
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Imputer
# transformers for combined variables
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
# user-defined transformers
from sklearn.preprocessing import FunctionTransformer
# classification models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# Read the tab-separated training table from its hard-coded local path.
data = pd.read_csv(
    "C:/Users/july/Desktop/contest_basic_train.tsv",
    sep="\t",
)
# Keep an independent copy of the target column "Y".
data_label = data["Y"].copy()
# Feature frame: every column except the ones listed here (target included).
data_se = data.drop(
    ["REPORT_ID", "ID_CARD", "LOAN_DATE", "AGENT", "WORK_PROVINCE", "Y"],
    axis="columns",
)
def set_data(df):
    """Fill missing ``HAS_FUND`` and ``EDU_LEVEL`` values with fixed defaults.

    The frame is modified in place and the same object is returned, so
    callers may use either the argument or the return value.

    Args:
        df: DataFrame containing ``HAS_FUND`` and ``EDU_LEVEL`` columns.

    Returns:
        The same DataFrame object, with nulls in ``HAS_FUND`` replaced by
        ``0`` and nulls in ``EDU_LEVEL`` replaced by ``"专科"``.

    Raises:
        KeyError: If either column is absent from ``df``.
    """
    # One mapping keeps the per-column defaults in a single place instead of
    # repeating the same .loc statement for every column.
    defaults = {"HAS_FUND": 0, "EDU_LEVEL": "专科"}
    for column, fill_value in defaults.items():
        missing = df[column].isnull()
        df.loc[missing, column] = fill_value
    return df
# Fill the missing HAS_FUND / EDU_LEVEL entries; set_data returns the frame
# it was given.
data_set = set_data(data_se)

# Stand-alone mean imputer; it is not referenced again in the visible part
# of this script (the numeric pipeline builds its own instance).
imputer = Imputer(strategy="mean")

# Split the frame: every column but SALARY goes to data_cat, and SALARY is
# kept on its own as a copied Series.
data_cat = data_set.drop(["SALARY"], axis="columns")
data_num = data_set["SALARY"].copy()
class dataframese(BaseEstimator, TransformerMixin):
    """Pipeline step that selects DataFrame columns and returns their values.

    Args:
        attribute_names: Column label(s) used to index the incoming frame
            in :meth:`transform`.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, x, y=None):
        """Do nothing and return ``self``; the selector has no state to learn."""
        return self

    def transform(self, x):
        """Return ``x[self.attribute_names].values`` (the selected columns' values)."""
        return x[self.attribute_names].values
from categoricalencoder import CategoricalEncoder

# Column groups routed to each branch of the preprocessing union.
num = ["SALARY"]
cat = data_cat.columns.tolist()

# SALARY branch: select the column, mean-impute, then dense one-hot encode.
# NOTE(review): one-hot encoding the imputed SALARY values is unusual for a
# numeric column -- confirm this is intended rather than a scaler.
num_pipeline = Pipeline(steps=[
    ("selector", dataframese(attribute_names=num)),
    ("imputer", Imputer(strategy="mean")),
    ("cat_encoder", CategoricalEncoder(encoding="onehot-dense")),
])

# Remaining columns: select them, then dense one-hot encode.
cat_pipeline = Pipeline(steps=[
    ("selector", dataframese(attribute_names=cat)),
    ("cat_encoder", CategoricalEncoder(encoding="onehot-dense")),
])

# Run both branches and concatenate their outputs.
full_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

# Fit the whole preprocessing union on the cleaned frame and transform it.
data_prepared = full_pipeline.fit_transform(data_set)
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

# Fit a default decision-tree regressor on the prepared features
# (fit returns the estimator itself).
tree_reg = DecisionTreeRegressor().fit(data_prepared, data_label)

# Training-set error: predictions are made on the same rows the tree was
# fitted on.
data_predictions = tree_reg.predict(data_prepared)
tree_mse = mean_squared_error(data_label, data_predictions)
tree_rmse = np.sqrt(tree_mse)

# 10-fold cross-validation; the scorer yields negated MSE, so flip the sign
# before taking the root.
scores = cross_val_score(
    estimator=tree_reg,
    X=data_prepared,
    y=data_label,
    scoring="neg_mean_squared_error",
    cv=10,
)
rmse_scores = np.sqrt(np.negative(scores))
def display_scores(scores):
    """Print a collection of scores with their mean and standard deviation.

    Args:
        scores: Object exposing ``mean()`` and ``std()`` methods, e.g. the
            NumPy array returned by ``cross_val_score``.
    """
    print("scores:", scores)
    # mean must be *called*; the bare attribute would print the bound
    # method's repr instead of the number.
    print("mean:", scores.mean())
    print("standard deviation:", scores.std())
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

# Fit a default random-forest classifier on the prepared features
# (fit returns the estimator itself).
forest_cla = RandomForestClassifier().fit(data_prepared, data_label)

# Predictions on the same rows the forest was fitted on.
data_predictions2 = forest_cla.predict(data_prepared)
# Despite its name, forest_mse holds the confusion matrix of those
# predictions, not a mean squared error.
forest_mse = confusion_matrix(data_label, data_predictions2)

# 10-fold cross-validated accuracy.
forest_scores = cross_val_score(
    estimator=forest_cla,
    X=data_prepared,
    y=data_label,
    scoring="accuracy",
    cv=10,
)
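# 东证期货比赛代码 (Orient Securities Futures competition code)
# 转载自 (reposted from) blog.csdn.net/weixin_41908529/article/details/81342336
# Blog-page navigation residue, not part of the script: 猜你喜欢 / 今日推荐 / 周排行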