import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.manifold import TSNE
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
# Apply seaborn's 'darkgrid' theme to the figures drawn below.
sns.set_style('darkgrid')
# Make SimHei the sans-serif font; presumably so that the Chinese axis labels
# used by the plots in this script render correctly.
plt.rcParams['font.sans-serif']=['SimHei']
# Extract the permitted value range of every feature: each entry of
# data4['取值范围'] is split on '_' into its textual bounds.
standard = [each.split('_') for each in data4['取值范围']]

# Turn the textual bounds into Python values.
# NOTE(review): eval() executes arbitrary expressions, so this is only safe
# while data4 comes from a trusted source.
standard_val = []
for bounds in standard:
    standard_val.append(list(map(eval, bounds)))

# Map each tag in data4['位号'] to its parsed bounds (same row order).
standard_dic = dict(zip(data4['位号'], standard_val))

# For every tag, collect the positions of the samples in data1 that lie above
# the second bound or below the first bound.
standard_result = {}
for tag, limits in standard_dic.items():
    lower, upper = limits[0], limits[1]
    standard_result[tag] = [
        pos for pos, value in enumerate(data1[tag])
        if value > upper or value < lower
    ]

# Keep only the tags that have at least one out-of-range sample.
bad_features = {tag: rows for tag, rows in standard_result.items() if rows}
# Plot, per feature, the number of samples that fall outside its permitted
# range (bad_features maps a feature name to the positions of those samples).
bad_feature_names = list(bad_features.keys())
bad_feature_vals = [len(rows) for rows in bad_features.values()]
print(bad_feature_vals)

bad_features_total = pd.DataFrame({
    'Bad Features': bad_feature_names,
    'Number of bad features': bad_feature_vals,
})
bad_features_total_sort = bad_features_total.sort_values(
    by='Number of bad features', ascending=False)

plt.figure(figsize=(10, 8), dpi=100)
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally.
sns.barplot(x='Bad Features', y='Number of bad features',
            data=bad_features_total_sort)
plt.xticks(rotation=90)
# The figure title is left unset (its call was commented out in the original).
plt.xlabel('特征名', fontsize=14, fontweight='bold')
plt.ylabel('异常值数', fontsize=14, fontweight='bold')
plt.savefig('./异常值占比.jpg')
plt.show()
# Feature matrix: every column of data1_copy except 'RON损失\n(不是变量)'
# (presumably the prediction target; confirm against the data loader).
X = data1_copy.drop(columns='RON损失\n(不是变量)')
# 3.2 Find the features that contain many zero values
def missing_data(data):
    """Return a copy of *data* in which every value equal to 0 becomes NaN.

    Zero readings are turned into missing values so that the usual pandas
    null-handling tools (``isnull``, ``mean`` ...) can measure them.

    Args:
        data: Original data set, a ``pandas.DataFrame``.

    Returns:
        A new DataFrame with the same index and columns as *data* whose
        zero-valued cells are NaN. Columns that receive a NaN may change
        dtype (for example int to float). *data* itself is left unmodified.
    """
    # ``mask`` builds a new frame instead of writing through chained
    # ``frame[col].iloc[...] = nan`` assignments, which either mutated the
    # caller's frame or silently did nothing depending on the pandas version.
    return data.mask(data == 0)
if __name__ == "__main__":
    # NOTE(review): the original indentation was lost, so it is unclear how
    # much of this section belonged under the guard; all of it is kept here
    # because every statement depends on ``Data``.
    Data = missing_data(data1)
    print('Data Size:{}'.format(Data.shape))
    print('----------------------------------------------------------------------------------------------')

    # Share of missing (originally zero) values per feature, largest first.
    missing_share = Data.isnull().mean().sort_values(ascending=False)
    # NOTE(review): 33 rows are printed but only 32 are plotted below; this
    # mismatch is kept from the original.
    print('Missing proportion:\n', missing_share.head(33))
    Missing_proportion = pd.DataFrame({'Proportion': missing_share.head(32)})

    plt.figure(figsize=(12, 8), dpi=100)
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # x and y are passed by keyword: seaborn 0.12+ no longer accepts them
    # positionally.
    sns.barplot(x=Missing_proportion.index, y=Missing_proportion.Proportion)
    # The figure title is left unset (its call was commented out in the
    # original).
    plt.xlabel('特征名', fontsize=12, fontweight='bold')
    plt.ylabel('比例', fontsize=12, fontweight='bold')
    plt.xticks(rotation=90, fontsize=8)
    plt.savefig('./Missing proportion.jpg')
    plt.show()
# 3.3 Drop the features with more than 10% zero values and the features with more than 5 out-of-range samples
# Rebuild the per-feature count of out-of-range samples, largest first.
bad_feature_names = list(bad_features.keys())
bad_feature_vals = [len(rows) for rows in bad_features.values()]
bad_features_total = pd.DataFrame({
    'Bad Features': bad_feature_names,
    'Number of bad features': bad_feature_vals,
})
bad_features_total_sort = bad_features_total.sort_values(
    by='Number of bad features', ascending=False)

# Among the features listed in Missing_proportion, collect those whose
# permitted range starts at 0 or spans from a negative lower bound up to a
# non-negative upper bound.
missing_name = list(Missing_proportion.index)
missing_standard = []
for name in missing_name:
    lower, upper = standard_dic[name][0], standard_dic[name][1]
    if lower == 0 or (lower < 0 and upper >= 0):
        missing_standard.append(name)

# Features whose share of zero values exceeds 10%.
too_many_zeros = Missing_proportion['Proportion'] > 0.1
miss_list = list(Missing_proportion.loc[too_many_zeros].index)

# Features with more than 5 samples outside the permitted range.
too_many_outliers = bad_features_total_sort['Number of bad features'] > 5
outlinear = list(bad_features_total_sort.loc[too_many_outliers]['Bad Features'])

# Drop the union of both groups from the feature matrix.
delete_feature = set(miss_list) | set(outlinear)
X_drop = X.drop(delete_feature, axis=1)
# Fit an RBF-kernel SVR for each candidate value of C on
# (X_train_2, y_train_2) and record the mean squared error of its
# predictions on (X_test_2, y_test_2).
C = [
    0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1,
    1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 2,
]
mse_list = []
for c in C:
    # fit() returns the estimator itself, so construction and training chain.
    svr = SVR(kernel='rbf', C=c).fit(X_train_2, y_train_2)
    pred = svr.predict(X_test_2)
    mse_list.append(mean_squared_error(y_test_2, pred))

# Plot the error as a function of C.
plt.plot(C, mse_list)
plt.show()