class deals_abnormal_by_3delta():
    '''
    Detect and handle outliers column by column using quantile fences.

    NOTE: despite the class name ("3 delta"), the bounds are not computed
    from the standard deviation.  For every column they are
        val_low  = Q1 - scale * (Q3 - Q1)
        val_high = Q3 + scale * (Q3 - Q1)
    where Q1 / Q3 are the 0.25 / 0.75 quantiles of that column.

    Attributes:
        data: copy of the DataFrame passed to the constructor (made with
            ``DataFrame.copy()``); the caller's frame is never modified.
        scale: multiplier applied to the inter-quartile range (default 3).
        val_low: per-column lower bound (Series indexed by column name).
        val_high: per-column upper bound (Series indexed by column name).
        operate_columns: list of the column names of ``data``.
        high_index: dict ``column -> Index`` of row labels whose value is
            above ``val_high``; columns without such rows are absent.
        low_index: dict ``column -> Index`` of row labels whose value is
            below ``val_low``; columns without such rows are absent.
        all_abnormal_index: list of the distinct row labels flagged in any
            column, in first-seen order (high violations first).

    Methods:
        remove_all_abnormal: drop every row flagged in any column.
        remove_specific_column: drop the rows flagged for one column.
        replace_all_abnormal_with_boundary: replace flagged values with
            the bound they violated.
        replace_all_abnormal_with_nan: replace flagged values with NaN.
    '''

    def __init__(self, data, scale=3):
        '''
        Compute the bounds and record which rows violate them.

        Args:
            data: DataFrame to inspect.  Every column is passed to
                ``quantile``, so the frame is expected to contain only the
                (numeric) columns that should be checked.
            scale: multiplier applied to the inter-quartile range.
        '''
        self.data = data.copy()
        self.scale = scale

        lower_quartile = self.data.quantile(0.25)
        upper_quartile = self.data.quantile(0.75)
        margin = (upper_quartile - lower_quartile) * self.scale
        self.val_low = lower_quartile - margin
        self.val_high = upper_quartile + margin

        self.operate_columns = list(self.data.columns)
        self.high_index = {}
        self.low_index = {}
        for col in self.operate_columns:
            series = self.data[col]
            above = series[series > self.val_high[col]].index
            below = series[series < self.val_low[col]].index
            # Only columns that really contain violations get an entry.
            if len(above) > 0:
                self.high_index[col] = above
            if len(below) > 0:
                self.low_index[col] = below

        # dict.fromkeys de-duplicates while keeping a deterministic order.
        distinct_labels = {}
        for flagged in (self.high_index, self.low_index):
            for labels in flagged.values():
                distinct_labels.update(dict.fromkeys(labels))
        self.all_abnormal_index = list(distinct_labels)

    def remove_all_abnormal(self):
        '''Return a new DataFrame without any row flagged in any column.'''
        return self.data.drop(self.all_abnormal_index, inplace=False)

    def remove_specific_column(self, col):
        '''
        Return a new DataFrame without the rows flagged for ``col``.

        Args:
            col: name of one of the inspected columns.

        Raises:
            KeyError: if ``col`` is not a column of ``data``.
        '''
        if col not in self.operate_columns:
            raise KeyError(col)
        # A column may have violations on one side only, or none at all,
        # in which case it has no entry in the corresponding dict.
        labels = []
        for flagged in (self.high_index, self.low_index):
            if col in flagged:
                labels.extend(flagged[col])
        return self.data.drop(labels, inplace=False)

    def _replace_abnormal(self, use_boundary):
        '''
        Return a copy of ``data`` with every flagged value replaced.

        Args:
            use_boundary: when true a flagged value becomes the bound it
                violated; otherwise it becomes NaN.
        '''
        result = self.data.copy()
        row_labels = result.index
        sides = ((self.high_index, self.val_high),
                 (self.low_index, self.val_low))
        for col in self.operate_columns:
            for flagged, bounds in sides:
                if col not in flagged:
                    continue
                hit = row_labels.isin(flagged[col])
                # Whole-column mask instead of chained ``result[col][row]``
                # assignment, which may write to a temporary copy.
                if use_boundary:
                    result[col] = result[col].mask(hit, bounds[col])
                else:
                    result[col] = result[col].mask(hit)
        return result

    def replace_all_abnormal_with_boundary(self):
        '''Return a new DataFrame with flagged values set to the violated bound.'''
        return self._replace_abnormal(True)

    def replace_all_abnormal_with_nan(self):
        '''Return a new DataFrame with flagged values set to NaN.'''
        return self._replace_abnormal(False)
3delta法处理异常值
猜你喜欢
转载自blog.csdn.net/weixin_44414593/article/details/107298501
今日推荐
周排行