前言
做图像分类标注时,可以先人工标注一部分数据,并计算这些图像的特征信息(见《【代码篇】图像预处理阶段:提取图像特征》);再利用这些特征信息和分类标签训练一个决策树模型;最后用该决策树模型对剩下的数据进行分类,从而减少人工标注的工作量。
训练决策树模型
import os
import pandas as pd
from sklearn import tree
import graphviz
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals import joblib
import utils
# Training script: fit a shallow decision tree on hand-labelled image features,
# plot per-feature histograms for each class, then persist the model and a
# Graphviz rendering of the tree.
root_dir = "../Image-Downloader-master/download_images/gan/emoji_combine"
excel_path = "img_combine_info_type.xls"
# The first spreadsheet column becomes the row index; the sheet must provide
# a "type" label column plus every feature column listed below.
data = pd.read_excel(excel_path, index_col=0)
columns = ["size", "area", "gradient", "si", "niqe", "colorful"]
# Class label -> colour handed to utils.draw_hist for that class.
color = {"lr": "red", "hr": "blue", "other": "yellow"}

# Split the labelled rows by class once. The subset does not depend on the
# feature being plotted, so it is computed (and printed for inspection)
# outside the per-feature loop.
data_by_type = {}
for type_now in color:
    data_type = data[data["type"] == type_now].drop("type", axis=1)
    print(type_now, data_type)
    data_by_type[type_now] = data_type

# NOTE(review): the original indentation was lost; plt.show() is assumed to
# run once per feature, after the histograms of all classes were drawn.
for column in columns:
    for type_now, data_type in data_by_type.items():
        if data_type.empty:
            # No labelled samples of this class: nothing to draw.
            continue
        utils.draw_hist(data_type[column], type_now, column, color[type_now])
    plt.show()

# sample_weight = {"hr": 4, "lr": 2, "other": 1}
# NOTE(review): the inference script drops rows with missing values before
# predicting; confirm the training sheet contains none, since nothing is
# dropped here before fitting.
clf = DecisionTreeClassifier(max_depth=3)  # initialise; depth capped at 3
clf = clf.fit(data[columns], data["type"])  # fit features -> class label
# NOTE(review): `sklearn.externals.joblib` (imported at the top of the file)
# was removed in scikit-learn 0.23; newer versions need `import joblib`.
joblib.dump(clf, 'clf.model')

# Make the Graphviz executables discoverable (hard-coded Windows install path).
os.environ["PATH"] += os.pathsep + 'd:/Program Files/Graphviz/bin/'
dot_data = tree.export_graphviz(clf, out_file=None, feature_names=columns)
graph = graphviz.Source(dot_data)
graph.render("DecisionTree")
推理决策树模型
import os
import cv2
import shutil
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.tree import DecisionTreeClassifier
import utils
def move_data(data, root_dir, output_dir):
    """Copy every file named in ``data.index`` from ``root_dir`` to ``output_dir``.

    Despite the name, files are copied with ``shutil.copy``; the originals
    stay in ``root_dir``.

    Args:
        data: Object whose ``index`` yields file names relative to
            ``root_dir`` (the callers pass a pandas DataFrame).
        root_dir: Directory that holds the source files.
        output_dir: Destination directory; created if it does not exist.

    Raises:
        FileNotFoundError: If a file named in the index is missing from
            ``root_dir``.
    """
    # Ensure the destination exists so the first copy cannot fail on it.
    os.makedirs(output_dir, exist_ok=True)
    for file_name in data.index:
        print(file_name)
        file_path = os.path.join(root_dir, file_name)
        output_path = os.path.join(output_dir, file_name)
        shutil.copy(file_path, output_path)
# Inference script: classify the images with the saved decision tree and copy
# each one into the folder of its predicted class.
root_dir = "../Image-Downloader-master/download_images/gan/emoji_combine"
hr_dir = "../Image-Downloader-master/download_images/gan/hr"
lr_dir = "../Image-Downloader-master/download_images/gan/lr"
other_dir = "../Image-Downloader-master/download_images/gan/other"
# presumably utils.check_dir prepares (creates) each output folder — confirm in utils
for target_dir in (hr_dir, lr_dir, other_dir):
    utils.check_dir(target_dir)

model = joblib.load('clf.model')
excel_path = "img_combine_info.xls"
# The first spreadsheet column becomes the row index; move_data later uses
# that index as the file names to copy.
data = pd.read_excel(excel_path, index_col=0)
# Discard rows with a missing value in any column before predicting.
data.dropna(inplace=True)
columns = ["size", "area", "gradient", "si", "niqe", "colorful"]
data["type"] = model.predict(data[columns])

hr = data[data["type"] == "hr"]
lr = data[data["type"] == "lr"]
other = data[data["type"] == "other"]
# Hand-written threshold alternative to the model (disabled):
# hr = data[(data["gradient"] <= 91.5) | ((163 < data["gradient"]) & (data["gradient"] <= 183))]
# Keep only "hr" rows whose width and height are both at least 256. Rows
# predicted "hr" that fail this check are not copied to any folder.
hr = hr[(hr["width"] >= 256) & (hr["height"] >= 256)]
# other = data[(91.5 < data["gradient"]) & (data["gradient"] <= 163) & ((72.5 < data["si"]) & (data["si"] <= 82.5))]
# lr = data.drop(hr.index).drop(other.index)

for subset, target_dir in ((hr, hr_dir), (lr, lr_dir), (other, other_dir)):
    move_data(subset, root_dir, target_dir)