【背景】
为什么要自己手写YOLO目标识别的图像后处理部分呢?因为我们使用了一款自研的FPGA开发板,并在其上开发部署了YOLO目标识别加速计算电路,作为PS端开发的我,需要把输入图像送到指定DDR地址,开启FPGA加速计算寄存器,等待计算结果发起中断,取回目标识别结果原始数据,经过后处理操作对其进行还原,并将结果标注到原始图像上。
【干货】
话不多说,直接上代码!
通过下面这份代码,你至少应该学会以下几个操作:
1、学会如何从文件夹里读取数据集图片
2、学会如何对FPGA结果进行反量化、解码、非极大值抑制(NMS)等基本操作
3、学会如何将识别结果标注,保存
4、学会通过操作寄存器控制FPGA的运转
// Shared scratch buffer for the label text rendered in draw_objects() (file-local, not thread-safe).
static char text[256];
// Floats per anchor prediction: 4 box terms + 1 objectness + kind_nums class scores.
static int length = 5+5;
// Number of object classes (must match the label_name array below).
static int kind_nums = 5;
// Anchor (w, h) pairs, in grid-cell units, for output layer 0 (11x15 grid).
static float anchors_0[6] = {2.53125f, 2.5625f, 4.21875f, 5.28125f, 10.75f, 9.96875f};
// Anchor (w, h) pairs, in grid-cell units, for output layer 1 (22x30 grid).
static float anchors_1[6] = {1.4375f, 1.6875f, 2.3125f, 3.625f, 5.0625f, 5.125f};
// Minimum objectness * class probability for a detection to be kept.
static float conf_thresh_ = 0.5f;
// IoU threshold used by nms() to suppress overlapping boxes of the same class.
static float nms_thresh_ = 0.4f;
// Class index -> human-readable label.
static std::string label_name[] = {"person", "aeroplane", "bus", "car", "truck"};
bool PedestrianDetection::get_files_list()
{
struct dirent *ptr;
DIR *dir;
std::string PATH = dataset_dir_;
dir = opendir(PATH.c_str());
while((ptr=readdir(dir)) != nullptr)
{
if(ptr->d_name[0] == '.')
continue;
files_list.push_back(ptr->d_name);
}
for(auto iter : files_list)
{
printf("%s\n", iter.c_str());
}
closedir(dir);
if(files_list.size() == dataset_size)
{
return true;
}
else {
printf("[PedestrianDetection] : Error to search dataset %ld \n", files_list.size());
return false;
}
}
// Configures tensor geometry for the two YOLOv4-tiny output layers:
// grid sizes, total cell count, and the dequantized/raw buffer sizes.
// Always returns true. (pkg_path is currently unused.)
bool PedestrianDetection::init_net_info(std::string pkg_path)
{
    yolov4tn = 2;
    outputchannel = 30; // class number change
    yolov4tnsize = {11, 15, 22, 30}; // class number change
    // Cells per layer: rows * cols of each output grid.
    const int cells_layer0 = yolov4tnsize[0] * yolov4tnsize[1];
    const int cells_layer1 = yolov4tnsize[2] * yolov4tnsize[3];
    yolov4tnsize_sum = cells_layer0 + cells_layer1;
    // Dequantized floats vs. raw bytes (each cell carries 2 padding bytes).
    accData_size = yolov4tnsize_sum * outputchannel;
    accRaw_size = yolov4tnsize_sum * (outputchannel + 2);
    return true;
}
// One-time FPGA setup: fixes the accelerator's input image geometry and
// registers the physical DDR addresses/lengths of its two output tensors.
// Exits the process if the output mapping cannot be initialized.
void PedestrianDetection::init_fpga()
{
// Accelerator input resolution (width x height x channels), before the
// 1-pixel top/left border added during preprocessing.
raw_img_w = 480;
raw_img_h = 352;
raw_img_c = 3;
memset(&yolo_v4, 0, sizeof(struct Model_Output));
// Output tensor 0: 11x15 grid, 30 data bytes + 2 padding bytes per cell.
yolo_v4.addr[0] = 0x4491F800;
yolo_v4.length[0] = 11 * 15 * (30+2);
// Output tensor 1: 22x30 grid, same per-cell layout.
yolo_v4.addr[1] = 0x44921000;
yolo_v4.length[1] = 22 * 30 * (30+2);
if(init_model_output(&yolo_v4, fd_mem) == -1)
{
printf("[yolov4] failed to init model output \n");
exit(-1);
}
}
// Producer thread: acquires frames (from the network camera when WITH_NETCAM
// is defined, otherwise from a ZeroMQ subscription), preprocesses them to the
// accelerator's input size, and queues them in ready_buf_ for test_fpga().
void PedestrianDetection::read_dataset()
{
while(nh_private.ok())
{
// Back-pressure: cap the pending-input queue at 50 frames.
while(ready_buf_.size() > 50)
{
std::this_thread::sleep_for(std::chrono::milliseconds(5));
}
#ifdef WITH_NETCAM
cv::Mat origin;
struct timeval timestamp;
if(netcam.getCameraImage(origin, timestamp, cv::Size(1920, 1080)))
{
// Split the 1920x1080 frame into a 2x2 grid of tiles.
std::vector<cv::Mat> pics;
cut_img(origin, 2, 2, pics);
for(int i = 0; i < pics.size(); i++)
{
if(i != (pics.size()-1))
{
// All tiles except the last: publish as JPEG over ZeroMQ, no detection.
std::vector<uint8_t> vec_data;
cv::imencode(".jpg", pics[i], vec_data);
std::string str_data;
str_data.assign(vec_data.begin(), vec_data.end());
zmqpp::message message;
message << "robot" + std::to_string(i+1) +"pic" << str_data;
socket_pubraw.send(message);
}
else {
// Last tile is fed to the accelerator: resize to the model input size,
// convert BGR->RGB, then add a 1-pixel top/left black border
// (matches the 481x353 image_size used in test_fpga()).
cv::Mat resize;
cv::resize(pics[i], resize, cv::Size(raw_img_w, raw_img_h), cv::INTER_CUBIC); // cv::INTER_CUBIC
cv::cvtColor(resize, resize, cv::COLOR_BGR2RGB);
cv::Mat padding;
cv::copyMakeBorder(resize, padding, 1, 0, 1, 0, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
rawImgFile tmp;
tmp.raw_img = pics[i];
tmp.pad_img = padding;
tmp.file_name = robot_id;
ready_buf_mutex_.lock();
ready_buf_.push(tmp);
ready_buf_mutex_.unlock();
}
}
}
else {
// No frame available yet; retry after 30 ms.
printf("camera no data\n");
usleep(30*1000);
continue;
}
#else
// Camera-less build: receive one JPEG frame over the ZeroMQ subscriber.
zmqpp::message message;
socket_sub.receive(message);
std::string topic;
std::string data_str;
message >> topic >> data_str;
std::vector<uint8_t> data_vec;
data_vec.assign(data_str.begin(), data_str.end());
cv::Mat img_decode;
img_decode = cv::imdecode(data_vec, CV_LOAD_IMAGE_COLOR);
// Same preprocessing as the camera path: resize, BGR->RGB, 1-pixel pad.
cv::Mat resize;
cv::resize(img_decode, resize, cv::Size(raw_img_w, raw_img_h), cv::INTER_CUBIC); // cv::INTER_CUBIC
cv::cvtColor(resize, resize, cv::COLOR_BGR2RGB);
cv::Mat padding;
cv::copyMakeBorder(resize, padding, 1, 0, 1, 0, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
rawImgFile tmp;
tmp.raw_img = img_decode;
tmp.pad_img = padding;
tmp.file_name = robot_id;
ready_buf_mutex_.lock();
ready_buf_.push(tmp);
ready_buf_mutex_.unlock();
#endif
}
}
// Worker thread: feeds padded frames to the FPGA accelerator, drives its
// control registers (ready / start / busy), then queues the raw INT8 output
// in result_buf_ for write_predict() to decode and free.
void PedestrianDetection::test_fpga()
{
    while (nh_private.ok())
    {
        // Wait (5 ms poll) until an input frame is available.
        while (nh_private.ok())
        {
            if (ready_buf_.empty())
            {
                std::this_thread::sleep_for(std::chrono::milliseconds(5));
            }
            else
            {
                break;
            }
        }
        rawImgFile tmp_input;
        ready_buf_mutex_.lock();
        tmp_input = ready_buf_.front();
        ready_buf_.pop();
        ready_buf_mutex_.unlock();
        // Buffer for the raw accelerator output; released by write_predict().
        // NOTE(review): plain `new` throws std::bad_alloc rather than returning
        // nullptr, so this check only fires under -fno-exceptions builds.
        tmp_input.acc.acc_out = new int8_t[accRaw_size];
        if (!tmp_input.acc.acc_out)
        {
            ROS_ERROR("Allocation of memory Failed \n");
            return;
        }
        // Copy the padded 481x353x3 image into the DDR window the FPGA reads.
        int image_size = 481 * 353 * 3;
        image_to_mem(tmp_input.pad_img.data, 0x46000000, fd_mem, image_size);
        // Map one page of the accelerator's control registers.
        void *mem_ctrl = mmap(nullptr, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd_mem, 0x0400000000L);
        if (mem_ctrl == MAP_FAILED)
        {
            // fix: the original never checked mmap and would memcpy through
            // MAP_FAILED ((void*)-1) on failure; also free the frame buffer.
            ROS_ERROR("mmap of FPGA control registers failed \n");
            delete[] tmp_input.acc.acc_out;
            tmp_input.acc.acc_out = nullptr;
            return;
        }
        // Spin until the ready register (byte offset 124) reads 1.
        int state = 0;
        while (state != 1)
        {
            memcpy(&state, (char *)mem_ctrl + 124, 4);
        }
        // Pulse the start register (offset 0): write 1, read back, write 0, read back.
        int res = -1;
        state = 1;
        memcpy((char *)mem_ctrl, &state, 4);
        memcpy(&res, (char *)mem_ctrl, 4);
        state = 0;
        memcpy((char *)mem_ctrl, &state, 4);
        memcpy(&res, (char *)mem_ctrl, 4);
        // Poll the busy register (offset 8) until it settles at 0.
        int busy = 0;
        memcpy(&busy, (char *)mem_ctrl + 8, 4);
        int busy_prev = busy;
        while (busy_prev != 0)
        {
            memcpy(&busy, (char *)mem_ctrl + 8, 4);
            if (busy != busy_prev)
            {
                busy_prev = busy;
            }
        }
        munmap(mem_ctrl, 4096);
        // Read the two output tensors back from DDR into acc_out.
        get_model_output(&yolo_v4, tmp_input.acc.acc_out);
        result_buf_mutex_.lock();
        result_buf_.push(tmp_input);
        result_buf_mutex_.unlock();
    }
}
void PedestrianDetection::write_predict()
{
while(nh_private.ok())
{
while(nh_private.ok())
{
if(result_buf_.empty())
{
std::this_thread::sleep_for(std::chrono::milliseconds(5));
}
else
{
break;
}
}
rawImgFile tmp_output;
result_buf_mutex_.lock();
tmp_output = result_buf_.front();
result_buf_.pop();
result_buf_mutex_.unlock();
decode(tmp_output);
delete [] tmp_output.acc.acc_out;
tmp_output.acc.acc_out = nullptr;
}
}
// One-shot debug flag: the first decoded frame's raw accelerator output is
// dumped to out.bin, then this flips to false for the rest of the run.
static bool outbin_write = true;
// Post-processes one frame: dequantizes the raw INT8 accelerator output,
// decodes boxes + NMS, draws the detections onto the frame, and publishes
// the annotated JPEG over ZeroMQ. Does nothing if no output is attached.
void PedestrianDetection::decode(rawImgFile &msg)
{
    cv::Mat frame = msg.raw_img; // shallow copy: shares pixels with msg.raw_img
    if (msg.acc.acc_out == nullptr)
    {
        printf("acc is empty \n");
        return;
    }
    int8_t *data = msg.acc.acc_out;
    if (outbin_write)
    {
        // Dump the first raw output once for offline inspection.
        FILE *fp = fopen((pkg_path + "/unit_test/result_img/out.bin").c_str(), "wb");
        if (fp != nullptr) // fix: fopen failure previously passed NULL to fwrite
        {
            fwrite(data, sizeof(int8_t), accRaw_size, fp);
            fclose(fp); // fix: the stream was never closed (descriptor leak)
        }
        outbin_write = false;
    }
    float *out = (float *)calloc(accData_size, sizeof(float));
    if (out == nullptr) // fix: calloc failure previously crashed unpadding()
    {
        printf("calloc failed \n");
        return;
    }
    // Raw rows are 32 bytes wide with 2 padding bytes; dequantize to floats.
    unpadding(data, out, 32, 2);
    std::vector<de_result> res;
    process_decode(out, res, frame);
    // fix: frame and msg.raw_img alias the same pixel data, so the original's
    // two draw_objects() calls painted every box twice; draw once.
    draw_objects(msg.raw_img, res);
#if 1
    // Publish the annotated frame as a JPEG on the "pedestrian_detect" topic.
    std::vector<uint8_t> vec_data;
    cv::imencode(".jpg", msg.raw_img, vec_data);
    std::string str_data;
    str_data.assign(vec_data.begin(), vec_data.end());
    zmqpp::message message;
    message << "pedestrian_detect" << str_data;
    socket_pub.send(message);
#endif
    free(out);
}
// Dequantizes the padded INT8 accelerator output into floats.
// Each grid cell occupies `ic` input bytes of which only the first
// (ic - padding) carry data. The two output layers use different
// zero-points/scales: layer 0 -> (q - 78) * 0.1175554, layer 1 -> (q - 88) * 0.1006139.
void PedestrianDetection::unpadding(int8_t* input, float* output, int ic ,int padding)
{
    const int layer0_cells = yolov4tnsize[0] * yolov4tnsize[1];
    const int valid = ic - padding;
    for (int cell = 0; cell < yolov4tnsize_sum; cell++, input += ic)
    {
        const bool in_layer0 = cell < layer0_cells;
        for (int ch = 0; ch < valid; ch++)
        {
            const int q = input[ch];
            *output++ = in_layer0 ? (q - 78) * 0.1175554f
                                  : (q - 88) * 0.1006139f;
        }
    }
}
// Logistic sigmoid: maps any real x into (0, 1).
inline float PedestrianDetection::sigmoid(const float &x)
{
    const float e = expf(-x);
    return 1.0f / (1.0f + e);
}
// Returns the index of the largest element in buf[0..len-1].
// Ties keep the earliest index. Assumes len >= 1.
inline int PedestrianDetection::find_max_index(float *buf, int len)
{
    int best = 0;
    for (int i = 1; i < len; ++i)
    {
        if (buf[i] > buf[best])
            best = i;
    }
    return best;
}
// Orders detections by descending combined confidence (sort key before NMS).
bool comp_prob(const de_result &a, const de_result &b)
{
    return b.complex_prob < a.complex_prob;
}
// Orders detections by descending class index.
bool comp(const de_result &a, const de_result &b)
{
    return b.index < a.index;
}
// Intersection-over-union of two center-format (x, y, w, h) boxes.
// Returns 0 when the boxes do not overlap.
inline float PedestrianDetection::clac_iou(const de_result &A, const de_result &B)
{
    // Overlap extents along each axis: min of right/bottom edges minus
    // max of left/top edges.
    const float overlap_w = std::min(A.x + A.w / 2.f, B.x + B.w / 2.f) -
                            std::max(A.x - A.w / 2.f, B.x - B.w / 2.f);
    const float overlap_h = std::min(A.y + A.h / 2.f, B.y + B.h / 2.f) -
                            std::max(A.y - A.h / 2.f, B.y - B.h / 2.f);
    if (overlap_w <= 0 || overlap_h <= 0)
        return 0;
    const float inter = overlap_w * overlap_h;
    return inter / ((A.w * A.h) + (B.w * B.h) - inter);
}
// Greedy per-class non-maximum suppression. vpbox is expected sorted by
// descending confidence; each surviving box suppresses later same-class
// boxes whose IoU with it exceeds iou_thres. Survivors are appended to
// voutbox in their original order.
void PedestrianDetection::nms(const std::vector<de_result> &vpbox, std::vector<de_result> &voutbox, float iou_thres)
{
    const int n = static_cast<int>(vpbox.size());
    std::vector<bool> suppressed(n, false);
    for (int i = 0; i < n; i++)
    {
        if (suppressed[i])
            continue;
        for (int j = i + 1; j < n; j++)
        {
            // Only boxes of the same class compete.
            if (suppressed[j] || vpbox[i].index != vpbox[j].index)
                continue;
            if (clac_iou(vpbox[i], vpbox[j]) > iou_thres)
            {
                suppressed[j] = true;
            }
        }
    }
    for (int i = 0; i < n; i++)
    {
        if (!suppressed[i])
        {
            voutbox.push_back(vpbox[i]);
        }
    }
}
// Decodes one anchor's raw prediction at grid cell (xno, yno) and appends a
// detection to `boxes` if its combined confidence clears conf_thresh_.
// `data` points at `length` floats: [tx, ty, tw, th, objectness, class scores...].
// NOTE(review): `float decode_data[length]` is a variable-length array
// (GCC/Clang extension) because `length` is a mutable static — confirm the
// build never targets a compiler without VLA support.
void PedestrianDetection::decode_one_with_thre(int layer, float *anchors, void *data, const int xno, const int yno, const int anchor_no, std::vector<de_result>& boxes, cv::Mat &img)
{
const float *ori_ptr = (const float *)data;
float decode_data[length], *data_ptr;
data_ptr = decode_data;
// Sigmoid only the objectness and class scores first, so the cheap
// confidence test below can reject most cells before decoding the box.
data_ptr[4] = sigmoid(ori_ptr[4]);
for(int k = 5; k < length; k++)
{
data_ptr[k] = sigmoid(ori_ptr[k]);
}
int max_index = find_max_index(data_ptr+5, kind_nums);
if(data_ptr[max_index+5] * data_ptr[4] < conf_thresh_)
return;
// Box decode: sigmoid for the cell-relative center offset, exp for the
// size multipliers applied to the anchor dimensions.
data_ptr[0] = sigmoid(ori_ptr[0]);
data_ptr[1] = sigmoid(ori_ptr[1]);
data_ptr[2] = expf(ori_ptr[2]);
data_ptr[3] = expf(ori_ptr[3]);
de_result tmp;
// Scale from grid units to pixels: yolov4tnsize[2*layer] is the row count,
// [2*layer+1] the column count of this output layer.
tmp.x = (data_ptr[0] + xno) / yolov4tnsize[2 * layer + 1] * img.cols;
tmp.y = (data_ptr[1] + yno) / yolov4tnsize[2 * layer] * img.rows;
tmp.w = (((data_ptr[2]) * anchors[2 * anchor_no])) / yolov4tnsize[2 * layer + 1] * img.cols;
tmp.h = (((data_ptr[3]) * anchors[2 * anchor_no + 1])) / yolov4tnsize[2 * layer] * img.rows;
tmp.conf = data_ptr[4];
tmp.index = max_index;
tmp.prob = data_ptr[max_index+5];
tmp.complex_prob = data_ptr[max_index+5] * data_ptr[4];
// printf("## layer[%d] prob : %f , x : %f , y : %f , w : %f , h : %f \n", layer, tmp.prob, tmp.x, tmp.y, tmp.w, tmp.h);
boxes.push_back(tmp);
}
void PedestrianDetection::process_decode(float* input, std::vector<de_result> &res, cv::Mat &img)
{
std::vector<de_result> boxes;
float* data_ptr = input;
for(int i = 0; i < yolov4tnsize[0]; i++){
for(int j = 0; j < yolov4tnsize[1]; j++){
for(int k = 0; k < 3; k++)
{
decode_one_with_thre(0, anchors_0, data_ptr, j, i, k, boxes, img);
data_ptr += length;
}
}
}
for(int i = 0; i < yolov4tnsize[2]; i++){
for(int j = 0; j < yolov4tnsize[3]; j++){
for(int k = 0; k < 3; k++)
{
decode_one_with_thre(1, anchors_1, data_ptr, j, i, k, boxes, img);
data_ptr += length;
}
}
}
sort(boxes.begin(), boxes.end(), comp_prob);
nms(boxes, res, nms_thresh_);
boxes.clear();
}
// Draws a labelled bounding box for every detection onto `image`: a green
// outline plus a filled caption strip reading "<label> <confidence>%".
// Boxes are clamped to the image bounds; the caption is pushed down to y=0
// when it would extend above the frame.
// NOTE(review): writes through the file-scope static `text` buffer, so this
// is not safe to call from multiple threads concurrently.
void PedestrianDetection::draw_objects(cv::Mat &image, const std::vector<de_result> &objects)
{
    double font_size = 0.6;
    int font_bold = 1;
    int baseLine = 0;
    for (size_t i = 0; i < objects.size(); i++)
    {
        const de_result &obj = objects[i];
        // fix: snprintf bounds the write into the 256-byte static buffer;
        // the original sprintf could overflow on long label names.
        snprintf(text, sizeof(text), "%s %.1f%%", label_name[obj.index].c_str(), obj.complex_prob * 100);
        // Convert center-format box to clamped top-left/bottom-right corners.
        int x = std::max(static_cast<int>(obj.x - obj.w / 2), 0);
        int y = std::max(static_cast<int>(obj.y - obj.h / 2), 0);
        int x1 = std::min(static_cast<int>(obj.x + obj.w / 2), image.cols);
        int y1 = std::min(static_cast<int>(obj.y + obj.h / 2), image.rows);
        int w = x1 - x - 1;
        int h = y1 - y - 1;
        cv::Rect obj_rect(x, y, w, h);
        cv::rectangle(image, obj_rect, cv::Scalar(0, 255, 0), font_bold);
        // Place the caption strip directly above the box.
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, font_size, font_bold, &baseLine);
        int tx = obj_rect.x;
        int ty = obj_rect.y - label_size.height - baseLine;
        if (ty < 0)
            ty = 0;
        cv::Point point(tx, ty + label_size.height);
        cv::rectangle(image, cv::Rect(cv::Point(obj_rect.x, ty), cv::Size(obj_rect.width, label_size.height + baseLine)), cv::Scalar(128, 128, 0), CV_FILLED);
        cv::putText(image, text, point, cv::FONT_HERSHEY_SIMPLEX, font_size, cv::Scalar(0, 255, 0), font_bold);
    }
}
// Splits src into an m (rows) x n (cols) grid of equal tiles, appended to
// `ceil` in row-major order. Tiles are ROI views into src (no pixel copy);
// remainder pixels on the right/bottom edges are dropped when the
// dimensions don't divide evenly.
void PedestrianDetection::cut_img(cv::Mat &src, int m, int n, std::vector<cv::Mat> &ceil)
{
    const int tile_h = src.rows / m;
    const int tile_w = src.cols / n;
    for (int row = 0; row < m; row++)
    {
        for (int col = 0; col < n; col++)
        {
            cv::Rect tile_rect(col * tile_w, row * tile_h, tile_w, tile_h);
            ceil.push_back(cv::Mat(src, tile_rect));
        }
    }
}