From 7513cf7964526a4b9b7699323ef2b59229b9531a Mon Sep 17 00:00:00 2001
From: Tadas Baltrusaitis
Date: Sat, 29 Jul 2017 21:11:16 -0400
Subject: [PATCH] Speeding up fancy face validation.
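
Add a TBB-parallelised variant of the CNN face validator (CheckCNN_tbb)
that runs the per-kernel convolutions of larger layers in parallel and
caches the kernel DFTs between frames. When tracking in videos, only run
validation every third successful frame (tracked through the new
success_in_a_row counter) and reuse the previous detection certainty on
the skipped frames.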
---
 .../include/LandmarkDetectionValidator.h      |   3 +
 .../include/LandmarkDetectorModel.h           |   1 +
 .../src/LandmarkDetectionValidator.cpp        | 295 +++++++++++++++++-
 .../src/LandmarkDetectorFunc.cpp              |  25 +-
 .../src/LandmarkDetectorModel.cpp             |   8 +-
 5 files changed, 329 insertions(+), 3 deletions(-)

diff --git a/lib/local/LandmarkDetector/include/LandmarkDetectionValidator.h b/lib/local/LandmarkDetector/include/LandmarkDetectionValidator.h
index 4c7c3a2..e6e590d 100644
--- a/lib/local/LandmarkDetector/include/LandmarkDetectionValidator.h
+++ b/lib/local/LandmarkDetector/include/LandmarkDetectionValidator.h
@@ -162,6 +162,9 @@ private:
 	// Feed-forward Neural Network
 	double CheckNN(const cv::Mat_<double>& warped_img, int view_id);
 
+	// Convolutional Neural Network
+	double CheckCNN_tbb(const cv::Mat_<double>& warped_img, int view_id);
+
 	// Convolutional Neural Network
 	double CheckCNN(const cv::Mat_<double>& warped_img, int view_id);
 
diff --git a/lib/local/LandmarkDetector/include/LandmarkDetectorModel.h b/lib/local/LandmarkDetector/include/LandmarkDetectorModel.h
index be63769..39d8db1 100644
--- a/lib/local/LandmarkDetector/include/LandmarkDetectorModel.h
+++ b/lib/local/LandmarkDetector/include/LandmarkDetectorModel.h
@@ -124,6 +124,7 @@ public:
 	// Keeping track of how many frames the tracker has failed in so far when tracking in videos
 	// This is useful for knowing when to initialise and reinitialise tracking
 	int failures_in_a_row;
+	int success_in_a_row;
 
 	// A template of a face that last succeeded with tracking (useful for large motions in video)
 	cv::Mat_<uchar> face_template;
diff --git a/lib/local/LandmarkDetector/src/LandmarkDetectionValidator.cpp b/lib/local/LandmarkDetector/src/LandmarkDetectionValidator.cpp
index 00d7c11..10d76cd 100644
--- a/lib/local/LandmarkDetector/src/LandmarkDetectionValidator.cpp
+++ b/lib/local/LandmarkDetector/src/LandmarkDetectionValidator.cpp
@@ -64,6 +64,9 @@
 #include <opencv2/core/core.hpp>
 #include <opencv2/imgproc.hpp>
 
+// TBB includes
+#include <tbb/tbb.h>
+
 // System includes
 #include <stdio.h>
 
@@ -481,7 +484,9 @@ double DetectionValidator::Check(const cv::Vec3d& orientation, const cv::Mat_<uchar>& warped_img, int
 	return dec;
 }
 
+// Convolutional Neural Network
+double DetectionValidator::CheckCNN_tbb(const cv::Mat_<double>& warped_img, int view_id)
+{
+
+	cv::Mat_<double> feature_vec;
+	NormaliseWarpedToVector(warped_img, feature_vec, view_id);
+
+	// Create a normalised image from the crop vector
+	cv::Mat_<float> img(warped_img.size(), 0.0);
+	img = img.t();
+
+	cv::Mat mask = paws[view_id].pixel_mask.t();
+	cv::MatIterator_<uchar> mask_it = mask.begin<uchar>();
+
+	cv::MatIterator_<double> feature_it = feature_vec.begin();
+	cv::MatIterator_<float> img_it = img.begin();
+
+	int wInt = img.cols;
+	int hInt = img.rows;
+
+	for (int i = 0; i < wInt; ++i)
+	{
+		for (int j = 0; j < hInt; ++j, ++mask_it, ++img_it)
+		{
+			// if is within mask
+			if (*mask_it)
+			{
+				// assign the feature to image if it is within the mask
+				*img_it = (float)*feature_it++;
+			}
+		}
+	}
+	img = img.t();
+
+	int cnn_layer = 0;
+	int fully_connected_layer = 0;
+
+	vector<cv::Mat_<float> > input_maps;
+	input_maps.push_back(img);
+
+	vector<cv::Mat_<float> > outputs;
+
+	for (size_t layer = 0; layer < cnn_layer_types[view_id].size(); ++layer)
+	{
+		// Determine layer type
+		int layer_type = cnn_layer_types[view_id][layer];
+
+		// Convolutional layer
+		if (layer_type == 0)
+		{
+			outputs.clear();
+			// Pre-allocate the output feature maps
+			outputs.resize(cnn_convolutional_layers[view_id][cnn_layer][0].size());
+
+			for (size_t in = 0; in < input_maps.size(); ++in)
+			{
+				cv::Mat_<float> input_image = input_maps[in];
+
+				// Useful precomputed data placeholders for quick correlation (convolution)
+				cv::Mat_<double> input_image_dft;
+				cv::Mat integral_image;
+				cv::Mat integral_image_sq;
+
+				// To adapt for TBB, perform the first convolution in a non-TBB way so that the dft and integral images are computed
+				cv::Mat_<float> kernel = cnn_convolutional_layers[view_id][cnn_layer][in][0];
+
+				// The convolution (with precomputation)
+				cv::Mat_<float> output;
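+				// matchTemplate_m fills precomputed_dft with this kernel's DFT on the
+				// first call; caching it lets later frames skip recomputing kernel FFTs.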
+				if (cnn_convolutional_layers_dft[view_id][cnn_layer][in][0].second.empty()) // This will only be needed during the first pass
+				{
+					std::map<int, cv::Mat_<double> > precomputed_dft;
+
+					LandmarkDetector::matchTemplate_m(input_image, input_image_dft, integral_image, integral_image_sq, kernel, precomputed_dft, output, CV_TM_CCORR);
+
+					cnn_convolutional_layers_dft[view_id][cnn_layer][in][0].first = precomputed_dft.begin()->first;
+					cnn_convolutional_layers_dft[view_id][cnn_layer][in][0].second = precomputed_dft.begin()->second;
+				}
+				else
+				{
+					std::map<int, cv::Mat_<double> > precomputed_dft;
+					precomputed_dft[cnn_convolutional_layers_dft[view_id][cnn_layer][in][0].first] = cnn_convolutional_layers_dft[view_id][cnn_layer][in][0].second;
+					LandmarkDetector::matchTemplate_m(input_image, input_image_dft, integral_image, integral_image_sq, kernel, precomputed_dft, output, CV_TM_CCORR);
+				}
+
+				// Combining the maps
+				if (in == 0)
+				{
+					outputs[0] = output;
+				}
+				else
+				{
+					outputs[0] = outputs[0] + output;
+				}
+
+				if(cnn_convolutional_layers[view_id][cnn_layer][0].size() > 20)
+				{
+					// TBB pass for the remaining kernels, empirically helps with layers with more kernels
+					tbb::parallel_for(1, (int)cnn_convolutional_layers[view_id][cnn_layer][in].size(), [&](int k) {
+					{
+						cv::Mat_<float> kernel = cnn_convolutional_layers[view_id][cnn_layer][in][k];
+
+						// The convolution (with precomputation)
+						cv::Mat_<float> output;
+						if (cnn_convolutional_layers_dft[view_id][cnn_layer][in][k].second.empty()) // This will only be needed during the first pass
+						{
+							std::map<int, cv::Mat_<double> > precomputed_dft;
+
+							LandmarkDetector::matchTemplate_m(input_image, input_image_dft, integral_image, integral_image_sq, kernel, precomputed_dft, output, CV_TM_CCORR);
+
+							cnn_convolutional_layers_dft[view_id][cnn_layer][in][k].first = precomputed_dft.begin()->first;
+							cnn_convolutional_layers_dft[view_id][cnn_layer][in][k].second = precomputed_dft.begin()->second;
+						}
+						else
+						{
+							std::map<int, cv::Mat_<double> > precomputed_dft;
+							precomputed_dft[cnn_convolutional_layers_dft[view_id][cnn_layer][in][k].first] = cnn_convolutional_layers_dft[view_id][cnn_layer][in][k].second;
+							LandmarkDetector::matchTemplate_m(input_image, input_image_dft, integral_image, integral_image_sq, kernel, precomputed_dft, output, CV_TM_CCORR);
+						}
+
+						// Combining the maps
+						if (in == 0)
+						{
+							outputs[k] = output;
+						}
+						else
+						{
+							outputs[k] = outputs[k] + output;
+						}
+					}
+					});
+				}
+				else
+				{
+					for (size_t k = 1; k < cnn_convolutional_layers[view_id][cnn_layer][in].size(); ++k)
+					{
+						cv::Mat_<float> kernel = cnn_convolutional_layers[view_id][cnn_layer][in][k];
+
+						// The convolution (with precomputation)
+						cv::Mat_<float> output;
+						if (cnn_convolutional_layers_dft[view_id][cnn_layer][in][k].second.empty()) // This will only be needed during the first pass
+						{
+							std::map<int, cv::Mat_<double> > precomputed_dft;
+
+							LandmarkDetector::matchTemplate_m(input_image, input_image_dft, integral_image, integral_image_sq, kernel, precomputed_dft, output, CV_TM_CCORR);
+
+							cnn_convolutional_layers_dft[view_id][cnn_layer][in][k].first = precomputed_dft.begin()->first;
+							cnn_convolutional_layers_dft[view_id][cnn_layer][in][k].second = precomputed_dft.begin()->second;
+						}
+						else
+						{
+							std::map<int, cv::Mat_<double> > precomputed_dft;
+							precomputed_dft[cnn_convolutional_layers_dft[view_id][cnn_layer][in][k].first] = cnn_convolutional_layers_dft[view_id][cnn_layer][in][k].second;
+							LandmarkDetector::matchTemplate_m(input_image, input_image_dft, integral_image, integral_image_sq, kernel, precomputed_dft, output, CV_TM_CCORR);
+						}
+
+						// Combining the maps
+						if (in == 0)
+						{
+							outputs[k] = output;
+						}
+						else
+						{
+							outputs[k] = outputs[k] + output;
+						}
+					}
+				}
+			}
+
+			for (size_t k = 0; k < cnn_convolutional_layers[view_id][cnn_layer][0].size(); ++k)
+			{
+				outputs[k] = outputs[k] + cnn_convolutional_layers_bias[view_id][cnn_layer][k];
+			}
+			cnn_layer++;
+		}
+		if (layer_type == 1)
+		{
+			vector<cv::Mat_<float> > outputs_sub;
+
+			// Iterate over pool height and width, all the stride is 2x2 and no padding is used
+			int stride_x = 2;
+			int stride_y = 2;
+
+			int pool_x = 2;
+			int pool_y = 2;
+
+			for (size_t in = 0; in < input_maps.size(); ++in)
+			{
+				int out_x = input_maps[in].cols / stride_x;
+				int out_y = input_maps[in].rows / stride_y;
+
+				cv::Mat_<float> sub_out(out_y, out_x, 0.0);
+				cv::Mat_<float> in_map = input_maps[in];
+
+				for (int x = 0; x < input_maps[in].cols; x += stride_x)
+				{
+					for (int y = 0; y < input_maps[in].rows; y += stride_y)
+					{
+						float curr_max = -FLT_MAX;
+						for (int x_in = x; x_in < x + pool_x; ++x_in)
+						{
+							for (int y_in = y; y_in < y + pool_y; ++y_in)
+							{
+								float curr_val = in_map.at<float>(y_in, x_in);
+								if (curr_val > curr_max)
+								{
+									curr_max = curr_val;
+								}
+							}
+						}
+						int x_in_out = x / stride_x;
+						int y_in_out = y / stride_y;
+						sub_out.at<float>(y_in_out, x_in_out) = curr_max;
+					}
+				}
+
+				outputs_sub.push_back(sub_out);
+
+			}
+			outputs = outputs_sub;
+		}
+		if (layer_type == 2)
+		{
+			// Concatenate all the maps
+			cv::Mat_<float> input_concat = input_maps[0].t();
+			input_concat = input_concat.reshape(0, 1);
+
+			for (size_t in = 1; in < input_maps.size(); ++in)
+			{
+				cv::Mat_<float> add = input_maps[in].t();
+				add = add.reshape(0, 1);
+				cv::hconcat(input_concat, add, input_concat);
+			}
+
+			input_concat = input_concat * cnn_fully_connected_layers_weights[view_id][fully_connected_layer];
+			input_concat = input_concat + cnn_fully_connected_layers_biases[view_id][fully_connected_layer].t();
+
+			outputs.clear();
+			outputs.push_back(input_concat);
+
+			fully_connected_layer++;
+		}
+		if (layer_type == 3) // ReLU
+		{
+			outputs.clear();
+			for (size_t k = 0; k < input_maps.size(); ++k)
+			{
+				// Apply the ReLU
+				cv::threshold(input_maps[k], input_maps[k], 0, 0, cv::THRESH_TOZERO);
+				outputs.push_back(input_maps[k]);
+
+			}
+		}
+		if (layer_type == 4)
+		{
+			outputs.clear();
+			for (size_t k = 0; k < input_maps.size(); ++k)
+			{
+				// Apply the sigmoid
+				cv::exp(-input_maps[k], input_maps[k]);
+				input_maps[k] = 1.0 / (1.0 + input_maps[k]);
+
+				outputs.push_back(input_maps[k]);
+
+			}
+		}
+		// Set the outputs of this layer to inputs of the next
+		input_maps = outputs;
+
+	}
+
+	// First turn to the 0-3 range
+	double max_val = 0;
+	cv::Point max_loc;
+	cv::minMaxLoc(outputs[0].t(), 0, &max_val, 0, &max_loc);
+	int max_idx = max_loc.y;
+	double max = 3;
+	double min = 0;
+	double bins = (double)outputs[0].cols;
+	// Unquantizing the softmax layer to continuous value
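+	// The net predicts a distribution over `bins` quantised scores in [min, max];
+	// the centre of the winning bin is taken as the continuous validation score.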
+	double step_size = (max - min) / bins; // This should be saved somewhere
+	double unquantized = min + step_size / 2.0 + max_idx * step_size;
+
+	// Turn it to -1, 1 range
+	double dec = (unquantized - 1.5) / 1.5;
+
+	return dec;
+}
+
 // Convolutional Neural Network
 double DetectionValidator::CheckCNN(const cv::Mat_<double>& warped_img, int view_id)
 {
@@ -849,6 +1141,7 @@ double DetectionValidator::CheckCNN(const cv::Mat_<double>& warped_img, int view
 			cv::Mat integral_image;
 			cv::Mat integral_image_sq;
 
+			// TODO can TBB-ify this
 			for (size_t k = 0; k < cnn_convolutional_layers[view_id][cnn_layer][in].size(); ++k)
 			{
 				cv::Mat_<float> kernel = cnn_convolutional_layers[view_id][cnn_layer][in][k];
diff --git a/lib/local/LandmarkDetector/src/LandmarkDetectorFunc.cpp b/lib/local/LandmarkDetector/src/LandmarkDetectorFunc.cpp
index f543965..85c1eb3 100644
--- a/lib/local/LandmarkDetector/src/LandmarkDetectorFunc.cpp
+++ b/lib/local/LandmarkDetector/src/LandmarkDetectorFunc.cpp
@@ -288,16 +288,36 @@ bool LandmarkDetector::DetectLandmarksInVideo(const cv::Mat_<uchar> &grayscale_i
 			CorrectGlobalParametersVideo(grayscale_image, clnf_model, params);
 		}
 
+
+		// If we are performing face validation, do it every 3 frames due to performance
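+		// Turn validation off for this frame and keep the last certainty, so the
+		// skipped frames keep reporting the most recent validated score.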
+		bool reset_to_true = false;
+		double old_certainty = 0;
+		if (params.validate_detections == true && clnf_model.success_in_a_row % 3 != 0)
+		{
+			params.validate_detections = false;
+			reset_to_true = true;
+			old_certainty = clnf_model.detection_certainty;
+		}
+
 		bool track_success = clnf_model.DetectLandmarks(grayscale_image, depth_image, params);
+
+		if (reset_to_true)
+		{
+			params.validate_detections = true;
+			clnf_model.detection_certainty = old_certainty;
+		}
+
 		if(!track_success)
 		{
 			// Make a record that tracking failed
 			clnf_model.failures_in_a_row++;
+			clnf_model.success_in_a_row = 0;
 		}
 		else
 		{
 			// indicate that tracking is a success
 			clnf_model.failures_in_a_row = -1;
+			clnf_model.success_in_a_row++;
 			UpdateTemplate(grayscale_image, clnf_model);
 		}
 	}
@@ -377,7 +397,8 @@ bool LandmarkDetector::DetectLandmarksInVideo(const cv::Mat_<uchar> &grayscale_i
 		}
 		else
 		{
-			clnf_model.failures_in_a_row = -1;
+			clnf_model.failures_in_a_row = -1;
+			clnf_model.success_in_a_row++;
 			UpdateTemplate(grayscale_image, clnf_model);
 			return true;
 		}
@@ -388,12 +409,14 @@ bool LandmarkDetector::DetectLandmarksInVideo(const cv::Mat_<uchar> &grayscale_i
 	if(!clnf_model.tracking_initialised)
 	{
 		clnf_model.failures_in_a_row++;
+		clnf_model.success_in_a_row = 0;
 	}
 
 	// un-initialise the tracking
 	if( clnf_model.failures_in_a_row > 100)
 	{
 		clnf_model.tracking_initialised = false;
+		clnf_model.success_in_a_row = 0;
 	}
 
 	return clnf_model.detection_success;
diff --git a/lib/local/LandmarkDetector/src/LandmarkDetectorModel.cpp b/lib/local/LandmarkDetector/src/LandmarkDetectorModel.cpp
index ba15eff..a8e89ff 100644
--- a/lib/local/LandmarkDetector/src/LandmarkDetectorModel.cpp
+++ b/lib/local/LandmarkDetector/src/LandmarkDetectorModel.cpp
@@ -76,7 +76,8 @@ CLNF::CLNF(const CLNF& other): pdm(other.pdm), params_local(other.params_local.c
 	this->detection_certainty = other.detection_certainty;
 	this->model_likelihood = other.model_likelihood;
 	this->failures_in_a_row = other.failures_in_a_row;
-
+	this->success_in_a_row = other.success_in_a_row;
+
 	// Load the CascadeClassifier (as it does not have a proper copy constructor)
 	if(!face_detector_location.empty())
 	{
@@ -121,6 +122,7 @@ CLNF & CLNF::operator= (const CLNF& other)
 	this->detection_certainty = other.detection_certainty;
 	this->model_likelihood = other.model_likelihood;
 	this->failures_in_a_row = other.failures_in_a_row;
+	this->success_in_a_row = other.success_in_a_row;
 
 	this->eye_model = other.eye_model;
 
@@ -164,6 +166,7 @@ CLNF::CLNF(const CLNF&& other)
 	this->detection_certainty = other.detection_certainty;
 	this->model_likelihood = other.model_likelihood;
 	this->failures_in_a_row = other.failures_in_a_row;
+	this->success_in_a_row = other.success_in_a_row;
 
 	pdm = other.pdm;
 	params_local = other.params_local;
@@ -199,6 +202,7 @@ CLNF & CLNF::operator= (const CLNF&& other)
 	this->detection_certainty = other.detection_certainty;
 	this->model_likelihood = other.model_likelihood;
 	this->failures_in_a_row = other.failures_in_a_row;
+	this->success_in_a_row = other.success_in_a_row;
 
 	pdm = other.pdm;
 	params_local = other.params_local;
@@ -527,6 +531,7 @@ void CLNF::Read(string main_location)
 	params_global = cv::Vec6d(1, 0, 0, 0, 0, 0);
 
 	failures_in_a_row = -1;
+	success_in_a_row = 0;
 }
 
@@ -547,6 +552,7 @@ void CLNF::Reset()
 	params_global = cv::Vec6d(1, 0, 0, 0, 0, 0);
 
 	failures_in_a_row = -1;
+	success_in_a_row = 0;
 	face_template = cv::Mat_<uchar>();
 }