In this tutorial we will compare the AKAZE and ORB local features, using them to find matches between video frames and to track object movements.
The algorithm is as follows:
- Detect and describe keypoints on the first frame, manually set the object boundaries
- For every next frame:
  - Detect and describe keypoints
  - Match them using a brute-force matcher
  - Estimate the homography transformation using RANSAC
  - Filter the inliers from all the matches
  - Apply the homography transformation to the bounding box to find the object
  - Draw the bounding box and inliers, compute the inlier ratio as an evaluation metric
To do the tracking we need a video and the object position on the first frame.
You can download our example video and data from here.
To run the code you have to specify the input and output video paths and the object bounding box file.
./planar_tracking blais.mp4 result.avi blais_bb.xml.gz
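The bounding box file is read with cv::FileStorage and must contain a bounding_box node holding the four corner points (see the main function below). If you want to track your own object, a minimal sketch for producing such a file could look like this; the corner coordinates are placeholders:

#include <opencv2/core.hpp>
#include <vector>
int main()
{
    // The four corners below are placeholders; use your object's corners
    // in the first frame of your video.
    std::vector<cv::Point2f> bb = { {319.f, 45.f}, {449.f, 45.f},
                                    {449.f, 201.f}, {319.f, 201.f} };
    cv::FileStorage fs("blais_bb.xml.gz", cv::FileStorage::WRITE);
    fs << "bounding_box" << bb;
    return 0;
}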
#include <opencv2/features2d.hpp>
#include <opencv2/videoio.hpp>
#include <opencv2/opencv.hpp>
#include <vector>
#include <iostream>
#include <iomanip>
#include "stats.h" // Stats structure definition
#include "utils.h" // Drawing and printing functions
using namespace std;
using namespace cv;
const double akaze_thresh = 3e-4; // AKAZE detection threshold set to locate about 1000 keypoints
const double ransac_thresh = 2.5f; // RANSAC inlier threshold
const double nn_match_ratio = 0.8f; // Nearest-neighbour matching ratio
const int bb_min_inliers = 100; // Minimal number of inliers to draw bounding box
const int stats_update_period = 10; // On-screen statistics are updated every 10 frames
class Tracker
{
public:
Tracker(Ptr<Feature2D> _detector, Ptr<DescriptorMatcher> _matcher) :
detector(_detector),
matcher(_matcher)
{}
void setFirstFrame(const Mat frame, vector<Point2f> bb, string title, Stats& stats);
Mat process(const Mat frame, Stats& stats);
Ptr<Feature2D> getDetector() {
return detector;
}
protected:
Ptr<Feature2D> detector;
Ptr<DescriptorMatcher> matcher;
Mat first_frame, first_desc;
vector<KeyPoint> first_kp;
vector<Point2f> object_bb;
};
void Tracker::setFirstFrame(const Mat frame, vector<Point2f> bb, string title, Stats& stats)
{
first_frame = frame.clone();
detector->detectAndCompute(first_frame, noArray(), first_kp, first_desc);
stats.keypoints = (int)first_kp.size();
drawBoundingBox(first_frame, bb);
putText(first_frame, title, Point(0, 60), FONT_HERSHEY_PLAIN, 5, Scalar::all(0), 4);
object_bb = bb;
}
Mat Tracker::process(const Mat frame, Stats& stats)
{
vector<KeyPoint> kp;
Mat desc;
detector->detectAndCompute(frame, noArray(), kp, desc);
stats.keypoints = (int)kp.size();
vector< vector<DMatch> > matches;
vector<KeyPoint> matched1, matched2;
matcher->knnMatch(first_desc, desc, matches, 2);
for(unsigned i = 0; i < matches.size(); i++) {
if(matches[i][0].distance < nn_match_ratio * matches[i][1].distance) {
matched1.push_back(first_kp[matches[i][0].queryIdx]);
matched2.push_back( kp[matches[i][0].trainIdx]);
}
}
stats.matches = (int)matched1.size();
Mat inlier_mask, homography;
vector<KeyPoint> inliers1, inliers2;
vector<DMatch> inlier_matches;
if(matched1.size() >= 4) {
homography = findHomography(Points(matched1), Points(matched2),
RANSAC, ransac_thresh, inlier_mask);
}
if(matched1.size() < 4 || homography.empty()) {
Mat res;
hconcat(first_frame, frame, res);
stats.inliers = 0;
stats.ratio = 0;
return res;
}
for(unsigned i = 0; i < matched1.size(); i++) {
if(inlier_mask.at<uchar>(i)) {
int new_i = static_cast<int>(inliers1.size());
inliers1.push_back(matched1[i]);
inliers2.push_back(matched2[i]);
inlier_matches.push_back(DMatch(new_i, new_i, 0));
}
}
stats.inliers = (int)inliers1.size();
stats.ratio = stats.inliers * 1.0 / stats.matches;
vector<Point2f> new_bb;
perspectiveTransform(object_bb, new_bb, homography);
Mat frame_with_bb = frame.clone();
if(stats.inliers >= bb_min_inliers) {
drawBoundingBox(frame_with_bb, new_bb);
}
Mat res;
drawMatches(first_frame, inliers1, frame_with_bb, inliers2,
inlier_matches, res,
Scalar(255, 0, 0), Scalar(255, 0, 0));
return res;
}
int main(int argc, char **argv)
{
if(argc < 4) {
cerr << "Usage: " << endl <<
"akaze_track input_path output_path bounding_box" << endl;
return 1;
}
VideoCapture video_in(argv[1]);
if(!video_in.isOpened()) {
cerr << "Couldn't open " << argv[1] << endl;
return 1;
}
// Query the input properties only after the capture is known to be open.
// The output frame is double width (reference frame next to current frame)
// and double height (AKAZE result stacked on top of ORB result).
VideoWriter video_out(argv[2],
(int)video_in.get(CAP_PROP_FOURCC),
(int)video_in.get(CAP_PROP_FPS),
Size(2 * (int)video_in.get(CAP_PROP_FRAME_WIDTH),
2 * (int)video_in.get(CAP_PROP_FRAME_HEIGHT)));
if(!video_out.isOpened()) {
cerr << "Couldn't open " << argv[2] << endl;
return 1;
}
vector<Point2f> bb;
FileStorage fs(argv[3], FileStorage::READ);
if(fs["bounding_box"].empty()) {
cerr << "Couldn't read bounding_box from " << argv[3] << endl;
return 1;
}
fs["bounding_box"] >> bb;
Stats stats, akaze_stats, orb_stats;
Ptr<AKAZE> akaze = AKAZE::create();
akaze->setThreshold(akaze_thresh); // the string-based set("threshold", ...) API no longer exists
Ptr<ORB> orb = ORB::create();
Ptr<DescriptorMatcher> matcher = DescriptorMatcher::create("BruteForce-Hamming");
Tracker akaze_tracker(akaze, matcher);
Tracker orb_tracker(orb, matcher);
Mat frame;
video_in >> frame;
akaze_tracker.setFirstFrame(frame, bb, "AKAZE", stats);
orb->setMaxFeatures(stats.keypoints); // match ORB's budget to the number of AKAZE keypoints
orb_tracker.setFirstFrame(frame, bb, "ORB", stats);
Stats akaze_draw_stats, orb_draw_stats;
int frame_count = (int)video_in.get(CAP_PROP_FRAME_COUNT);
Mat akaze_res, orb_res, res_frame;
for(int i = 1; i < frame_count; i++) {
bool update_stats = (i % stats_update_period == 0);
video_in >> frame;
akaze_res = akaze_tracker.process(frame, stats);
akaze_stats += stats;
if(update_stats) {
akaze_draw_stats = stats;
}
orb->setMaxFeatures(stats.keypoints); // stats still holds the AKAZE keypoint count here
orb_res = orb_tracker.process(frame, stats);
orb_stats += stats;
if(update_stats) {
orb_draw_stats = stats;
}
drawStatistics(akaze_res, akaze_draw_stats);
drawStatistics(orb_res, orb_draw_stats);
vconcat(akaze_res, orb_res, res_frame);
video_out << res_frame;
cout << i << "/" << frame_count - 1 << endl;
}
akaze_stats /= frame_count - 1;
orb_stats /= frame_count - 1;
printStatistics("AKAZE", akaze_stats);
printStatistics("ORB", orb_stats);
return 0;
}
This class implements the algorithm described above using the given feature detector and descriptor matcher.
Setting up the first frame
void Tracker::setFirstFrame(const Mat frame, vector<Point2f> bb, string title, Stats& stats)
{
first_frame = frame.clone();
detector->detectAndCompute(first_frame, noArray(), first_kp, first_desc);
stats.keypoints = (int)first_kp.size();
drawBoundingBox(first_frame, bb);
putText(first_frame, title, Point(0, 60), FONT_HERSHEY_PLAIN, 5, Scalar::all(0), 4);
object_bb = bb;
}
We compute and store the keypoints and descriptors from the first frame and prepare it for the output.
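drawBoundingBox and the other drawing helpers come from utils.h, which is not listed in this tutorial. Judging from how it is called, it presumably connects the four corners with line segments, roughly like this sketch:

#include <opencv2/imgproc.hpp>
#include <vector>
// Sketch of drawBoundingBox from utils.h (an assumption, not the shipped code):
// draw the polygon by connecting consecutive corners and closing the loop.
void drawBoundingBox(cv::Mat image, std::vector<cv::Point2f> bb)
{
    for(unsigned i = 0; i < bb.size(); i++)
        cv::line(image, bb[i], bb[(i + 1) % bb.size()], cv::Scalar(0, 0, 255), 2);
}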
We need to save the number of detected keypoints to make sure both detectors locate roughly the same number of them.
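stats.h is also not listed here. Judging from how main accumulates and averages the statistics, a compatible Stats structure can be sketched as follows (an assumption, the sample ships its own version):

// Sketch of a Stats structure compatible with the code above (assumption).
struct Stats
{
    int matches = 0;   // matches that passed the ratio test
    int inliers = 0;   // matches kept by RANSAC
    double ratio = 0;  // inliers / matches
    int keypoints = 0; // keypoints detected on the current frame
    Stats& operator+=(const Stats& op)
    {
        matches += op.matches;
        inliers += op.inliers;
        ratio += op.ratio;
        keypoints += op.keypoints;
        return *this;
    }
    Stats& operator/=(int num) // used to average the accumulated per-frame statistics
    {
        matches /= num;
        inliers /= num;
        ratio /= num;
        keypoints /= num;
        return *this;
    }
};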
Processing frames
detector->detectAndCompute(frame, noArray(), kp, desc);
To find matches between frames we have to locate the keypoints first. In this tutorial the detectors are set up to find about 1000 keypoints on each frame.
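The AKAZE threshold above was chosen by hand for the example video. If you need to hit a given keypoint budget on your own footage, a hypothetical helper (not part of the tutorial code) could adjust the threshold iteratively:

#include <opencv2/features2d.hpp>
#include <vector>
// Hypothetical helper: nudge the AKAZE threshold until roughly `target`
// keypoints are detected on a reference frame.
void tuneAkazeThreshold(cv::Ptr<cv::AKAZE> akaze, const cv::Mat& frame, size_t target)
{
    std::vector<cv::KeyPoint> kp;
    for(int pass = 0; pass < 10; pass++)
    {
        akaze->detect(frame, kp);
        if(kp.size() > target * 9 / 10 && kp.size() < target * 11 / 10)
            break; // within 10% of the budget is close enough
        double t = akaze->getThreshold();
        // A lower threshold yields more keypoints, so scale proportionally.
        akaze->setThreshold(kp.empty() ? t * 0.5 : t * (double)kp.size() / (double)target);
    }
}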
matcher->knnMatch(first_desc, desc, matches, 2);
for(unsigned i = 0; i < matches.size(); i++) {
if(matches[i][0].distance < nn_match_ratio * matches[i][1].distance) {
matched1.push_back(first_kp[matches[i][0].queryIdx]);
matched2.push_back(kp[matches[i][0].trainIdx]);
}
}
A pair is kept only if the distance to the closest match is less than nn_match_ratio times the distance to the second-closest one (Lowe's ratio test). For example, with nn_match_ratio = 0.8 a best match at distance 40 is accepted only if the second-best candidate is farther than 50.
homography = findHomography(Points(matched1), Points(matched2),
RANSAC, ransac_thresh, inlier_mask);
If there are at least 4 matches we can use random sample consensus to estimate the image transformation. A homography has 8 degrees of freedom and each point correspondence provides 2 constraints, so 4 correspondences are the minimum.
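findHomography expects pixel coordinates rather than KeyPoint objects; the Points helper comes from utils.h and presumably just extracts the pt member of each keypoint, roughly:

#include <opencv2/core.hpp>
#include <vector>
// Sketch of the Points helper from utils.h (an assumption):
// convert keypoints to their pixel coordinates.
std::vector<cv::Point2f> Points(const std::vector<cv::KeyPoint>& keypoints)
{
    std::vector<cv::Point2f> res;
    res.reserve(keypoints.size());
    for(const cv::KeyPoint& kp : keypoints)
        res.push_back(kp.pt);
    return res;
}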
for(unsigned i = 0; i < matched1.size(); i++) {
if(inlier_mask.at<uchar>(i)) {
int new_i = static_cast<int>(inliers1.size());
inliers1.push_back(matched1[i]);
inliers2.push_back(matched2[i]);
inlier_matches.push_back(DMatch(new_i, new_i, 0));
}
}
Since findHomography computes the inliers we only have to save the chosen points and matches.
perspectiveTransform(object_bb, new_bb, homography);
If there is a reasonable number of inliers we can use the estimated transformation to locate the object.
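For each bounding-box corner, perspectiveTransform applies the homography and divides by the projective component; an illustrative sketch (not the library implementation) of that mapping:

#include <opencv2/core.hpp>
// Illustrative sketch: map one corner (x, y) through a 3x3 homography H.
// findHomography returns a CV_64F (double) matrix.
cv::Point2f applyHomography(const cv::Mat& H, cv::Point2f p)
{
    double w  = H.at<double>(2,0)*p.x + H.at<double>(2,1)*p.y + H.at<double>(2,2);
    double nx = H.at<double>(0,0)*p.x + H.at<double>(0,1)*p.y + H.at<double>(0,2);
    double ny = H.at<double>(1,0)*p.x + H.at<double>(1,1)*p.y + H.at<double>(1,2);
    return cv::Point2f((float)(nx / w), (float)(ny / w));
}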
You can watch the resulting video on YouTube.
AKAZE statistics:
Matches      626
Inliers      410
Inlier ratio 0.58
Keypoints    1117

ORB statistics:
Matches      504
Inliers      319
Inlier ratio 0.56
Keypoints    1112