AKAZE and ORB planar tracking

Introduction

In this tutorial we will compare AKAZE and ORB local features by using them to find matches between video frames and to track object movement.

The algorithm is as follows:

  • Detect and describe keypoints on the first frame, manually set the object boundaries
  • For every next frame:
    1. Detect and describe keypoints
    2. Match them using a brute-force matcher
    3. Estimate the homography transformation using RANSAC
    4. Filter inliers from all the matches
    5. Apply the homography transformation to the bounding box to find the object
    6. Draw the bounding box and inliers, and compute the inlier ratio as an evaluation metric
Result frame example

Data

To do the tracking we need a video and the object position on the first frame.

You can download our example video and data from here.

To run the code you have to specify the input and output video paths and the object bounding box file:

./planar_tracking blais.mp4 result.avi blais_bb.xml.gz
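
The bounding box file stores the four corners of the object in the first frame under the key bounding_box (the program reads it with cv::FileStorage, as shown in the source below). If you want to prepare a box for your own video, a minimal sketch for writing such a file could look like this; the corner coordinates are placeholders you would replace with your object's actual corners:

#include <opencv2/core.hpp>
#include <vector>

using namespace std;
using namespace cv;

int main()
{
    // Four corners of the object in the first frame (placeholder values)
    vector<Point2f> bb;
    bb.push_back(Point2f(100, 100));
    bb.push_back(Point2f(300, 100));
    bb.push_back(Point2f(300, 400));
    bb.push_back(Point2f(100, 400));

    // Write them under the key the tracker expects; the .xml.gz
    // extension makes FileStorage compress the file
    FileStorage fs("my_bb.xml.gz", FileStorage::WRITE);
    fs << "bounding_box" << bb;
    fs.release();
    return 0;
}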

Source Code

#include <opencv2/features2d.hpp>
#include <opencv2/videoio.hpp>
#include <opencv2/opencv.hpp>
#include <vector>
#include <iostream>
#include <iomanip>

#include "stats.h" // Stats structure definition
#include "utils.h" // Drawing and printing functions

using namespace std;
using namespace cv;

const double akaze_thresh = 3e-4; // AKAZE detection threshold set to locate about 1000 keypoints
const double ransac_thresh = 2.5; // RANSAC inlier threshold
const double nn_match_ratio = 0.8; // Nearest-neighbour matching ratio
const int bb_min_inliers = 100; // Minimal number of inliers to draw bounding box
const int stats_update_period = 10; // On-screen statistics are updated every 10 frames

class Tracker
{
public:
    Tracker(Ptr<Feature2D> _detector, Ptr<DescriptorMatcher> _matcher) :
        detector(_detector),
        matcher(_matcher)
    {}

    void setFirstFrame(const Mat frame, vector<Point2f> bb, string title, Stats& stats);
    Mat process(const Mat frame, Stats& stats);
    Ptr<Feature2D> getDetector() {
        return detector;
    }
protected:
    Ptr<Feature2D> detector;
    Ptr<DescriptorMatcher> matcher;
    Mat first_frame, first_desc;
    vector<KeyPoint> first_kp;
    vector<Point2f> object_bb;
};

void Tracker::setFirstFrame(const Mat frame, vector<Point2f> bb, string title, Stats& stats)
{
    first_frame = frame.clone();
    detector->detectAndCompute(first_frame, noArray(), first_kp, first_desc);
    stats.keypoints = (int)first_kp.size();
    drawBoundingBox(first_frame, bb);
    putText(first_frame, title, Point(0, 60), FONT_HERSHEY_PLAIN, 5, Scalar::all(0), 4);
    object_bb = bb;
}

Mat Tracker::process(const Mat frame, Stats& stats)
{
    vector<KeyPoint> kp;
    Mat desc;
    detector->detectAndCompute(frame, noArray(), kp, desc);
    stats.keypoints = (int)kp.size();

    vector< vector<DMatch> > matches;
    vector<KeyPoint> matched1, matched2;
    matcher->knnMatch(first_desc, desc, matches, 2);
    for(unsigned i = 0; i < matches.size(); i++) {
        if(matches[i][0].distance < nn_match_ratio * matches[i][1].distance) {
            matched1.push_back(first_kp[matches[i][0].queryIdx]);
            matched2.push_back(      kp[matches[i][0].trainIdx]);
        }
    }
    stats.matches = (int)matched1.size();

    Mat inlier_mask, homography;
    vector<KeyPoint> inliers1, inliers2;
    vector<DMatch> inlier_matches;
    if(matched1.size() >= 4) {
        homography = findHomography(Points(matched1), Points(matched2),
                                    RANSAC, ransac_thresh, inlier_mask);
    }

    if(matched1.size() < 4 || homography.empty()) {
        Mat res;
        hconcat(first_frame, frame, res);
        stats.inliers = 0;
        stats.ratio = 0;
        return res;
    }
    for(unsigned i = 0; i < matched1.size(); i++) {
        if(inlier_mask.at<uchar>(i)) {
            int new_i = static_cast<int>(inliers1.size());
            inliers1.push_back(matched1[i]);
            inliers2.push_back(matched2[i]);
            inlier_matches.push_back(DMatch(new_i, new_i, 0));
        }
    }
    stats.inliers = (int)inliers1.size();
    stats.ratio = stats.inliers * 1.0 / stats.matches;

    vector<Point2f> new_bb;
    perspectiveTransform(object_bb, new_bb, homography);
    Mat frame_with_bb = frame.clone();
    if(stats.inliers >= bb_min_inliers) {
        drawBoundingBox(frame_with_bb, new_bb);
    }
    Mat res;
    drawMatches(first_frame, inliers1, frame_with_bb, inliers2,
                inlier_matches, res,
                Scalar(255, 0, 0), Scalar(255, 0, 0));
    return res;
}

int main(int argc, char **argv)
{
    if(argc < 4) {
        cerr << "Usage: " << endl <<
                "akaze_track input_path output_path bounding_box" << endl;
        return 1;
    }
    VideoCapture video_in(argv[1]);
    VideoWriter  video_out(argv[2],
                           (int)video_in.get(CAP_PROP_FOURCC),
                           (int)video_in.get(CAP_PROP_FPS),
                           Size(2 * (int)video_in.get(CAP_PROP_FRAME_WIDTH),
                                2 * (int)video_in.get(CAP_PROP_FRAME_HEIGHT)));

    if(!video_in.isOpened()) {
        cerr << "Couldn't open " << argv[1] << endl;
        return 1;
    }
    if(!video_out.isOpened()) {
        cerr << "Couldn't open " << argv[2] << endl;
        return 1;
    }

    vector<Point2f> bb;
    FileStorage fs(argv[3], FileStorage::READ);
    if(fs["bounding_box"].empty()) {
        cerr << "Couldn't read bounding_box from " << argv[3] << endl;
        return 1;
    }
    fs["bounding_box"] >> bb;

    Stats stats, akaze_stats, orb_stats;
    Ptr<AKAZE> akaze = AKAZE::create();
    akaze->setThreshold(akaze_thresh);
    Ptr<ORB> orb = ORB::create();
    Ptr<DescriptorMatcher> matcher = DescriptorMatcher::create("BruteForce-Hamming");
    Tracker akaze_tracker(akaze, matcher);
    Tracker orb_tracker(orb, matcher);

    Mat frame;
    video_in >> frame;
    akaze_tracker.setFirstFrame(frame, bb, "AKAZE", stats);
    // Give ORB the same keypoint budget that AKAZE actually found
    orb->setMaxFeatures(stats.keypoints);
    orb_tracker.setFirstFrame(frame, bb, "ORB", stats);

    Stats akaze_draw_stats, orb_draw_stats;
    int frame_count = (int)video_in.get(CAP_PROP_FRAME_COUNT);
    Mat akaze_res, orb_res, res_frame;
    for(int i = 1; i < frame_count; i++) {
        bool update_stats = (i % stats_update_period == 0);
        video_in >> frame;

        akaze_res = akaze_tracker.process(frame, stats);
        akaze_stats += stats;
        if(update_stats) {
            akaze_draw_stats = stats;
        }

        // Keep ORB's keypoint budget in sync with AKAZE's latest count
        orb->setMaxFeatures(stats.keypoints);
        orb_res = orb_tracker.process(frame, stats);
        orb_stats += stats;
        if(update_stats) {
            orb_draw_stats = stats;
        }

        drawStatistics(akaze_res, akaze_draw_stats);
        drawStatistics(orb_res, orb_draw_stats);
        vconcat(akaze_res, orb_res, res_frame);
        video_out << res_frame;
        cout << i << "/" << frame_count - 1 << endl;
    }
    akaze_stats /= frame_count - 1;
    orb_stats /= frame_count - 1;
    printStatistics("AKAZE", akaze_stats);
    printStatistics("ORB", orb_stats);
    return 0;
}
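
The listing depends on two small headers, stats.h and utils.h, which ship alongside the sample. Their exact contents are not reproduced in this tutorial; based on how the listing uses them, a minimal sketch of the pieces the program needs could look like the following (the field order, the drawing colour, and the line thickness are assumptions). drawStatistics and printStatistics, which simply render and print the four Stats fields, are omitted:

#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <vector>

using namespace std;
using namespace cv;

// stats.h (sketch): the per-frame accumulator that the trackers fill in
struct Stats
{
    int matches;
    int inliers;
    double ratio;
    int keypoints;

    Stats() : matches(0), inliers(0), ratio(0), keypoints(0) {}

    Stats& operator+=(const Stats& op) {
        matches += op.matches;
        inliers += op.inliers;
        ratio += op.ratio;
        keypoints += op.keypoints;
        return *this;
    }
    Stats& operator/=(int num) {
        matches /= num;
        inliers /= num;
        ratio /= num;
        keypoints /= num;
        return *this;
    }
};

// utils.h (sketch): convert keypoints to points for findHomography
vector<Point2f> Points(const vector<KeyPoint>& keypoints)
{
    vector<Point2f> res;
    for(unsigned i = 0; i < keypoints.size(); i++) {
        res.push_back(keypoints[i].pt);
    }
    return res;
}

// utils.h (sketch): draw the (possibly non-rectangular) bounding box
void drawBoundingBox(Mat image, const vector<Point2f>& bb)
{
    for(unsigned i = 0; i < bb.size() - 1; i++) {
        line(image, bb[i], bb[i + 1], Scalar(0, 0, 255), 2);
    }
    line(image, bb[bb.size() - 1], bb[0], Scalar(0, 0, 255), 2);
}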

Explanation

Tracker class

This class implements the algorithm described above using the given feature detector and descriptor matcher.
  • Setting up the first frame

    void Tracker::setFirstFrame(const Mat frame, vector<Point2f> bb, string title, Stats& stats)
    {
        first_frame = frame.clone();
        detector->detectAndCompute(first_frame, noArray(), first_kp, first_desc);
        stats.keypoints = (int)first_kp.size();
        drawBoundingBox(first_frame, bb);
        putText(first_frame, title, Point(0, 60), FONT_HERSHEY_PLAIN, 5, Scalar::all(0), 4);
        object_bb = bb;
    }
    

    We compute and store keypoints and descriptors from the first frame and prepare it for output.

    We need to save the number of detected keypoints to make sure both detectors locate roughly the same number of them.

  • Processing frames

    1. Locate keypoints and compute descriptors
    detector->detectAndCompute(frame, noArray(), kp, desc);
    

    To find matches between frames we have to locate the keypoints first.

    In this tutorial the detectors are set up to find about 1000 keypoints on each frame; a sketch of one way to tune such a keypoint budget appears after this list.

    2. Use the 2-nn matcher to find correspondences
    matcher->knnMatch(first_desc, desc, matches, 2);
    for(unsigned i = 0; i < matches.size(); i++) {
        if(matches[i][0].distance < nn_match_ratio * matches[i][1].distance) {
            matched1.push_back(first_kp[matches[i][0].queryIdx]);
            matched2.push_back(      kp[matches[i][0].trainIdx]);
        }
    }
    

    A pair is kept as a match only if the distance to the closest neighbour is less than nn_match_ratio times the distance to the second-closest one; this ratio test discards ambiguous correspondences.

    3. Use RANSAC to estimate the homography transformation
    homography = findHomography(Points(matched1), Points(matched2),
                                RANSAC, ransac_thresh, inlier_mask);
    

    If there are at least 4 matches we can use random sample consensus (RANSAC) to estimate the image transformation; 4 is the minimum, because a homography has 8 degrees of freedom and each point correspondence contributes 2 constraints.

    4. Save the inliers
    for(unsigned i = 0; i < matched1.size(); i++) {
        if(inlier_mask.at<uchar>(i)) {
            int new_i = static_cast<int>(inliers1.size());
            inliers1.push_back(matched1[i]);
            inliers2.push_back(matched2[i]);
            inlier_matches.push_back(DMatch(new_i, new_i, 0));
        }
    }
    

    Since findHomography computes the inliers, we only have to save the chosen points and matches.

    5. Project the object bounding box
    perspectiveTransform(object_bb, new_bb, homography);
    

    If there is a reasonable number of inliers we can use the estimated transformation to locate the object.
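
The tutorial fixes akaze_thresh so that AKAZE yields roughly 1000 keypoints on this particular video, and then passes AKAZE's actual count to ORB via setMaxFeatures. As promised above, here is a minimal sketch of how such a keypoint budget could be tuned automatically for a different video; the helper name, the iteration cap, and the 10% tolerance are our own choices, not part of the tutorial code:

    #include <opencv2/features2d.hpp>
    #include <cstdlib>
    #include <vector>

    using namespace std;
    using namespace cv;

    // Hypothetical helper: rescale the AKAZE response threshold until the
    // detector yields roughly `target` keypoints on a sample frame.
    void tuneAkazeThreshold(Ptr<AKAZE> akaze, const Mat& sample, int target)
    {
        vector<KeyPoint> kp;
        for(int iter = 0; iter < 10; iter++) {
            akaze->detect(sample, kp);
            if(kp.empty()) { // threshold far too high: back off and retry
                akaze->setThreshold(akaze->getThreshold() / 2);
                continue;
            }
            if(abs((int)kp.size() - target) < target / 10)
                break; // within 10% of the target, good enough
            // Too many keypoints -> raise the threshold proportionally,
            // too few -> lower it
            akaze->setThreshold(akaze->getThreshold() * (double)kp.size() / target);
        }
    }

Calling such a helper on the first frame, before the trackers are constructed, would replace the hard-coded akaze_thresh constant.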

Results

You can watch the resulting video on YouTube.

AKAZE statistics:

Matches      626
Inliers      410
Inlier ratio 0.58
Keypoints    1117

ORB statistics:

Matches      504
Inliers      319
Inlier ratio 0.56
Keypoints    1112