OpenCV  3.4.0
Open Source Computer Vision
YOLO DNNs

Introduction

In this text you will learn how to use opencv_dnn module using yolo_object_detection (Sample of using OpenCV dnn module in real time with device capture, video and image).

We will demonstrate results of this example on the following picture.

yolo.jpg
Picture example

Examples

VIDEO DEMO:

Source Code

The latest version of sample source code can be downloaded here.

// Brief Sample of using OpenCV dnn module in real time with device capture, video and image.
// VIDEO DEMO: https://www.youtube.com/watch?v=NHtRlndE2cg
#include <opencv2/dnn.hpp>
#include <fstream>
#include <iostream>
#include <algorithm>
#include <cstdlib>
using namespace std;
using namespace cv;
using namespace cv::dnn;
static const char* about =
"This sample uses You only look once (YOLO)-Detector (https://arxiv.org/abs/1612.08242) to detect objects on camera/video/image.\n"
"Models can be downloaded here: https://pjreddie.com/darknet/yolo/\n"
"Default network is 416x416.\n"
"Class names can be downloaded here: https://github.com/pjreddie/darknet/tree/master/data\n";
static const char* params =
"{ help | false | print usage }"
"{ cfg | | model configuration }"
"{ model | | model weights }"
"{ camera_device | 0 | camera device number}"
"{ source | | video or image for detection}"
"{ min_confidence | 0.24 | min confidence }"
"{ class_names | | File with class names, [PATH-TO-DARKNET]/data/coco.names }";
int main(int argc, char** argv)
{
CommandLineParser parser(argc, argv, params);
if (parser.get<bool>("help"))
{
cout << about << endl;
parser.printMessage();
return 0;
}
String modelConfiguration = parser.get<String>("cfg");
String modelBinary = parser.get<String>("model");
dnn::Net net = readNetFromDarknet(modelConfiguration, modelBinary);
if (net.empty())
{
cerr << "Can't load network by using the following files: " << endl;
cerr << "cfg-file: " << modelConfiguration << endl;
cerr << "weights-file: " << modelBinary << endl;
cerr << "Models can be downloaded here:" << endl;
cerr << "https://pjreddie.com/darknet/yolo/" << endl;
exit(-1);
}
if (parser.get<String>("source").empty())
{
int cameraDevice = parser.get<int>("camera_device");
cap = VideoCapture(cameraDevice);
if(!cap.isOpened())
{
cout << "Couldn't find camera: " << cameraDevice << endl;
return -1;
}
}
else
{
cap.open(parser.get<String>("source"));
if(!cap.isOpened())
{
cout << "Couldn't open image or video: " << parser.get<String>("video") << endl;
return -1;
}
}
vector<string> classNamesVec;
ifstream classNamesFile(parser.get<String>("class_names").c_str());
if (classNamesFile.is_open())
{
string className = "";
while (std::getline(classNamesFile, className))
classNamesVec.push_back(className);
}
for(;;)
{
Mat frame;
cap >> frame; // get a new frame from camera/video or read image
if (frame.empty())
{
break;
}
if (frame.channels() == 4)
cvtColor(frame, frame, COLOR_BGRA2BGR);
Mat inputBlob = blobFromImage(frame, 1 / 255.F, Size(416, 416), Scalar(), true, false); //Convert Mat to batch of images
net.setInput(inputBlob, "data"); //set the network input
Mat detectionMat = net.forward("detection_out"); //compute output
vector<double> layersTimings;
double freq = getTickFrequency() / 1000;
double time = net.getPerfProfile(layersTimings) / freq;
ostringstream ss;
ss << "FPS: " << 1000/time << " ; time: " << time << " ms";
putText(frame, ss.str(), Point(20,20), 0, 0.5, Scalar(0,0,255));
float confidenceThreshold = parser.get<float>("min_confidence");
for (int i = 0; i < detectionMat.rows; i++)
{
const int probability_index = 5;
const int probability_size = detectionMat.cols - probability_index;
float *prob_array_ptr = &detectionMat.at<float>(i, probability_index);
size_t objectClass = max_element(prob_array_ptr, prob_array_ptr + probability_size) - prob_array_ptr;
float confidence = detectionMat.at<float>(i, (int)objectClass + probability_index);
if (confidence > confidenceThreshold)
{
float x = detectionMat.at<float>(i, 0);
float y = detectionMat.at<float>(i, 1);
float width = detectionMat.at<float>(i, 2);
float height = detectionMat.at<float>(i, 3);
int xLeftBottom = static_cast<int>((x - width / 2) * frame.cols);
int yLeftBottom = static_cast<int>((y - height / 2) * frame.rows);
int xRightTop = static_cast<int>((x + width / 2) * frame.cols);
int yRightTop = static_cast<int>((y + height / 2) * frame.rows);
Rect object(xLeftBottom, yLeftBottom,
xRightTop - xLeftBottom,
yRightTop - yLeftBottom);
rectangle(frame, object, Scalar(0, 255, 0));
if (objectClass < classNamesVec.size())
{
ss.str("");
ss << confidence;
String conf(ss.str());
String label = String(classNamesVec[objectClass]) + ": " + conf;
int baseLine = 0;
Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
rectangle(frame, Rect(Point(xLeftBottom, yLeftBottom ),
Size(labelSize.width, labelSize.height + baseLine)),
Scalar(255, 255, 255), CV_FILLED);
putText(frame, label, Point(xLeftBottom, yLeftBottom+labelSize.height),
}
else
{
cout << "Class: " << objectClass << endl;
cout << "Confidence: " << confidence << endl;
cout << " " << xLeftBottom
<< " " << yLeftBottom
<< " " << xRightTop
<< " " << yRightTop << endl;
}
}
}
imshow("YOLO: Detections", frame);
if (waitKey(1) >= 0) break;
}
return 0;
} // main

How to compile in command line with pkg-config

# g++ `pkg-config --cflags opencv` `pkg-config --libs opencv` yolo_object_detection.cpp -o yolo_object_detection

Execute in webcam:

$ yolo_object_detection -camera_device=0 -cfg=[PATH-TO-DARKNET]/cfg/yolo.cfg -model=[PATH-TO-DARKNET]/yolo.weights -class_names=[PATH-TO-DARKNET]/data/coco.names

Execute with image:

$ yolo_object_detection -source=[PATH-IMAGE] -cfg=[PATH-TO-DARKNET]/cfg/yolo.cfg -model=[PATH-TO-DARKNET]/yolo.weights -class_names=[PATH-TO-DARKNET]/data/coco.names

Execute in video file:

$ yolo_object_detection -source=[PATH-TO-VIDEO] -cfg=[PATH-TO-DARKNET]/cfg/yolo.cfg -model=[PATH-TO-DARKNET]/yolo.weights -class_names=[PATH-TO-DARKNET]/data/coco.names

Questions and suggestions email to: Alessandro de Oliveira Faria cabel.nosp@m.o@op.nosp@m.ensus.nosp@m.e.or.nosp@m.g or OpenCV Team.