OpenCV: DNN-based Face Detection And Recognition

Prev Tutorial: High Level API: TextDetectionModel and TextRecognitionModel

Next Tutorial: Conversion of PyTorch Classification Models and Launch with OpenCV Python


Original Author	Chengrui Wang, Yuantao Feng
Compatibility	OpenCV >= 4.5.4

Introduction

In this section, we introduce the cv::FaceDetectorYN class for face detection and the cv::FaceRecognizerSF class for face recognition.

Models

Two pre-trained models (in ONNX format) are required for this module:

Face Detection:
- Size: 338KB
- Results on WIDER Face Val set: 0.830(easy), 0.824(medium), 0.708(hard)
Face Recognition:
- Size: 36.9MB
- Results:

Database	Accuracy	Threshold (normL2)	Threshold (cosine)
LFW	99.60%	1.128	0.363
CALFW	93.95%	1.149	0.340
CPLFW	91.05%	1.204	0.275
AgeDB-30	94.90%	1.202	0.277
CFP-FP	94.80%	1.253	0.212

Code

C++

Downloadable code: Click here
Code at a glance:
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/objdetect.hpp>
#include <iostream>
using namespace cv;
using namespace std;
/**
 * @brief Print detection results to stdout and draw them onto an image.
 *
 * Every row of @p faces is read as float values: columns 0-3 are the bounding
 * box (x, y, width, height), columns 4-13 are five landmark points stored as
 * consecutive x/y pairs, and column 14 is the detection score.
 *
 * @param input     Image that is drawn on in place.
 * @param frame     Frame index printed in front of the FPS value; a negative
 *                  value suppresses the frame prefix.
 * @param faces     Detection matrix with one face per row (columns 0..14).
 * @param fps       Frames-per-second value to report.
 * @param thickness Line thickness used for the box and the landmark circles.
 */
static
void visualize(Mat& input, int frame, Mat& faces, double fps, int thickness = 2)
{
    // Formatted once and reused for both the console line and the image overlay.
    const std::string fpsString = cv::format("FPS : %.2f", static_cast<float>(fps));
    if (frame >= 0)
        cout << "Frame " << frame << ", ";
    // fpsString already carries its own "FPS" label, so it is printed as-is
    // (prefixing it again produced "FPS: FPS : ...").
    cout << fpsString << endl;

    // One colour per landmark, in column order.
    const Scalar landmarkColors[5] = {
        Scalar(255, 0, 0),
        Scalar(0, 0, 255),
        Scalar(0, 255, 0),
        Scalar(255, 0, 255),
        Scalar(0, 255, 255)
    };

    for (int i = 0; i < faces.rows; i++)
    {
        // Shorthand for reading one float column of the current face.
        auto value = [&faces, i](int col) { return faces.at<float>(i, col); };
        auto pixel = [&value](int col) { return static_cast<int>(value(col)); };

        // Print results
        cout << "Face " << i
             << ", top-left coordinates: (" << value(0) << ", " << value(1) << "), "
             << "box width: " << value(2) << ", box height: " << value(3) << ", "
             << "score: " << cv::format("%.2f", value(14))
             << endl;

        // Draw bounding box
        rectangle(input, Rect2i(pixel(0), pixel(1), pixel(2), pixel(3)), Scalar(0, 255, 0), thickness);

        // Draw landmarks: point k lives in columns (4 + 2k, 5 + 2k)
        for (int k = 0; k < 5; k++)
        {
            circle(input, Point2i(pixel(4 + 2 * k), pixel(5 + 2 * k)), 2, landmarkColors[k], thickness);
        }
    }
    putText(input, fpsString, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0), 2);
}
/**
 * @brief Sample entry point: face detection on images or video, plus optional
 *        face recognition between two images.
 *
 * Modes:
 *  - `image1` given: detect faces on image1, show (and optionally save) the
 *    result. If `image2` is given as well, detect on it too and compare the
 *    first detected face of each image with FaceRecognizerSF.
 *  - otherwise: open the `video` source and run detection frame by frame
 *    until a key other than SPACE is pressed or no frame can be grabbed.
 *
 * @return 0 on success or after printing help; 2 when an input image cannot be
 *         read; 1 for every other failure (bad arguments, no face found,
 *         video source cannot be opened or reports an unusable frame size).
 */
int main(int argc, char** argv)
{
    CommandLineParser parser(argc, argv,
        "{help h | | Print this message}"
        "{image1 i1 | | Path to the input image1. Omit for detecting through VideoCapture}"
        "{image2 i2 | | Path to the input image2. When image1 and image2 parameters given then the program try to find a face on both images and runs face recognition algorithm}"
        "{video v | 0 | Path to the input video}"
        "{scale sc | 1.0 | Scale factor used to resize input video frames}"
        "{fd_model fd | face_detection_yunet_2021dec.onnx| Path to the model. Download yunet.onnx in https://github.com/opencv/opencv_zoo/tree/master/models/face_detection_yunet}"
        "{fr_model fr | face_recognition_sface_2021dec.onnx | Path to the face recognition model. Download the model at https://github.com/opencv/opencv_zoo/tree/master/models/face_recognition_sface}"
        "{score_threshold | 0.9 | Filter out faces of score < score_threshold}"
        "{nms_threshold | 0.3 | Suppress bounding boxes of iou >= nms_threshold}"
        "{top_k | 5000 | Keep top_k bounding boxes before NMS}"
        "{save s | false | Set true to save results. This flag is invalid when using camera}"
    );
    if (parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }

    String fd_modelPath = parser.get<String>("fd_model");
    String fr_modelPath = parser.get<String>("fr_model");
    float scoreThreshold = parser.get<float>("score_threshold");
    float nmsThreshold = parser.get<float>("nms_threshold");
    int topK = parser.get<int>("top_k");
    bool save = parser.get<bool>("save");
    float scale = parser.get<float>("scale");

    // Surface argument conversion errors instead of continuing with values
    // the parser could not interpret.
    if (!parser.check())
    {
        parser.printErrors();
        return 1;
    }
    // Written as a negated comparison so that NaN is rejected as well.
    if (!(scale > 0.f))
    {
        std::cerr << "Scale factor must be positive, got: " << scale << std::endl;
        return 1;
    }

    // Decision thresholds used when comparing two faces below.
    double cosine_similar_thresh = 0.363;
    double l2norm_similar_thresh = 1.128;

    // Initialize FaceDetectorYN
    Ptr<FaceDetectorYN> detector = FaceDetectorYN::create(fd_modelPath, "", Size(320, 320), scoreThreshold, nmsThreshold, topK);
    TickMeter tm;

    // If input is an image
    if (parser.has("image1"))
    {
        String input1 = parser.get<String>("image1");
        Mat image1 = imread(samples::findFile(input1));
        if (image1.empty())
        {
            std::cerr << "Cannot read image: " << input1 << std::endl;
            return 2;
        }

        const int imageWidth = static_cast<int>(image1.cols * scale);
        const int imageHeight = static_cast<int>(image1.rows * scale);
        // A very small scale can truncate a dimension to zero, which resize() rejects.
        if (imageWidth < 1 || imageHeight < 1)
        {
            std::cerr << "Scale factor " << scale << " is too small for image: " << input1 << std::endl;
            return 1;
        }
        resize(image1, image1, Size(imageWidth, imageHeight));

        tm.start();
        // Set input size before inference
        detector->setInputSize(image1.size());
        Mat faces1;
        detector->detect(image1, faces1);
        if (faces1.rows < 1)
        {
            std::cerr << "Cannot find a face in " << input1 << std::endl;
            return 1;
        }
        tm.stop();

        // Draw results on the input image
        visualize(image1, -1, faces1, tm.getFPS());

        // Save results if save is true
        if (save)
        {
            cout << "Saving result.jpg...\n";
            imwrite("result.jpg", image1);
        }

        // Visualize results
        imshow("image1", image1);
        pollKey(); // handle UI events to show content

        if (parser.has("image2"))
        {
            String input2 = parser.get<String>("image2");
            Mat image2 = imread(samples::findFile(input2));
            if (image2.empty())
            {
                std::cerr << "Cannot read image2: " << input2 << std::endl;
                return 2;
            }

            // Time the second detection independently of the first one.
            tm.reset();
            tm.start();
            detector->setInputSize(image2.size());
            Mat faces2;
            detector->detect(image2, faces2);
            if (faces2.rows < 1)
            {
                std::cerr << "Cannot find a face in " << input2 << std::endl;
                return 1;
            }
            tm.stop();

            visualize(image2, -1, faces2, tm.getFPS());
            if (save)
            {
                cout << "Saving result2.jpg...\n";
                imwrite("result2.jpg", image2);
            }
            imshow("image2", image2);
            pollKey();

            // Initialize FaceRecognizerSF
            Ptr<FaceRecognizerSF> faceRecognizer = FaceRecognizerSF::create(fr_modelPath, "");

            // Aligning and cropping facial image through the first face of faces detected.
            Mat aligned_face1, aligned_face2;
            faceRecognizer->alignCrop(image1, faces1.row(0), aligned_face1);
            faceRecognizer->alignCrop(image2, faces2.row(0), aligned_face2);

            // Run feature extraction with given aligned_face.
            // NOTE(review): the clone() calls presumably detach each feature from a
            // buffer the recognizer reuses between calls - confirm before removing.
            Mat feature1, feature2;
            faceRecognizer->feature(aligned_face1, feature1);
            feature1 = feature1.clone();
            faceRecognizer->feature(aligned_face2, feature2);
            feature2 = feature2.clone();

            double cos_score = faceRecognizer->match(feature1, feature2, FaceRecognizerSF::DisType::FR_COSINE);
            double L2_score = faceRecognizer->match(feature1, feature2, FaceRecognizerSF::DisType::FR_NORM_L2);

            // Cosine: a score at or above the threshold counts as a match.
            if (cos_score >= cosine_similar_thresh)
            {
                std::cout << "They have the same identity;";
            }
            else
            {
                std::cout << "They have different identities;";
            }
            std::cout << " Cosine Similarity: " << cos_score << ", threshold: " << cosine_similar_thresh << ". (higher value means higher similarity, max 1.0)\n";

            // NormL2: a distance at or below the threshold counts as a match.
            if (L2_score <= l2norm_similar_thresh)
            {
                std::cout << "They have the same identity;";
            }
            else
            {
                std::cout << "They have different identities.";
            }
            std::cout << " NormL2 Distance: " << L2_score << ", threshold: " << l2norm_similar_thresh << ". (lower value means higher similarity, min 0.0)\n";
        }

        cout << "Press any key to exit..." << endl;
        waitKey(0);
    }
    else
    {
        VideoCapture capture;
        std::string video = parser.get<string>("video");
        // A single digit selects a camera by index; anything else is a path or pipeline.
        // The cast keeps isdigit() defined for characters with negative char values.
        if (video.size() == 1 && isdigit(static_cast<unsigned char>(video[0])))
            capture.open(parser.get<int>("video"));
        else
            capture.open(samples::findFileOrKeep(video)); // keep GStreamer pipelines

        if (!capture.isOpened())
        {
            cout << "Could not initialize video capturing: " << video << "\n";
            return 1;
        }

        const int frameWidth = static_cast<int>(capture.get(CAP_PROP_FRAME_WIDTH) * scale);
        const int frameHeight = static_cast<int>(capture.get(CAP_PROP_FRAME_HEIGHT) * scale);
        cout << "Video " << video
             << ": width=" << frameWidth
             << ", height=" << frameHeight
             << endl;
        // Every frame is resized to this size below, so it has to be non-empty.
        if (frameWidth < 1 || frameHeight < 1)
        {
            cout << "Invalid frame size for video: " << video << "\n";
            return 1;
        }

        detector->setInputSize(Size(frameWidth, frameHeight));
        cout << "Press 'SPACE' to save frame, any other key to exit..." << endl;

        int nFrame = 0;
        for (;;)
        {
            // Get frame
            Mat frame;
            if (!capture.read(frame))
            {
                cerr << "Can't grab frame! Stop\n";
                break;
            }
            resize(frame, frame, Size(frameWidth, frameHeight));

            // Inference
            Mat faces;
            tm.start();
            detector->detect(frame, faces);
            tm.stop();

            // Draw on a copy so the untouched frame can still be saved.
            Mat result = frame.clone();
            // Draw results on the input image
            visualize(result, nFrame, faces, tm.getFPS());

            // Visualize results
            imshow("Live", result);

            int key = waitKey(1);
            bool saveFrame = save;
            if (key == ' ')
            {
                saveFrame = true;
                key = 0; // handled
            }
            if (saveFrame)
            {
                std::string frame_name = cv::format("frame_%05d.png", nFrame);
                std::string result_name = cv::format("result_%05d.jpg", nFrame);
                cout << "Saving '" << frame_name << "' and '" << result_name << "' ...\n";
                imwrite(frame_name, frame);
                imwrite(result_name, result);
            }
            ++nFrame;

            // Any key other than SPACE ends the loop.
            if (key > 0)
                break;
        }
        cout << "Processed " << nFrame << " frames" << endl;
    }
    cout << "Done." << endl;
    return 0;
}

Python

Downloadable code: Click here
Code at a glance:
import argparse
import numpy as np
import cv2 as cv
def str2bool(v):
    """Convert a textual on/off flag to a bool, for use as an argparse ``type``.

    The comparison is case-insensitive.

    Args:
        v: String to interpret.

    Returns:
        True for 'on', 'yes', 'true', 'y', 't';
        False for 'off', 'no', 'false', 'n', 'f'.

    Raises:
        argparse.ArgumentTypeError: If ``v`` is none of the accepted spellings.
            argparse turns this into a regular usage error instead of a traceback.
    """
    lowered = v.lower()
    if lowered in ('on', 'yes', 'true', 'y', 't'):
        return True
    if lowered in ('off', 'no', 'false', 'n', 'f'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got {!r}'.format(v))
# Command-line interface of the sample. Parsing happens at module level, so
# `args` is available to everything defined below.
parser = argparse.ArgumentParser()

# Input selection: one or two still images, or a video source.
parser.add_argument(
    '--image1', '-i1',
    type=str,
    help='Path to the input image1. Omit for detecting on default camera.',
)
parser.add_argument(
    '--image2', '-i2',
    type=str,
    help='Path to the input image2. When image1 and image2 parameters given then the program try to find a face on both images and runs face recognition algorithm.',
)
parser.add_argument(
    '--video', '-v',
    type=str,
    help='Path to the input video.',
)
parser.add_argument(
    '--scale', '-sc',
    type=float,
    default=1.0,
    help='Scale factor used to resize input video frames.',
)

# Model files.
parser.add_argument(
    '--face_detection_model', '-fd',
    type=str,
    default='face_detection_yunet_2021dec.onnx',
    help='Path to the face detection model. Download the model at https://github.com/opencv/opencv_zoo/tree/master/models/face_detection_yunet',
)
parser.add_argument(
    '--face_recognition_model', '-fr',
    type=str,
    default='face_recognition_sface_2021dec.onnx',
    help='Path to the face recognition model. Download the model at https://github.com/opencv/opencv_zoo/tree/master/models/face_recognition_sface',
)

# Detector tuning.
parser.add_argument(
    '--score_threshold',
    type=float,
    default=0.9,
    help='Filtering out faces of score < score_threshold.',
)
parser.add_argument(
    '--nms_threshold',
    type=float,
    default=0.3,
    help='Suppress bounding boxes of iou >= nms_threshold.',
)
parser.add_argument(
    '--top_k',
    type=int,
    default=5000,
    help='Keep top_k bounding boxes before NMS.',
)

# Output.
parser.add_argument(
    '--save', '-s',
    type=str2bool,
    default=False,
    help='Set true to save results. This flag is invalid when using camera.',
)

args = parser.parse_args()
def visualize(input, faces, fps, thickness=2):
    """Print detected faces and draw them, plus an FPS label, onto ``input``.

    Args:
        input: Image that is drawn on in place.
        faces: Result of ``detector.detect``; element 1 is either None or an
            iterable of per-face rows (box x, y, w, h, five landmark x/y
            pairs, and the score as the last value).
        fps: Frames-per-second value rendered in the top-left corner.
        thickness: Line thickness for the box and landmark circles.
    """
    detections = faces[1]
    if detections is not None:
        # One colour per landmark, in the order the points are stored.
        landmark_colors = (
            (255, 0, 0),
            (0, 0, 255),
            (0, 255, 0),
            (255, 0, 255),
            (0, 255, 255),
        )
        for idx, face in enumerate(detections):
            print('Face {}, top-left coordinates: ({:.0f}, {:.0f}), box width: {:.0f}, box height {:.0f}, score: {:.2f}'.format(idx, face[0], face[1], face[2], face[3], face[-1]))

            # Everything except the trailing score is a pixel coordinate.
            coords = face[:-1].astype(np.int32)
            left, top = coords[0], coords[1]
            cv.rectangle(input, (left, top), (left + coords[2], top + coords[3]), (0, 255, 0), thickness)

            # Landmark k occupies positions (4 + 2k, 5 + 2k).
            for k, color in enumerate(landmark_colors):
                cv.circle(input, (coords[4 + 2 * k], coords[5 + 2 * k]), 2, color, thickness)

    # The FPS label is drawn whether or not any face was found.
    cv.putText(input, 'FPS: {:.2f}'.format(fps), (1, 16), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
if __name__ == '__main__':
    # Script body: detect faces on one or two images (and compare them), or
    # run detection live on a video source when no image is given.

    # The (320, 320) input size is only a placeholder; it is overridden with
    # the real image/frame size before every detection below.
    detector = cv.FaceDetectorYN.create(
        args.face_detection_model,
        "",
        (320, 320),
        args.score_threshold,
        args.nms_threshold,
        args.top_k
    )

    tm = cv.TickMeter()

    # If input is an image
    if args.image1 is not None:
        img1 = cv.imread(cv.samples.findFile(args.image1))
        # imread signals failure by returning None instead of raising.
        if img1 is None:
            raise SystemExit('Cannot read image: {}'.format(args.image1))
        img1Width = int(img1.shape[1]*args.scale)
        img1Height = int(img1.shape[0]*args.scale)
        img1 = cv.resize(img1, (img1Width, img1Height))

        tm.start()
        # Set input size before inference
        detector.setInputSize((img1Width, img1Height))
        faces1 = detector.detect(img1)
        tm.stop()
        assert faces1[1] is not None, 'Cannot find a face in {}'.format(args.image1)

        # Draw results on the input image
        visualize(img1, faces1, tm.getFPS())

        # Save results if save is true
        if args.save:
            print('Results saved to result.jpg\n')
            cv.imwrite('result.jpg', img1)

        # Visualize results in a new window
        cv.imshow("image1", img1)

        if args.image2 is not None:
            img2 = cv.imread(cv.samples.findFile(args.image2))
            if img2 is None:
                raise SystemExit('Cannot read image2: {}'.format(args.image2))

            # Time the second detection independently of the first one.
            tm.reset()
            tm.start()
            detector.setInputSize((img2.shape[1], img2.shape[0]))
            faces2 = detector.detect(img2)
            tm.stop()
            assert faces2[1] is not None, 'Cannot find a face in {}'.format(args.image2)

            visualize(img2, faces2, tm.getFPS())
            cv.imshow("image2", img2)

            recognizer = cv.FaceRecognizerSF.create(
                args.face_recognition_model, "")

            # Align faces, using the first detection of each image
            face1_align = recognizer.alignCrop(img1, faces1[1][0])
            face2_align = recognizer.alignCrop(img2, faces2[1][0])

            # Extract features
            face1_feature = recognizer.feature(face1_align)
            face2_feature = recognizer.feature(face2_align)

            cosine_similarity_threshold = 0.363
            l2_similarity_threshold = 1.128

            cosine_score = recognizer.match(face1_feature, face2_feature, cv.FaceRecognizerSF_FR_COSINE)
            l2_score = recognizer.match(face1_feature, face2_feature, cv.FaceRecognizerSF_FR_NORM_L2)

            # Cosine: a score at or above the threshold counts as a match.
            msg = 'different identities'
            if cosine_score >= cosine_similarity_threshold:
                msg = 'the same identity'
            print('They have {}. Cosine Similarity: {}, threshold: {} (higher value means higher similarity, max 1.0).'.format(msg, cosine_score, cosine_similarity_threshold))

            # NormL2: a distance at or below the threshold counts as a match.
            msg = 'different identities'
            if l2_score <= l2_similarity_threshold:
                msg = 'the same identity'
            print('They have {}. NormL2 Distance: {}, threshold: {} (lower value means higher similarity, min 0.0).'.format(msg, l2_score, l2_similarity_threshold))

        cv.waitKey(0)
    else:  # Omit input to call default camera
        if args.video is None:
            deviceId = 0
        elif args.video.isdigit():
            # A purely numeric value selects a camera by index; passing it on
            # as a string would make VideoCapture look for a file of that name.
            deviceId = int(args.video)
        else:
            deviceId = args.video

        cap = cv.VideoCapture(deviceId)
        if not cap.isOpened():
            raise SystemExit('Could not initialize video capturing: {}'.format(deviceId))

        frameWidth = int(cap.get(cv.CAP_PROP_FRAME_WIDTH)*args.scale)
        frameHeight = int(cap.get(cv.CAP_PROP_FRAME_HEIGHT)*args.scale)
        detector.setInputSize([frameWidth, frameHeight])

        # Any key press ends the loop.
        while cv.waitKey(1) < 0:
            hasFrame, frame = cap.read()
            if not hasFrame:
                print('No frames grabbed!')
                break

            frame = cv.resize(frame, (frameWidth, frameHeight))

            # Inference
            tm.start()
            faces = detector.detect(frame)  # faces is a tuple
            tm.stop()

            # Draw results on the input image
            visualize(frame, faces, tm.getFPS())

            # Visualize results
            cv.imshow('Live', frame)
    cv.destroyAllWindows()

Explanation

C++

// Initialize FaceDetectorYN

Ptr<FaceDetectorYN> detector = FaceDetectorYN::create(fd_modelPath, "", Size(320, 320), scoreThreshold, nmsThreshold, topK);

        // Set input size before inference
        detector->setInputSize(image1.size());
        Mat faces1;
        detector->detect(image1, faces1);
        if (faces1.rows < 1)
        {
            std::cerr << "Cannot find a face in " << input1 << std::endl;
            return 1;
        }

Python

    detector = cv.FaceDetectorYN.create(
        args.face_detection_model,
        "",
        (320, 320),
        args.score_threshold,
        args.nms_threshold,
        args.top_k
    )

        # Set input size before inference
        detector.setInputSize((img1Width, img1Height))
        faces1 = detector.detect(img1)

The detection output faces is a two-dimensional array of type CV_32F. Each row is one detected face instance; the columns hold the location of the face, 5 facial landmarks and the detection score. The format of each row is as follows:

x1, y1, w, h, x_re, y_re, x_le, y_le, x_nt, y_nt, x_rcm, y_rcm, x_lcm, y_lcm, score

where x1, y1, w, h are the top-left coordinates, width and height of the face bounding box, {x, y}_{re, le, nt, rcm, lcm} stand for the coordinates of the right eye, left eye, nose tip, right corner and left corner of the mouth respectively, and score is the detection confidence (read from the last column by the samples above).

Face Recognition

Following Face Detection, run the code below to extract face features from the facial images.

C++

// Initialize FaceRecognizerSF

Ptr<FaceRecognizerSF> faceRecognizer = FaceRecognizerSF::create(fr_modelPath, "");

            // Aligning and cropping facial image through the first face of faces detected.
            Mat aligned_face1, aligned_face2;
            faceRecognizer->alignCrop(image1, faces1.row(0), aligned_face1);
            faceRecognizer->alignCrop(image2, faces2.row(0), aligned_face2);
            // Run feature extraction with given aligned_face
            Mat feature1, feature2;
            faceRecognizer->feature(aligned_face1, feature1);
            feature1 = feature1.clone();
            faceRecognizer->feature(aligned_face2, feature2);
            feature2 = feature2.clone();

Python

recognizer = cv.FaceRecognizerSF.create(

args.face_recognition_model,"")

            # Align faces
            face1_align = recognizer.alignCrop(img1, faces1[1][0])
            face2_align = recognizer.alignCrop(img2, faces2[1][0])
            # Extract features
            face1_feature = recognizer.feature(face1_align)
            face2_feature = recognizer.feature(face2_align)

After obtaining the face features feature1 and feature2 of the two facial images, run the code below to calculate the identity discrepancy between the two faces.

C++

double cos_score = faceRecognizer->match(feature1, feature2, FaceRecognizerSF::DisType::FR_COSINE);

double L2_score = faceRecognizer->match(feature1, feature2, FaceRecognizerSF::DisType::FR_NORM_L2);

Python

cosine_score = recognizer.match(face1_feature, face2_feature, cv.FaceRecognizerSF_FR_COSINE)

l2_score = recognizer.match(face1_feature, face2_feature, cv.FaceRecognizerSF_FR_NORM_L2)

For example, two faces have the same identity if the cosine similarity is greater than or equal to 0.363, or if the normL2 distance is less than or equal to 1.128.

Reference:

https://github.com/ShiqiYu/libfacedetection
https://github.com/ShiqiYu/libfacedetection.train
https://github.com/zhongyy/SFace

Acknowledgement

Thanks to Professor Shiqi Yu and Yuantao Feng for training and providing the face detection model.

Thanks to Professor Deng, PhD Candidate Zhong and Master Candidate Wang for training and providing the face recognition model.