OpenCV  4.9.0-dev
Open Source Computer Vision
No Matches
Text detection model: https://github.com/argman/EAST
Download link: listed in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown
Text recognition models can be downloaded via the links listed in
doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown
How to convert the PyTorch CRNN model to ONNX:
Using classes from here: https://github.com/meijieru/crnn.pytorch/blob/master/models/crnn.py
import torch
from models.crnn import CRNN
model = CRNN(32, 1, 37, 256)
dummy_input = torch.randn(1, 1, 32, 100)
torch.onnx.export(model, dummy_input, "crnn.onnx", verbose=True)
For more information, please refer to doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown and doc/tutorials/dnn/dnn_OCR/dnn_OCR.markdown
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include <opencv2/dnn.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/videoio.hpp>
using namespace cv;
using namespace cv::dnn;
// Command-line option table consumed by cv::CommandLineParser in main().
// Each entry has the form "{ name alias | default value | help text }".
const char* keys =
    "{ help h | | Print help message. }"
    "{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera.}"
    "{ detModel dmp | | Path to a binary .pb file contains trained detector network.}"
    "{ width | 320 | Preprocess input image by resizing to a specific width. It should be multiple by 32. }"
    "{ height | 320 | Preprocess input image by resizing to a specific height. It should be multiple by 32. }"
    "{ thr | 0.5 | Confidence threshold. }"
    "{ nms | 0.4 | Non-maximum suppression threshold. }"
    "{ recModel rmp | | Path to a binary .onnx file contains trained CRNN text recognition model. "
    "Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}"
    "{ RGBInput rgb |0| 0: imread with flags=IMREAD_GRAYSCALE; 1: imread with flags=IMREAD_COLOR. }"
    // The vocabulary file is read line by line in main(); the previous help
    // text ("Path to benchmarks for evaluation.") described a different option.
    "{ vocabularyPath vp | alphabet_36.txt | Path to the vocabulary text file (one entry per line). "
    "Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}";
// Forward declaration; the definition follows main(). Perspective-warps the
// quadrilateral given by four `vertices` of `frame` into `result`.
void fourPointsTransform(const Mat& frame, const Point2f vertices[], Mat& result);
int main(int argc, char** argv)
// Parse command line arguments.
CommandLineParser parser(argc, argv, keys);
parser.about("Use this script to run TensorFlow implementation ( of "
"EAST: An Efficient and Accurate Scene Text Detector (");
if (argc == 1 || parser.has("help"))
return 0;
float confThreshold = parser.get<float>("thr");
float nmsThreshold = parser.get<float>("nms");
int width = parser.get<int>("width");
int height = parser.get<int>("height");
int imreadRGB = parser.get<int>("RGBInput");
String detModelPath = parser.get<String>("detModel");
String recModelPath = parser.get<String>("recModel");
String vocPath = parser.get<String>("vocabularyPath");
if (!parser.check())
return 1;
// Load networks.
CV_Assert(!detModelPath.empty() && !recModelPath.empty());
TextDetectionModel_EAST detector(detModelPath);
TextRecognitionModel recognizer(recModelPath);
// Load vocabulary
std::ifstream vocFile;;
String vocLine;
std::vector<String> vocabulary;
while (std::getline(vocFile, vocLine)) {
// Parameters for Recognition
double recScale = 1.0 / 127.5;
Scalar recMean = Scalar(127.5, 127.5, 127.5);
Size recInputSize = Size(100, 32);
recognizer.setInputParams(recScale, recInputSize, recMean);
// Parameters for Detection
double detScale = 1.0;
Size detInputSize = Size(width, height);
Scalar detMean = Scalar(123.68, 116.78, 103.94);
bool swapRB = true;
detector.setInputParams(detScale, detInputSize, detMean, swapRB);
// Open a video file or an image file or a camera stream.
bool openSuccess = parser.has("input") ?<String>("input")) :;
static const std::string kWinName = "EAST: An Efficient and Accurate Scene Text Detector";
Mat frame;
while (waitKey(1) < 0)
cap >> frame;
if (frame.empty())
std::cout << frame.size << std::endl;
// Detection
std::vector< std::vector<Point> > detResults;
detector.detect(frame, detResults);
Mat frame2 = frame.clone();
if (detResults.size() > 0) {
// Text Recognition
Mat recInput;
if (!imreadRGB) {
cvtColor(frame, recInput, cv::COLOR_BGR2GRAY);
} else {
recInput = frame;
std::vector< std::vector<Point> > contours;
for (uint i = 0; i < detResults.size(); i++)
const auto& quadrangle = detResults[i];
CV_CheckEQ(quadrangle.size(), (size_t)4, "");
std::vector<Point2f> quadrangle_2f;
for (int j = 0; j < 4; j++)
Mat cropped;
fourPointsTransform(recInput, &quadrangle_2f[0], cropped);
std::string recognitionResult = recognizer.recognize(cropped);
std::cout << i << ": '" << recognitionResult << "'" << std::endl;
putText(frame2, recognitionResult, quadrangle[3], FONT_HERSHEY_SIMPLEX, 1.5, Scalar(0, 0, 255), 2);
polylines(frame2, contours, true, Scalar(0, 255, 0), 2);
imshow(kWinName, frame2);
return 0;
void fourPointsTransform(const Mat& frame, const Point2f vertices[], Mat& result)
const Size outputSize = Size(100, 32);
Point2f targetVertices[4] = {
Point(0, outputSize.height - 1),
Point(0, 0), Point(outputSize.width - 1, 0),
Point(outputSize.width - 1, outputSize.height - 1)
Mat rotationMatrix = getPerspectiveTransform(vertices, targetVertices);
warpPerspective(frame, result, rotationMatrix, outputSize);
#define CV_CheckEQ(v1, v2, msg)
Supported values of these types: int, float, double.
Definition check.hpp:118
Designed for command line parsing.
Definition utility.hpp:820
n-dimensional dense array class
Definition mat.hpp:812
CV_NODISCARD_STD Mat clone() const
Creates a full copy of the array and the underlying data.
Template class for specifying the size of an image or rectangle.
Definition types.hpp:335
_Tp height
the height
Definition types.hpp:363
_Tp width
the width
Definition types.hpp:362
Class for video capturing from video files, image sequences or cameras.
Definition videoio.hpp:731
virtual bool open(const String &filename, int apiPreference=CAP_ANY)
Opens a video file or a capturing device or an IP video stream for video capturing.
This class represents high-level API for text detection DL networks compatible with EAST model.
Definition dnn.hpp:1827
This class represents high-level API for text recognition networks.
Definition dnn.hpp:1671
std::string String
Definition cvstd.hpp:151
uint32_t uint
Definition interface.h:42
#define CV_Assert(expr)
Checks a condition at runtime and throws exception if it fails.
Definition base.hpp:342
void imshow(const String &winname, InputArray mat)
Displays an image in the specified window.
int waitKey(int delay=0)
Waits for a pressed key.
void cvtColor(InputArray src, OutputArray dst, int code, int dstCn=0)
Converts an image from one color space to another.
convert between RGB/BGR and grayscale, color conversions
Definition imgproc.hpp:555
void putText(InputOutputArray img, const String &text, Point org, int fontFace, double fontScale, Scalar color, int thickness=1, int lineType=LINE_8, bool bottomLeftOrigin=false)
Draws a text string.
void polylines(InputOutputArray img, InputArrayOfArrays pts, bool isClosed, const Scalar &color, int thickness=1, int lineType=LINE_8, int shift=0)
Draws several polygonal curves.
Mat getPerspectiveTransform(InputArray src, InputArray dst, int solveMethod=DECOMP_LU)
Calculates a perspective transform from four pairs of the corresponding points.
void warpPerspective(InputArray src, OutputArray dst, InputArray M, Size dsize, int flags=INTER_LINEAR, int borderMode=BORDER_CONSTANT, const Scalar &borderValue=Scalar())
Applies a perspective transformation to an image.
int main(int argc, char *argv[])
Definition highgui_qt.cpp:3
Definition all_layers.hpp:47
"black box" representation of the file storage associated with a file on disk.
Definition core.hpp:102