You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

390 lines
11 KiB
C++

#include "pch.h"
#include "OCRCharset.h"
#include "EasyOCR_Recognizer.h"
// Member-wise copy assignment for the three-component size (d0, d1, d2).
// Note: the declared return type is a value, so the caller receives a copy of
// the updated object rather than a reference to it.
uns::EasyOCR_Recognizer::NormalizePAD::Size3i uns::EasyOCR_Recognizer::NormalizePAD::Size3i::operator=(const Size3i& obj)
{
    this->d0 = obj.d0;
    this->d1 = obj.d1;
    this->d2 = obj.d2;
    return *this;
}
// Stores the target size and padding type, and caches half of the target
// width (max_size.d2 / 2, integer division).
// All initializers read the constructor parameters, so they do not depend on
// the members' declaration order.
uns::EasyOCR_Recognizer::NormalizePAD::NormalizePAD(Size3i max_size, const std::string& PAD_type)
    : max_size(max_size),
      PAD_type(PAD_type),
      max_width_half(max_size.d2 / 2)
{
}
// Normalizes an image and right-pads it to max_size.d1 x max_size.d2.
//
// Steps:
//   1. Convert to 32-bit float scaled by 1/255 (gives [0,1] for 8-bit input).
//   2. Apply (x - 0.5) / 0.5.
//   3. Copy the result into the top-left corner of a zero-filled canvas.
//   4. If the canvas is wider than the image, fill the remaining columns by
//      repeating the image's last column.
//
// The image must fit inside the canvas; otherwise the ROI construction throws
// a cv::Exception.
cv::Mat uns::EasyOCR_Recognizer::NormalizePAD::operator()(const cv::Mat& input_img) const
{
    cv::Mat normalized;
    input_img.convertTo(normalized, CV_32F, 1.0 / 255);
    normalized = (normalized - 0.5f) / 0.5f;

    const int src_h = normalized.rows;
    const int src_w = normalized.cols;
    const int channels = normalized.channels();
    const int target_h = max_size.d1;
    const int target_w = max_size.d2;

    // Zero canvas with the same depth and channel count as the normalized image.
    cv::Mat padded = cv::Mat::zeros(target_h, target_w, CV_32FC(channels));
    normalized.copyTo(padded(cv::Rect(0, 0, src_w, src_h)));

    // Nothing to pad when the widths already match.
    if (target_w == src_w)
        return padded;

    // Replicate the last image column across the remaining width.
    const int pad_w = target_w - src_w;
    cv::Mat last_column = normalized.col(src_w - 1);
    cv::Mat filler;
    cv::repeat(last_column, 1, pad_w, filler);
    filler.copyTo(padded(cv::Rect(src_w, 0, pad_w, src_h)));
    return padded;
}
// Stretches the contrast of a grey image when its measured contrast (see
// ContrastGrey) is below `target`.
//
// Returns a deep copy of the input when the contrast is already sufficient;
// otherwise returns an 8-bit image computed as
//   clamp((pixel - low + 25) * 200 / max(10, high - low), 0, 255)
// where low/high are the 10th/90th percentile values from ContrastGrey.
cv::Mat uns::EasyOCR_Recognizer::AlignCollate::AdjustContrastGrey(const cv::Mat& img_in, double target) const
{
    double contrast;
    int high, low;
    ContrastGrey(img_in, contrast, high, low);

    // Contrast is already at or above the target: hand back an independent copy.
    if (!(contrast < target))
        return img_in.clone();

    // Work in 32-bit integers so the intermediate values cannot wrap.
    cv::Mat widened;
    img_in.convertTo(widened, CV_32S);
    const double gain = 200.0 / std::max(10, high - low);
    widened = (widened - low + 25) * gain;

    // Limit every value to the 8-bit range before narrowing.
    widened.forEach<int>([] (int& value, const int*)
    {
        value = std::clamp(value, 0, 255);
    });

    cv::Mat stretched;
    widened.convertTo(stretched, CV_8U);
    return stretched;
}
// Measures the contrast of a grey image from its 10th and 90th percentiles.
//
// Outputs:
//   low      - value at sorted index floor(0.1 * (pixel_count - 1))
//   high     - value at sorted index floor(0.9 * (pixel_count - 1))
//   contrast - (high - low) / max(10, high + low)
//
// Each row is read as `cols` unsigned bytes, so the image is assumed to be
// single-channel 8-bit (TODO confirm at call sites).
// Throws cv::Exception (via CV_Assert) when the image is empty or not 2-D;
// previously that case indexed an empty buffer.
void uns::EasyOCR_Recognizer::AlignCollate::ContrastGrey(const cv::Mat& img, double& contrast, int& high, int& low) const
{
    CV_Assert(!img.empty() && img.dims == 2);

    // A 256-bin histogram gives the same order statistics as sorting every
    // pixel, without the O(n log n) sort or the per-pixel copy.
    size_t histogram[256] = {};
    for (int r = 0; r < img.rows; ++r)
    {
        const uchar* row_ptr = img.ptr<uchar>(r);
        for (int c = 0; c < img.cols; ++c)
            ++histogram[row_ptr[c]];
    }

    const size_t pixel_count = static_cast<size_t>(img.rows) * static_cast<size_t>(img.cols);
    const size_t rank90 = static_cast<size_t>(0.9 * static_cast<double>(pixel_count - 1));
    const size_t rank10 = static_cast<size_t>(0.1 * static_cast<double>(pixel_count - 1));

    // Returns the value found at position `rank` of the ascending pixel order.
    const auto value_at_rank = [&histogram](size_t rank) -> int
    {
        size_t seen = 0;
        for (int value = 0; value < 256; ++value)
        {
            seen += histogram[value];
            if (seen > rank)
                return value;
        }
        return 255;
    };

    high = value_at_rank(rank90);
    low = value_at_rank(rank10);
    // The denominator is floored at 10 to avoid dividing by ~0 on dark images.
    contrast = double(high - low) / double(std::max(10, high + low));
}
// Stores the target height/width, the aspect-ratio flag and the contrast
// threshold. Every initializer reads only the constructor parameters.
uns::EasyOCR_Recognizer::AlignCollate::AlignCollate(int imgH, int imgW, bool keep_ratio_with_pad, double adjust_contrast)
    : imgH(imgH),
      imgW(imgW),
      keep_ratio_with_pad(keep_ratio_with_pad),
      adjust_contrast(adjust_contrast)
{
}
// Converts a batch of images into a single network input blob.
//
// For every image:
//   1. If adjust_contrast > 0, convert to grey (when multi-channel) and run
//      AdjustContrastGrey with adjust_contrast as the target.
//   2. Resize to height imgH, keeping the aspect ratio, with the width capped
//      at imgW (bicubic interpolation).
//   3. Normalize and right-pad to imgH x imgW with NormalizePAD.
// The padded images are then packed with cv::dnn::blobFromImages.
//
// Note: keep_ratio_with_pad is not consulted here; the aspect ratio is always
// preserved.
// Throws cv::Exception (via CV_Assert) when an image in the batch is empty;
// previously that case divided by a zero height.
cv::Mat uns::EasyOCR_Recognizer::AlignCollate::operator()(const std::vector<cv::Mat>& batch) const
{
    NormalizePAD transform({ 1, imgH, imgW });

    std::vector<cv::Mat> resized_images;
    resized_images.reserve(batch.size());

    for (const cv::Mat& image : batch)
    {
        CV_Assert(!image.empty());

        cv::Mat working;
        if (adjust_contrast > 0)
        {
            cv::Mat grey;
            if (image.channels() > 1)
                cv::cvtColor(image, grey, cv::COLOR_BGR2GRAY);
            else
                grey = image;
            working = AdjustContrastGrey(grey, adjust_contrast);
        }
        else
            working = image;

        // Width that preserves the aspect ratio at height imgH. The cap is
        // applied in double so the int conversion cannot overflow.
        const double ratio = static_cast<double>(working.cols) / working.rows;
        const double wanted_w = std::ceil(imgH * ratio);
        const int resized_w = static_cast<int>(std::min(wanted_w, static_cast<double>(imgW)));

        cv::Mat resized;
        cv::resize(working, resized, cv::Size(resized_w, imgH), 0, 0, cv::INTER_CUBIC);
        resized_images.push_back(transform(resized));
    }

    cv::Mat blob;
    cv::dnn::blobFromImages(resized_images, blob);
    return blob;
}
// Confidence score: (product of the non-zero entries of x) ^ (2 / sqrt(N)),
// where N is the total number of entries, zeros included.
// Returns 0 for an empty input. The product is accumulated in double.
float uns::EasyOCR_Recognizer::CustomMean(const VecFloat& x)
{
    const size_t count = x.size();
    if (count == 0)
        return 0.0f;

    double product = 1.0;
    for (float value : x)
    {
        // Zero entries are left out of the product.
        if (value == 0)
            continue;
        product *= static_cast<double>(value);
    }

    const double power = 2.0 / std::sqrt(static_cast<double>(count));
    return static_cast<float>(std::pow(product, power));
}
// Turns an input image into the recognizer's input blob.
//
//   1. Returns an empty Mat for an empty input.
//   2. Converts 3-channel (BGR) and 4-channel (BGRA) images to grey; any other
//      channel count is used as-is.
//   3. Resizes relative to a fixed model height of 64:
//        - width/height <  1: resized to 64 wide and 64 * (height/width) tall;
//        - width/height >= 1: resized to 64 tall and 64 * (width/height) wide.
//   4. Runs AlignCollate(64, model_width, true, 0.5) on the single image.
cv::Mat uns::EasyOCR_Recognizer::Preprocess(const cv::Mat& img) const
{
    if (img.empty())
        return {};

    cv::Mat gray;
    switch (img.channels())
    {
    case 3:
        cv::cvtColor(img, gray, cv::COLOR_BGR2GRAY);
        break;
    case 4:
        cv::cvtColor(img, gray, cv::COLOR_BGRA2GRAY);
        break;
    default:
        gray = img;
        break;
    }

    const int model_height = 64;
    const int width = gray.cols;
    const int height = gray.rows;
    const float ratio = static_cast<float>(width) / static_cast<float>(height);

    // Taller-than-wide images use the inverted ratio and swap the target axes.
    const bool taller_than_wide = ratio < 1.0f;
    const float effective_ratio = taller_than_wide ? CalculateRatio(width, height) : ratio;
    const int model_width = static_cast<int>(model_height * effective_ratio);
    const cv::Size target = taller_than_wide
        ? cv::Size(model_height, model_width)
        : cv::Size(model_width, model_height);

    cv::Mat resized;
    cv::resize(gray, resized, target, 0, 0, cv::INTER_LINEAR);

    AlignCollate alignCollate(model_height, model_width, true, 0.5);
    return alignCollate({ resized });
}
// Returns width/height, inverted when that quotient is below 1, so the
// result is the long-side / short-side ratio for positive dimensions.
float uns::EasyOCR_Recognizer::CalculateRatio(int width, int height) const
{
    const float aspect = static_cast<float>(width) / static_cast<float>(height);
    return (aspect < 1.0f) ? (1.0f / aspect) : aspect;
}
// Numerically stable softmax over `C` consecutive logits.
//
// @param logits  pointer to the first of C float values
// @param C       number of classes
// @return        C probabilities; empty when logits is null or C <= 0
//                (previously that case read logits[0] out of bounds).
uns::VecFloat uns::EasyOCR_Recognizer::SoftMAX(const float* logits, int C) const
{
    if (logits == nullptr || C <= 0)
        return {};

    // Subtract the largest logit before exponentiating so exp() cannot overflow.
    const float peak = *std::max_element(logits, logits + C);

    std::vector<float> exps(C);
    float sum = 0.f;
    for (int i = 0; i < C; ++i)
    {
        exps[i] = std::exp(logits[i] - peak);
        sum += exps[i];
    }

    // The divisor is floored at 1e-6 and computed once, outside the loop.
    const float denom = (sum > 1e-6f) ? sum : 1e-6f;
    for (int i = 0; i < C; ++i)
        exps[i] /= denom;
    return exps;
}
// Greedy decoding of the network output.
//
// The tensor is read as N x T x C floats (row-major). For every (n, t) step:
//   1. softmax over the C logits;
//   2. zero the probabilities of the classes listed in ignore_idx;
//   3. renormalize (skipped when the remaining mass is <= 1e-6);
//   4. pick the arg-max class.
//
// @param out_indices  cleared, then receives one class index per step
// @param out_probs    cleared, then receives the winning probability per step.
//                     When class 0 wins, 0 is stored instead of its
//                     probability; CustomMean skips zero entries, so such
//                     steps do not contribute to the confidence product.
//                     NOTE(review): presumably class 0 is the CTC blank - confirm.
//
// Fixes over the previous version: ignore_idx entries outside [0, C) are
// skipped instead of being written out of bounds, and a null data pointer or
// non-positive N/T/C yields empty outputs instead of reading invalid memory.
void uns::EasyOCR_Recognizer::PostprocessONNXOutput(const Ort::Value& outputs, int N, int T, int C, VecInt& out_indices, VecFloat& out_probs, const VecInt ignore_idx)
{
    const float* data = outputs.GetTensorData<float>();
    out_indices.clear();
    out_probs.clear();
    if (data == nullptr || N <= 0 || T <= 0 || C <= 0)
        return;

    const size_t step_count = static_cast<size_t>(N) * static_cast<size_t>(T);
    out_indices.reserve(step_count);
    out_probs.reserve(step_count);

    std::vector<float> probs;
    for (int n = 0; n < N; ++n)
    {
        for (int t = 0; t < T; ++t)
        {
            // Offset of this step's logits: ((n * T) + t) * C.
            const float* logits = data + (static_cast<size_t>(n) * T + t) * C;

            // 1) Softmax.
            probs = SoftMAX(logits, C);

            // 2) Suppress ignored classes; out-of-range entries are skipped.
            for (const auto& idx : ignore_idx)
            {
                if (idx >= 0 && idx < C)
                    probs[idx] = 0.f;
            }

            // 3) Renormalize the remaining probability mass.
            float sum = 0.f;
            for (int c = 0; c < C; ++c)
                sum += probs[c];
            if (sum > 1e-6f)
            {
                for (int c = 0; c < C; ++c)
                    probs[c] /= sum;
            }

            // 4) Arg-max; the first maximum wins on ties.
            int best = 0;
            for (int c = 1; c < C; ++c)
            {
                if (probs[c] > probs[best])
                    best = c;
            }
            // Existing behavior kept: a winning class 0 reports probability 0.
            const float best_prob = (best != 0) ? probs[best] : 0.0f;

            out_indices.push_back(best);
            out_probs.push_back(best_prob);
        }
    }
}
// Puts the recognizer into its "not initialized" state: no CPU session, the
// model path taken from the global OCR configuration, and the ONNX Runtime
// API table fetched for ORT_API_VERSION. No session is created here; see Init().
uns::EasyOCR_Recognizer::EasyOCR_Recognizer()
{
    this->ort = OrtGetApiBase()->GetApi(ORT_API_VERSION);
    this->model_path = G_OCRConfig.GetRecognizeModelPath();
    this->ort_cpu_session = nullptr;
    this->ort_inited = false;
}
bool uns::EasyOCR_Recognizer::Init()
{
if (ort_inited)
return true;
if (!RecheckModelInfo())
return false;
try
{
ort_env = Ort::Env(ORT_LOGGING_LEVEL_WARNING, "EasyOCR_Recognizer");
bool fallback_to_cpu = false;
if (!OCRToolBox::AutoSelectEP(ort, ort_session_options, fallback_to_cpu))
return false;
OCRToolBox::InitOrtSessionOptions(ort_session_options);
if ((G_OCRConfig.GetGPUUsage() == easyocr::GPUUsage::CPUOnly) || fallback_to_cpu) //ʹ<><CAB9>CPU<50><55><EFBFBD><EFBFBD>ʼ<EFBFBD><CABC>cpu session
{
ort_cpu_session = new Ort::Session(ort_env, model_path.c_str(), ort_session_options);
//ͨ<><CDA8>CPU session<6F><6E>ȡ<EFBFBD><C8A1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
OCRToolBox::GetInputOutputNames(ort_cpu_session, input_names, input_ns, output_names, output_ns);
}
else
{
//ͨ<><CDA8><EFBFBD><EFBFBD>ʱsession<6F><6E>ȡ<EFBFBD><C8A1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>CUDA<44><41><EFBFBD>̲߳<DFB3><CCB2><EFBFBD>ȫ<EFBFBD><C8AB>
Ort::Session ort_session(ort_env, model_path.c_str(), ort_session_options);
OCRToolBox::GetInputOutputNames(&ort_session, input_names, input_ns, output_names, output_ns);
}
ort_inited = true;
return true;
}
catch (...)
{
return false;
}
}
// Destroys the persistent CPU session, if any, and clears the pointer.
// Returns false only if destruction throws. ort_inited is left unchanged.
bool uns::EasyOCR_Recognizer::UnInit()
{
    try
    {
        // Deleting a null pointer is a no-op, so no explicit check is needed.
        delete ort_cpu_session;
        ort_cpu_session = nullptr;
        return true;
    }
    catch (...)
    {
        return false;
    }
}
// Re-reads the model path from the global configuration when it is empty,
// then returns the result of OCRToolBox::CheckFile for that path.
bool uns::EasyOCR_Recognizer::RecheckModelInfo()
{
    if (model_path.empty())
    {
        model_path = G_OCRConfig.GetRecognizeModelPath();
    }
    return OCRToolBox::CheckFile(model_path);
}
// Recognizes the text in a single image.
//
// @return { text, confidence }. The confidence also carries status codes:
//           -1  the model file check failed
//            0  preprocessing produced an empty input
//           -2  inference threw, or the model output was missing/malformed
//
// Uses the persistent CPU session when one exists; otherwise a session is
// created for this call only. The output is read as [N, T, C], decoded
// greedily by PostprocessONNXOutput, mapped to characters by
// OCRCharset::GetString and scored with CustomMean.
uns::EOCR_Result uns::EasyOCR_Recognizer::operator()(const cv::Mat& image)
{
    try
    {
        if (!RecheckModelInfo())
            return { L"", -1.0f };

        cv::Mat input = Preprocess(image);
        if (input.empty())
            return { L"", 0.0f };

        // The blob is 4-D; dimensions 2 and 3 are its height and width.
        std::array<int64_t, 4> inputShape = { 1, 1, input.size[2], input.size[3] };
        Ort::MemoryInfo memInfo = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
        Ort::Value inputTensor = Ort::Value::CreateTensor<float>(memInfo, input.ptr<float>(), input.total(), inputShape.data(), inputShape.size());

        std::vector<Ort::Value> outputs;
        if (ort_cpu_session != nullptr)
        {
            outputs = ort_cpu_session->Run(Ort::RunOptions{nullptr}, input_names.data(), &inputTensor, 1, output_names.data(), 1);
        }
        else
        {
            // No persistent session: build one just for this inference.
            Ort::Session transient_session(ort_env, model_path.c_str(), ort_session_options);
            outputs = transient_session.Run(Ort::RunOptions{nullptr}, input_names.data(), &inputTensor, 1, output_names.data(), 1);
        }

        // Validate the output before indexing into it.
        if (outputs.empty())
            return { L"", -2.0f };
        const auto shape = outputs.front().GetTensorTypeAndShapeInfo().GetShape(); // expected {1, T, C}
        if (shape.size() < 3)
            return { L"", -2.0f };
        const int N = static_cast<int>(shape[0]);
        const int T = static_cast<int>(shape[1]);
        const int C = static_cast<int>(shape[2]);
        if (N < 0 || T < 0 || C <= 0)
            return { L"", -2.0f };

        // Softmax + greedy pick per time step.
        std::vector<int> indices;
        std::vector<float> maxProbs;
        indices.reserve(static_cast<size_t>(T));
        maxProbs.reserve(static_cast<size_t>(T));
        PostprocessONNXOutput(outputs[0], N, T, C, indices, maxProbs);

        std::wstring text = OCRCharset::GetString(indices);
        float conf = CustomMean(maxProbs);
        return { text, conf };
    }
    catch (...)
    {
        return { L"", -2.0f };
    }
}
// Recognizes the text inside each region of `full_image`.
//
// For every entry of `rects`, the bounding rectangle is computed, clipped to
// the image bounds, cropped and passed to the single-image operator(). The
// result is stored under the entry's index together with the clipped
// rectangle. Regions whose crop is empty are skipped.
//
// Returns an empty set when the model file check fails or when anything
// throws (results gathered before the exception are discarded).
uns::EOCR_ResultSet uns::EasyOCR_Recognizer::operator()(const cv::Mat& full_image, const EOCRD_Rects& rects)
{
    if (!RecheckModelInfo())
        return {};

    try
    {
        const cv::Rect image_bounds(0, 0, full_image.cols, full_image.rows);
        EOCR_ResultSet result_set;

        for (size_t idx = 0; idx < rects.size(); ++idx)
        {
            // Axis-aligned bounding box of the region, limited to the image.
            cv::Rect rect = cv::boundingRect(rects[idx]) & image_bounds;
            cv::Mat crop = full_image(rect);
            if (!crop.empty())
            {
                auto [text, conf] = (*this)(crop);
                result_set.insert({ idx, { text, conf, rect } });
            }
        }
        return result_set;
    }
    catch (...)
    {
        return {};
    }
}