#include "pch.h" #include "OCRCharset.h" #include "EasyOCR_Recognizer.h" uns::EasyOCR_Recognizer::NormalizePAD::Size3i uns::EasyOCR_Recognizer::NormalizePAD::Size3i::operator=(const Size3i& obj) { d0 = obj.d0; d1 = obj.d1; d2 = obj.d2; return (*this); } uns::EasyOCR_Recognizer::NormalizePAD::NormalizePAD(Size3i max_size, const std::string& PAD_type) { this->max_size = max_size; this->PAD_type = PAD_type; max_width_half = max_size.d2 / 2; // 计算宽度的一半,用于可选操作 } cv::Mat uns::EasyOCR_Recognizer::NormalizePAD::operator()(const cv::Mat& input_img) const { // 将原图转换为32位浮点型并归一化到[0,1] cv::Mat img; input_img.convertTo(img, CV_32F, 1.0 / 255); // line 10: img = toTensor img = (img - 0.5f) / 0.5f; // line 11: img.sub_(0.5).div_(0.5) int h = img.rows; // 获取图像高度 int w = img.cols; // 获取图像宽度 int c = img.channels(); // 获取通道数,灰度图默认为1 // 创建目标大小的全零Mat,类型为32F,尺寸为max_size.d1 x max_size.d2 cv::Mat pad_img = cv::Mat::zeros(max_size.d1, max_size.d2, CV_32FC(c)); // line 13 // 将原图像拷贝到pad_img的左上角区域,实现右侧填充 img.copyTo(pad_img(cv::Rect(0, 0, w, h))); // line 14 // 如果目标宽度大于原图宽度,则使用最后一列像素进行扩展填充 if (max_size.d2 != w) { // line 15 cv::Mat last_col = img.col(w - 1); cv::Mat border; cv::repeat(last_col, 1, max_size.d2 - w, border); // 重复最后一列填充 border.copyTo(pad_img(cv::Rect(w, 0, max_size.d2 - w, h))); } return pad_img; // 返回处理后的浮点张量 } cv::Mat uns::EasyOCR_Recognizer::AlignCollate::AdjustContrastGrey(const cv::Mat& img_in, double target) const { double contrast; int high, low; ContrastGrey(img_in, contrast, high, low); cv::Mat img = img_in.clone(); if (contrast < target) { cv::Mat img_i; img.convertTo(img_i, CV_32S); double ratio = 200.0 / std::max(10, high - low); img_i = (img_i - low + 25) * ratio; // 将像素值限制在[0,255]范围,并转换回8位 img_i.forEach([] (int& pixel, const int*) { pixel = std::clamp(pixel, 0, 255); }); img_i.convertTo(img, CV_8U); } return img; } void uns::EasyOCR_Recognizer::AlignCollate::ContrastGrey(const cv::Mat& img, double& contrast, int& high, int& low) const { // 将Mat图像数据复制到一个连续的vector中,以便排序 std::vector pixels; pixels.reserve(img.rows * img.cols); // 预分配空间以提高效率 for (int i = 0; i < img.rows; ++i) { const uchar* row_ptr = img.ptr(i); for (int j = 0; j < img.cols; ++j) pixels.push_back(static_cast(row_ptr[j])); } // 对像素值进行排序,便于获取百分位数 std::sort(pixels.begin(), pixels.end()); // 计算90%的索引位置,与Python np.percentile保持一致 int idx90 = static_cast(0.9 * (pixels.size() - 1)); int idx10 = static_cast(0.1 * (pixels.size() - 1)); high = pixels[idx90]; low = pixels[idx10]; // 计算contrast: (high - low) / max(10, high + low) contrast = double(high - low) / double(std::max(10, high + low)); } uns::EasyOCR_Recognizer::AlignCollate::AlignCollate(int imgH, int imgW, bool keep_ratio_with_pad, double adjust_contrast) { this->imgH = imgH; this->imgW = imgW; this->adjust_contrast = adjust_contrast; this->keep_ratio_with_pad = keep_ratio_with_pad; } cv::Mat uns::EasyOCR_Recognizer::AlignCollate::operator()(const std::vector& batch) const { std::vector resized_images; // 创建NormalizePAD实例,用于归一化和填充 NormalizePAD transform({ 1, imgH, imgW }); for (const cv::Mat& image : batch) { cv::Mat working; if (adjust_contrast > 0) { cv::Mat grey; if (image.channels() > 1) cv::cvtColor(image, grey, cv::COLOR_BGR2GRAY); else grey = image; working = AdjustContrastGrey(grey, adjust_contrast); } else working = image; int w = working.cols; int h = working.rows; double ratio = double(w) / h; int resized_w = static_cast(std::ceil(imgH * ratio)); if (resized_w > imgW) resized_w = imgW; cv::Mat resized; cv::resize(working, resized, cv::Size(resized_w, imgH), 0, 0, 
cv::Mat uns::EasyOCR_Recognizer::AlignCollate::operator()(const std::vector<cv::Mat>& batch) const
{
    std::vector<cv::Mat> resized_images;
    //NormalizePAD instance used for normalization and right-padding
    NormalizePAD transform({ 1, imgH, imgW });
    for (const cv::Mat& image : batch)
    {
        cv::Mat working;
        if (adjust_contrast > 0)
        {
            cv::Mat grey;
            if (image.channels() > 1)
                cv::cvtColor(image, grey, cv::COLOR_BGR2GRAY);
            else
                grey = image;
            working = AdjustContrastGrey(grey, adjust_contrast);
        }
        else
            working = image;
        int w = working.cols;
        int h = working.rows;
        double ratio = double(w) / h;
        int resized_w = static_cast<int>(std::ceil(imgH * ratio));
        if (resized_w > imgW)
            resized_w = imgW;
        cv::Mat resized;
        cv::resize(working, resized, cv::Size(resized_w, imgH), 0, 0, cv::INTER_CUBIC);
        cv::Mat tensor = transform(resized);
        resized_images.push_back(tensor);
    }
    cv::Mat blob;
    cv::dnn::blobFromImages(resized_images, blob);
    return blob;
}

float uns::EasyOCR_Recognizer::CustomMean(const VecFloat& x)
{
    size_t N = x.size();
    if (N == 0)
        return 0.0f;
    //1. Product of all non-zero elements
    double prod = 1.0;
    for (float v : x)
        if (v != 0)
            prod *= static_cast<double>(v);
    //2. Exponent 2.0 / sqrt(N)
    double exponent = 2.0 / std::sqrt(static_cast<double>(N));
    //3. Return prod raised to that exponent: conf = (prod of probs)^(2 / sqrt(N))
    return static_cast<float>(std::pow(prod, exponent));
}

cv::Mat uns::EasyOCR_Recognizer::Preprocess(const cv::Mat& img) const
{
    if (img.empty())
        return {}; //do not throw here; an empty image simply stops the downstream processing

    cv::Mat gray;
    int ch = img.channels();
    if (ch == 3)      //BGR color image (3 channels)
        cv::cvtColor(img, gray, cv::COLOR_BGR2GRAY);
    else if (ch == 4) //BGRA color image (4 channels): drop the alpha channel, BGRA -> GRAY
        cv::cvtColor(img, gray, cv::COLOR_BGRA2GRAY);
    else              //the image is (h x w) or (h x w x 1); treat either as grayscale
        gray = img;

    int width = gray.cols;
    int height = gray.rows;
    int model_height = 64, model_width = 0;
    float ratio = static_cast<float>(width) / static_cast<float>(height);
    cv::Mat resized;
    if (ratio < 1.0f)
    {
        //Vertical text: use CalculateRatio to keep the height at model_height
        float adj_ratio = CalculateRatio(width, height);
        model_width = static_cast<int>(model_height * adj_ratio);
        cv::resize(gray, resized, cv::Size(model_height, model_width), 0, 0, cv::INTER_LINEAR);
        ratio = adj_ratio;
    }
    else
    {
        //Horizontal text: height becomes model_height
        model_width = static_cast<int>(model_height * ratio);
        cv::resize(gray, resized, cv::Size(model_width, model_height), 0, 0, cv::INTER_LINEAR);
    }
    AlignCollate alignCollate(model_height, model_width, true, 0.5);
    return alignCollate({ resized });
}

float uns::EasyOCR_Recognizer::CalculateRatio(int width, int height) const
{
    float ratio = static_cast<float>(width) / static_cast<float>(height);
    if (ratio < 1.0f)
        ratio = 1.0f / ratio;
    return ratio;
}
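//Greedy decoding helpers: SoftMAX turns one time step of raw logits into probabilities, and
//PostprocessONNXOutput walks the [N, T, C] output tensor, zeroes out ignored class indices,
//renormalizes, and records the argmax index and its probability for every time step.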
uns::VecFloat uns::EasyOCR_Recognizer::SoftMAX(const float* logits, int C) const
{
    //Find the maximum logit for numerical stability
    float m = logits[0];
    for (int i = 1; i < C; ++i)
        m = std::max(m, logits[i]);
    //Compute exp(logit - m)
    VecFloat exps(C);
    float sum = 0.f;
    for (int i = 0; i < C; ++i)
    {
        exps[i] = std::exp(logits[i] - m);
        sum += exps[i];
    }
    //Normalize
    for (int i = 0; i < C; ++i)
        exps[i] /= (sum > 1e-6f ? sum : 1e-6f);
    return exps;
}

void uns::EasyOCR_Recognizer::PostprocessONNXOutput(const Ort::Value& outputs, int N, int T, int C, VecInt& out_indices, VecFloat& out_probs, const VecInt ignore_idx)
{
    //Raw pointer access to the underlying tensor data
    const float* data = outputs.GetTensorData<float>();
    out_indices.clear();
    out_probs.clear();
    //Temporary storage for the probabilities of one time step
    VecFloat probs;
    probs.reserve(C);
    //Iterate over every sample and every time step
    for (int n = 0; n < N; ++n)
    {
        for (int t = 0; t < T; ++t)
        {
            //logits start at ((n * T) + t) * C
            const float* logits = data + ((size_t)n * T + t) * C;
            //1) Softmax
            probs = SoftMAX(logits, C);
            //2) Zero out the ignored indices
            if (!ignore_idx.empty())
                for (const auto& idx : ignore_idx)
                    probs[idx] = 0.f;
            //3) Renormalize
            float sum = 0.f;
            for (int c = 0; c < C; ++c)
                sum += probs[c];
            if (sum > 1e-6f)
            {
                for (int c = 0; c < C; ++c)
                    probs[c] /= sum;
            }
            //4) Pick the index with the highest probability
            int best = 0;
            float best_prob = probs[0];
            for (int c = 1; c < C; ++c)
            {
                if (probs[c] > probs[best])
                {
                    best = c;
                    best_prob = probs[c];
                }
            }
            out_indices.push_back(best);
            out_probs.push_back(best_prob);
        }
    }
}

uns::EasyOCR_Recognizer::EasyOCR_Recognizer()
{
    ort_inited = false;
    ort_cpu_session = nullptr;
    model_path = G_OCRConfig.GetRecognizeModelPath();
    ort = OrtGetApiBase()->GetApi(ORT_API_VERSION);
}

bool uns::EasyOCR_Recognizer::Init()
{
    if (ort_inited)
        return true;
    if (!RecheckModelInfo())
        return false;
    try
    {
        ort_env = Ort::Env(ORT_LOGGING_LEVEL_WARNING, "EasyOCR_Recognizer");
        bool fallback_to_cpu = false;
        if (!OCRToolBox::AutoSelectEP(ort, ort_session_options, fallback_to_cpu))
            return false;
        OCRToolBox::InitOrtSessionOptions(ort_session_options);
        if ((G_OCRConfig.GetGPUUsage() == easyocr::GPUUsage::CPUOnly) || fallback_to_cpu) //CPU mode: create a persistent CPU session
        {
            ort_cpu_session = new Ort::Session(ort_env, model_path.c_str(), ort_session_options);
            //Query input/output names through the CPU session
            OCRToolBox::GetInputOutputNames(ort_cpu_session, input_names, input_ns, output_names, output_ns);
        }
        else
        {
            //Query input/output names through a temporary session (CUDA: not thread-safe)
            Ort::Session ort_session(ort_env, model_path.c_str(), ort_session_options);
            OCRToolBox::GetInputOutputNames(&ort_session, input_names, input_ns, output_names, output_ns);
        }
        ort_inited = true;
        return true;
    }
    catch (...)
    {
        return false;
    }
}

bool uns::EasyOCR_Recognizer::UnInit()
{
    try
    {
        if (ort_cpu_session != nullptr)
            delete ort_cpu_session;
        ort_cpu_session = nullptr;
        return true;
    }
    catch (...)
    {
        return false;
    }
}

bool uns::EasyOCR_Recognizer::RecheckModelInfo()
{
    if (model_path.empty())
        model_path = G_OCRConfig.GetRecognizeModelPath();
    return OCRToolBox::CheckFile(model_path);
}
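//Single-crop recognition: preprocess the image into an NCHW float blob, run the ONNX model
//(through the persistent CPU session if present, otherwise through a temporary session),
//greedily decode the [1, T, C] output, map the indices to text via OCRCharset, and fold the
//per-step maximum probabilities into one confidence value with CustomMean.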
uns::EOCR_Result uns::EasyOCR_Recognizer::operator()(const cv::Mat& image)
{
    try
    {
        if (!RecheckModelInfo())
            return { L"", -1.0f };
        cv::Mat input = Preprocess(image);
        if (input.empty())
            return { L"", 0.0f };
        std::array<int64_t, 4> inputShape = { 1, 1, input.size[2], input.size[3] };
        Ort::MemoryInfo memInfo = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
        Ort::Value inputTensor = Ort::Value::CreateTensor<float>(memInfo, input.ptr<float>(), input.total(), inputShape.data(), inputShape.size());
        auto outputs = ((ort_cpu_session != nullptr) ?
            ort_cpu_session->Run(Ort::RunOptions{ nullptr }, input_names.data(), &inputTensor, 1, output_names.data(), 1) :
            Ort::Session(ort_env, model_path.c_str(), ort_session_options).Run(Ort::RunOptions{ nullptr }, input_names.data(), &inputTensor, 1, output_names.data(), 1));

        //Output shape: [1, T, C]
        auto& outVal = outputs.front();
        auto info = outVal.GetTensorTypeAndShapeInfo();
        auto shape = info.GetShape(); //{1, T, C}
        int N = (int)shape[0], T = (int)shape[1], C = (int)shape[2];

        //Greedy pick & softmax
        VecInt indices;
        VecFloat maxProbs;
        PostprocessONNXOutput(outputs[0], N, T, C, indices, maxProbs);
        //Decode the indices into text
        std::wstring text = OCRCharset::GetString(indices);
        //Confidence
        float conf = CustomMean(maxProbs);
        return { text, conf };
    }
    catch (...)
    {
        return { L"", -2.0f };
    }
}

uns::EOCR_ResultSet uns::EasyOCR_Recognizer::operator()(const cv::Mat& full_image, const EOCRD_Rects& rects)
{
    if (!RecheckModelInfo())
        return {};
    try
    {
        EOCR_ResultSet result_set;
        for (size_t i = 0; i < rects.size(); ++i)
        {
            //Convert the polygon to its minimal bounding rectangle and clip it to the image
            cv::Rect rect = cv::boundingRect(rects[i]);
            rect &= cv::Rect(0, 0, full_image.cols, full_image.rows);
            cv::Mat crop = full_image(rect);
            if (crop.empty())
                continue;
            auto [text, conf] = (*this)(crop);
            result_set.insert({ i, { text, conf, rect } });
        }
        return result_set;
    }
    catch (...)
    {
        return {};
    }
}
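/*
    Usage sketch (illustrative only, not part of this translation unit). The image path and the
    source of the detected rectangles are assumptions; everything else uses the names defined
    in this file.

    uns::EasyOCR_Recognizer recognizer;
    if (recognizer.Init())
    {
        cv::Mat crop = cv::imread("word_crop.png");   //hypothetical single text crop
        auto [text, conf] = recognizer(crop);         //recognize one cropped region

        //Or recognize several detected regions of a full image:
        //uns::EOCR_ResultSet results = recognizer(full_image, detected_rects);

        recognizer.UnInit();
    }
*/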