优化ocr检测时间,加载默认模型

2025-10-10 11:35:37 +08:00
parent 8c2e497163
commit 6cdb1e3d7d
5 changed files with 250 additions and 111 deletions
--- a/service/ocr_service.py
+++ b/service/ocr_service.py
@ -1,4 +1,4 @@
-# 首先添加NumPy兼容处理
+import time
 import numpy as np

 # 修复np.int已弃用的问题
@ -8,29 +8,120 @@ if not hasattr(np, 'int'):
 from paddleocr import PaddleOCR
 from service.sensitive_service import get_all_sensitive_words

+
+# AC自动机节点定义
+class AhoNode:
+    def __init__(self):
+        self.children = {}  # 子节点映射（字符->节点）
+        self.fail = None  # 失败指针（类似KMP的next数组）
+        self.is_end = False  # 标记是否为某个模式串的结尾
+        self.word = None  # 存储当前结尾对应的完整违禁词
+
+
+# AC自动机实现（多模式字符串匹配）
+class AhoCorasick:
+    def __init__(self):
+        self.root = AhoNode()  # 根节点
+
+    def add_word(self, word):
+        """添加违禁词到Trie树"""
+        if not isinstance(word, str) or not word.strip():
+            return  # 过滤无效词
+        node = self.root
+        for char in word:
+            if char not in node.children:
+                node.children[char] = AhoNode()
+            node = node.children[char]
+        node.is_end = True
+        node.word = word  # 记录完整词
+
+    def build_fail(self):
+        """构建失败指针（BFS遍历）"""
+        queue = []
+        # 根节点的子节点失败指针指向根节点
+        for child in self.root.children.values():
+            child.fail = self.root
+            queue.append(child)
+
+        # BFS处理其他节点
+        while queue:
+            current_node = queue.pop(0)
+            # 遍历当前节点的所有子节点
+            for char, child in current_node.children.items():
+                # 寻找失败指针目标节点
+                fail_node = current_node.fail
+                while fail_node is not None and char not in fail_node.children:
+                    fail_node = fail_node.fail
+                # 确定失败指针指向
+                child.fail = fail_node.children[char] if (fail_node and char in fail_node.children) else self.root
+                queue.append(child)
+
+    def match(self, text):
+        """匹配文本中所有出现的违禁词（去重）"""
+        result = set()
+        node = self.root
+        for char in text:
+            # 沿失败链查找可用节点
+            while node is not None and char not in node.children:
+                node = node.fail
+            # 重置到根节点（如果没找到）
+            node = node.children[char] if (node and char in node.children) else self.root
+
+            # 收集所有匹配的违禁词（包括失败链上的）
+            temp = node
+            while temp != self.root:
+                if temp.is_end:
+                    result.add(temp.word)
+                temp = temp.fail
+        return list(result)
+
+
+# 全局变量
 _ocr_engine = None
-_forbidden_words = set()
+_ac_automaton = None  # 替换原有的_forbidden_words集合
 _conf_threshold = 0.5

+
 def set_forbidden_words(new_words):
-    global _forbidden_words
+    """更新违禁词（使用AC自动机存储）"""
+    global _ac_automaton
    if not isinstance(new_words, (set, list, tuple)):
        raise TypeError("新违禁词必须是集合、列表或元组类型")
-    _forbidden_words = set(new_words)  # 确保是集合类型
-    print(f"已通过函数更新违禁词，当前数量: {len(_forbidden_words)}")
+
+    # 初始化AC自动机并添加有效词
+    _ac_automaton = AhoCorasick()
+    valid_words = [word for word in new_words if isinstance(word, str) and word.strip()]
+    for word in valid_words:
+        _ac_automaton.add_word(word.strip())
+    # 构建失败指针（关键步骤）
+    _ac_automaton.build_fail()
+
+    print(f"已通过函数更新违禁词，当前数量: {len(valid_words)}")
+

 def load_forbidden_words():
-    global _forbidden_words
+    """从敏感词服务加载违禁词并初始化AC自动机"""
+    global _ac_automaton
    try:
-        _forbidden_words = get_all_sensitive_words()
-        print(f"加载的违禁词数量: {len(_forbidden_words)}")
+        sensitive_words = get_all_sensitive_words()  # 保持原接口不变（返回list[str]）
+        _ac_automaton = AhoCorasick()
+
+        # 添加所有有效敏感词
+        valid_words = [word for word in sensitive_words if isinstance(word, str) and word.strip()]
+        for word in valid_words:
+            _ac_automaton.add_word(word.strip())
+
+        # 构建失败指针
+        _ac_automaton.build_fail()
+        print(f"加载的违禁词数量: {len(valid_words)}")
+        return True
    except Exception as e:
        print(f"Forbidden words load error: {e}")
        return False
-    return True


 def init_ocr_engine():
+    """初始化OCR引擎和违禁词自动机"""
    global _ocr_engine
    try:
        _ocr_engine = PaddleOCR(
@ -52,34 +143,39 @@ def init_ocr_engine():


 def detect(frame, conf_threshold=0.8):
+    """检测帧中的文本是否包含违禁词（拆分OCR和匹配时间）"""
    print("开始进行OCR检测...")
+    total_start = time.time()  # 总耗时开始
+    ocr_time = 0.0  # OCR及结果解析耗时
+    match_time = 0.0  # 违禁词匹配耗时
+
    try:
+        if not _ocr_engine or not _ac_automaton:
+            return (False, "OCR引擎或违禁词库未初始化")
+
+        # 1. OCR识别及结果解析阶段
+        ocr_start = time.time()
        ocr_res = _ocr_engine.ocr(frame, cls=True)
        if not ocr_res or not isinstance(ocr_res, list):
            return (False, "无OCR结果")

        texts = []
        confs = []
+        # 解析OCR结果
        for line in ocr_res:
            if line is None:
                continue
-            if isinstance(line, list):
-                items_to_process = line
-            else:
-                items_to_process = [line]
+            items_to_process = line if isinstance(line, list) else [line]

            for item in items_to_process:
+                # 过滤坐标类数据
                if isinstance(item, list) and len(item) == 4:
-                    is_coordinate = True
-                    for point in item:
-                        if not (isinstance(point, list) and len(point) == 2 and
-                                all(isinstance(coord, (int, float)) for coord in point)):
-                            is_coordinate = False
-                            break
+                    is_coordinate = all(isinstance(p, list) and len(p) == 2 and
+                                        all(isinstance(c, (int, float)) for c in p)
+                                        for p in item)
                    if is_coordinate:
                        continue
-                if isinstance(item, list) and all(isinstance(x, (int, float)) for x in item):
-                    continue
+                # 提取文本和置信度
                if isinstance(item, tuple) and len(item) == 2:
                    text, conf = item
                    if isinstance(text, str) and isinstance(conf, (int, float)):
@ -98,22 +194,26 @@ def detect(frame, conf_threshold=0.8):
                        texts.append(text_data.strip())
                        confs.append(1.0)
                        continue
-                print(f"无法解析的OCR结果格式: {item}")
+        ocr_end = time.time()
+        ocr_time = ocr_end - ocr_start  # 计算OCR阶段耗时

        if len(texts) != len(confs):
            return (False, "OCR结果格式异常")

-        # 收集所有识别到的违禁词（去重且保持出现顺序）
+        # 2. 违禁词匹配阶段
+        match_start = time.time()
        vio_words = []
        for txt, conf in zip(texts, confs):
-            if conf < _conf_threshold:  # 过滤低置信度结果
+            if conf < _conf_threshold:
                continue
-            # 提取当前文本中包含的违禁词
-            matched = [w for w in _forbidden_words if w in txt]
-            # 仅添加未记录过的违禁词（去重）
-            for word in matched:
+            # 用AC自动机匹配当前文本中的所有违禁词
+            matched_words = _ac_automaton.match(txt)
+            # 全局去重并保持顺序
+            for word in matched_words:
                if word not in vio_words:
                    vio_words.append(word)
+        match_end = time.time()
+        match_time = match_end - match_start  # 计算匹配阶段耗时

        has_text = len(texts) > 0
        has_violation = len(vio_words) > 0
@ -121,11 +221,17 @@ def detect(frame, conf_threshold=0.8):
        if not has_text:
            return (False, "未识别到文本")
        elif has_violation:
-            # 多个违禁词用逗号拼接
            return (True, ", ".join(vio_words))
        else:
            return (False, "未检测到违禁词")

    except Exception as e:
        print(f"OCR detect error: {e}")
-        return (False, f"检测错误: {str(e)}")
+        return (False, f"检测错误: {str(e)}")
+    finally:
+        # 打印各阶段耗时
+        total_time = time.time() - total_start
+        print(f"当前帧耗时明细：")
+        print(f"  OCR识别及解析：{ocr_time:.8f}秒")
+        print(f"  违禁词匹配：{match_time:.8f}秒")
+        print(f"  总耗时：{total_time:.8f}秒")