Browse Source

Fix: 表格内容中的HTML Entity会导致表格内容错乱 [#2694]

Li Xia, 5 tháng trước
mục cha
commit
3854bd0fa0
2 tập tin đã thay đổi với 15 bổ sung và 1 xóa
  1. +7 −1
      mineru/model/table/rapid_table.py
  2. +8 −0
      signatures/version1/cla.json

+ 7 - 1
mineru/model/table/rapid_table.py

@@ -1,4 +1,5 @@
 import os
+import html
 import cv2
 import numpy as np
 from loguru import logger
@@ -8,6 +9,11 @@ from mineru.utils.enum_class import ModelPath
 from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
 
 
+def escape_html(input_string):
+    """Escape HTML Entities."""
+    return html.escape(input_string)
+
+
 class RapidTableModel(object):
     def __init__(self, ocr_engine):
         slanet_plus_model_path = os.path.join(auto_download_and_get_model_root_path(ModelPath.slanet_plus), ModelPath.slanet_plus)
@@ -63,7 +69,7 @@ class RapidTableModel(object):
         # Continue with OCR on potentially rotated image
         ocr_result = self.ocr_engine.ocr(bgr_image)[0]
         if ocr_result:
-            ocr_result = [[item[0], item[1][0], item[1][1]] for item in ocr_result if
+            ocr_result = [[item[0], escape_html(item[1][0]), item[1][1]] for item in ocr_result if
                       len(item) == 2 and isinstance(item[1], tuple)]
         else:
             ocr_result = None

+ 8 - 0
signatures/version1/cla.json

@@ -319,6 +319,14 @@
       "created_at": "2025-06-17T03:09:54Z",
       "repoId": 765083837,
       "pullRequestNo": 2676
+    },
+    {
+      "name": "hsia",
+      "id": 654127,
+      "comment_id": 2979415817,
+      "created_at": "2025-06-17T17:35:10Z",
+      "repoId": 765083837,
+      "pullRequestNo": 2699
     }
   ]
 }