|
|
@@ -1,4 +1,5 @@
|
|
|
import os
|
|
|
+import html
|
|
|
import cv2
|
|
|
import numpy as np
|
|
|
from loguru import logger
|
|
|
@@ -8,6 +9,11 @@ from mineru.utils.enum_class import ModelPath
|
|
|
from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
|
|
|
|
|
|
|
|
|
+def escape_html(input_string):
|
|
|
+ """Return input_string with &, <, > and quote characters replaced by HTML entities (via html.escape)."""
|
|
|
+ return html.escape(input_string)
|
|
|
+
|
|
|
+
|
|
|
class RapidTableModel(object):
|
|
|
def __init__(self, ocr_engine):
|
|
|
slanet_plus_model_path = os.path.join(auto_download_and_get_model_root_path(ModelPath.slanet_plus), ModelPath.slanet_plus)
|
|
|
@@ -63,7 +69,7 @@ class RapidTableModel(object):
|
|
|
# Continue with OCR on potentially rotated image
|
|
|
ocr_result = self.ocr_engine.ocr(bgr_image)[0]
|
|
|
if ocr_result:
|
|
|
- ocr_result = [[item[0], item[1][0], item[1][1]] for item in ocr_result if
|
|
|
+ ocr_result = [[item[0], escape_html(item[1][0]), item[1][1]] for item in ocr_result if
|
|
|
len(item) == 2 and isinstance(item[1], tuple)]
|
|
|
else:
|
|
|
ocr_result = None
|