2 månader sedan · 512f40fdfb
--- a/mineru/backend/pipeline/model_init.py
+++ b/mineru/backend/pipeline/model_init.py
@@ -33,7 +33,7 @@ def table_cls_model_init():
 
				     return PaddleTableClsModel()
			
 
				 
			
 
				 
			
 
				-def wired_table_model_init(lang=None):
			
 
				+def wired_table_model_init(lang="ch"):
			
 
				     atom_model_manager = AtomModelSingleton()
			
 
				     ocr_engine = atom_model_manager.get_atom_model(
			
 
				         atom_model_name=AtomicModel.OCR,
			
@@ -46,7 +46,7 @@ def wired_table_model_init(lang=None):
 
				     return table_model
			
 
				 
			
 
				 
			
 
				-def wireless_table_model_init(lang=None):
			
 
				+def wireless_table_model_init(lang="ch"):
			
 
				     atom_model_manager = AtomModelSingleton()
			
 
				     ocr_engine = atom_model_manager.get_atom_model(
			
 
				         atom_model_name=AtomicModel.OCR,
			
--- a/mineru/model/table/rec/slanet_plus/main.py
+++ b/mineru/model/table/rec/slanet_plus/main.py
@@ -12,6 +12,8 @@ from typing import Dict, List, Optional, Tuple, Union
 
				 import cv2
			
 
				 import numpy as np
			
 
				 from loguru import logger
			
 
				+from tqdm import tqdm
			
 
				+
			
 
				 from .matcher import TableMatch
			
 
				 from .table_structure import TableStructurer
			
 
				 from mineru.utils.enum_class import ModelPath
			
@@ -72,40 +74,25 @@ class RapidTable:
 
				             raise ValueError(f"{self.model_type} is not supported.")
			
 
				         self.table_matcher = TableMatch()
			
 
				 
			
 
				-        try:
			
 
				-            self.ocr_engine = importlib.import_module("rapidocr").RapidOCR()
			
 
				-        except ModuleNotFoundError:
			
 
				-            self.ocr_engine = None
			
 
				-
			
 
				-    def __call__(
			
 
				+    def predict(
			
 
				         self,
			
 
				         img: np.ndarray,
			
 
				         ocr_result: List[Union[List[List[float]], str, str]] = None,
			
 
				     ) -> RapidTableOutput:
			
 
				-        if self.ocr_engine is None and ocr_result is None:
			
 
				-            raise ValueError(
			
 
				-                "One of two conditions must be met: ocr_result is not empty, or rapidocr is installed."
			
 
				-            )
			
 
				+        if ocr_result is None:
			
 
				+            raise ValueError("OCR result is None")
			
 
				 
			
 
				         s = time.perf_counter()
			
 
				         h, w = img.shape[:2]
			
 
				 
			
 
				-        if ocr_result is None:
			
 
				-            ocr_result = self.ocr_engine(img)
			
 
				-            ocr_result = list(
			
 
				-                zip(
			
 
				-                    ocr_result.boxes,
			
 
				-                    ocr_result.txts,
			
 
				-                    ocr_result.scores,
			
 
				-                )
			
 
				-            )
			
 
				         dt_boxes, rec_res = self.get_boxes_recs(ocr_result, h, w)
			
 
				 
			
 
				-        pred_structures, cell_bboxes, _ = self.table_structure(copy.deepcopy(img))
			
 
				+        pred_structures, cell_bboxes, _ = self.table_structure.process(
			
 
				+            copy.deepcopy(img)
			
 
				+        )
			
 
				 
			
 
				         # 适配slanet-plus模型输出的box缩放还原
			
 
				-        if self.model_type == ModelType.SLANETPLUS.value:
			
 
				-            cell_bboxes = self.adapt_slanet_plus(img, cell_bboxes)
			
 
				+        cell_bboxes = self.adapt_slanet_plus(img, cell_bboxes)
			
 
				 
			
 
				         pred_html = self.table_matcher(pred_structures, cell_bboxes, dt_boxes, rec_res)
			
 
				 
			
@@ -117,6 +104,50 @@ class RapidTable:
 
				         elapse = time.perf_counter() - s
			
 
				         return RapidTableOutput(pred_html, cell_bboxes, logic_points, elapse)
			
 
				 
			
 
    def batch_predict(
        self,
        images: List[np.ndarray],
        ocr_results: List[List[Union[List[List[float]], str, str]]],
        batch_size: int = 4,
    ) -> List["RapidTableOutput"]:
        """Recognize several table images with a single structure-model call.

        Args:
            images: Table images; each one must expose ``shape`` with height
                and width as its first two entries.
            ocr_results: OCR output for each image, index-aligned with
                ``images`` (``ocr_results[i]`` belongs to ``images[i]``).
            batch_size: Accepted for interface compatibility. This method does
                not read it: all images are handed to
                ``self.table_structure.batch_process`` in one call.

        Returns:
            One ``RapidTableOutput`` per structure result, in input order.
            ``elapse`` on every output is the total wall time of this call
            divided by the number of outputs. An empty ``images`` yields an
            empty list without touching the models.
        """
        # Nothing to recognize: avoid calling the structure model on an empty batch.
        if len(images) == 0:
            return []

        s = time.perf_counter()

        # Split every OCR result into detection boxes and recognition results.
        batch_dt_boxes = []
        batch_rec_res = []
        for i, img in enumerate(images):
            h, w = img.shape[:2]
            dt_boxes, rec_res = self.get_boxes_recs(ocr_results[i], h, w)
            batch_dt_boxes.append(dt_boxes)
            batch_rec_res.append(rec_res)

        # Batched table-structure recognition.
        batch_results = self.table_structure.batch_process(images)

        output_results = []
        for i, (img, (pred_structures, cell_bboxes, _)) in enumerate(
            zip(images, batch_results)
        ):
            # Undo the box scaling of the slanet-plus model output.
            cell_bboxes = self.adapt_slanet_plus(img, cell_bboxes)
            # The matcher receives the unfiltered boxes; placeholders are
            # removed only afterwards.
            pred_html = self.table_matcher(
                pred_structures, cell_bboxes, batch_dt_boxes[i], batch_rec_res[i]
            )
            # Drop placeholder boxes (rows that are entirely zero).
            mask = ~np.all(cell_bboxes == 0, axis=1)
            cell_bboxes = cell_bboxes[mask]

            logic_points = self.table_matcher.decode_logic_points(pred_structures)
            output_results.append(
                RapidTableOutput(pred_html, cell_bboxes, logic_points, 0)
            )

        # Spread the total wall time evenly over the produced outputs.
        if output_results:
            per_image_elapse = (time.perf_counter() - s) / len(output_results)
            for result in output_results:
                result.elapse = per_image_elapse

        return output_results
			
 
				+
			
 
				     def get_boxes_recs(
			
 
				         self, ocr_result: List[Union[List[List[float]], str, str]], h: int, w: int
			
 
				     ) -> Tuple[np.ndarray, Tuple[str, str]]:
			
@@ -201,7 +232,7 @@ class RapidTableModel(object):
 
				 
			
 
				         if ocr_result:
			
 
				             try:
			
 
				-                table_results = self.table_model(np.asarray(image), ocr_result)
			
 
				+                table_results = self.table_model.predict(np.asarray(image), ocr_result)
			
 
				                 html_code = table_results.pred_html
			
 
				                 table_cell_bboxes = table_results.cell_bboxes
			
 
				                 logic_points = table_results.logic_points
			
@@ -211,3 +242,37 @@ class RapidTableModel(object):
 
				                 logger.exception(e)
			
 
				 
			
 
				         return None, None, None, None
			
 
				+
			
 
    def batch_predict(self, table_res_list: List[Dict], batch_size: int = 4) -> None:
        """Run batched table recognition over a list of dicts, updating them in place.

        Every entry is read through its ``"table_img"`` and ``"ocr_result"``
        keys. When the predicted HTML contains both ``<table>`` and
        ``</table>``, the span from the first ``<table>`` to the last
        ``</table>`` is stored in ``entry["table_res"]["html"]``; otherwise a
        warning is logged and the entry is left untouched. Nothing is returned.

        Args:
            table_res_list: Entries to process; modified in place.
            batch_size: Number of entries sent to the table model per call.
        """
        total = len(table_res_list)
        batch_starts = tqdm(
            range(0, total, batch_size),
            desc=f"Table Batch Predict, total={total}, batch_size={batch_size}",
        )
        for index in batch_starts:
            positions = range(index, min(index + batch_size, total))
            # Images are converted from RGB to BGR channel order for the model.
            batch_imgs = [
                cv2.cvtColor(np.asarray(table_res_list[pos]["table_img"]), cv2.COLOR_RGB2BGR)
                for pos in positions
            ]
            batch_ocrs = [table_res_list[pos]["ocr_result"] for pos in positions]

            results = self.table_model.batch_predict(
                batch_imgs, batch_ocrs, batch_size=batch_size
            )

            # Results are written back by absolute position in table_res_list.
            for pos, result in enumerate(results, start=index):
                if not result.pred_html:
                    logger.warning(
                        "table recognition processing fails, not get html return"
                    )
                    continue

                # The HTML must contain both '<table>' and '</table>'.
                if '<table>' not in result.pred_html or '</table>' not in result.pred_html:
                    logger.warning(
                        'table recognition processing fails, not found expected HTML table end'
                    )
                    continue

                # Keep only the <table>...</table> span.
                start_index = result.pred_html.find('<table>')
                end_index = result.pred_html.rfind('</table>') + len('</table>')
                table_res_list[pos]['table_res']['html'] = result.pred_html[start_index:end_index]
			
--- a/mineru/model/table/rec/slanet_plus/table_structure.py
+++ b/mineru/model/table/rec/slanet_plus/table_structure.py
@@ -12,23 +12,29 @@
 
				 # See the License for the specific language governing permissions and
			
 
				 # limitations under the License.
			
 
				 import time
			
 
				-from typing import Any, Dict
			
 
				+from typing import Any, Dict, List, Tuple
			
 
				 
			
 
				 import numpy as np
			
 
				 
			
 
				-from .table_stucture_utils import OrtInferSession, TableLabelDecode, TablePreprocess
			
 
				+from .table_stucture_utils import (
			
 
				+    OrtInferSession,
			
 
				+    TableLabelDecode,
			
 
				+    TablePreprocess,
			
 
				+    BatchTablePreprocess,
			
 
				+)
			
 
				 
			
 
				 
			
 
				 class TableStructurer:
			
 
				     def __init__(self, config: Dict[str, Any]):
			
 
				         self.preprocess_op = TablePreprocess()
			
 
				+        self.batch_preprocess_op = BatchTablePreprocess()
			
 
				 
			
 
				         self.session = OrtInferSession(config)
			
 
				 
			
 
				         self.character = self.session.get_metadata()
			
 
				         self.postprocess_op = TableLabelDecode(self.character)
			
 
				 
			
 
				-    def __call__(self, img):
			
 
				+    def process(self, img):
			
 
				         starttime = time.time()
			
 
				         data = {"image": img}
			
 
				         data = self.preprocess_op(data)
			
@@ -56,3 +62,48 @@ class TableStructurer:
 
				         )
			
 
				         elapse = time.time() - starttime
			
 
				         return structure_str_list, bbox_list, elapse
			
 
				+
			
 
    def batch_process(
        self, img_list: List[np.ndarray]
    ) -> List[Tuple[List[str], np.ndarray, float]]:
        """Run table-structure recognition on a list of images in one session call.

        Args:
            img_list: Images to process.

        Returns:
            A list of ``(table_struct_str, cell_bboxes, elapse)`` tuples, one
            per decoded model output. ``table_struct_str`` is the decoded
            structure token list wrapped in ``<html><body><table>`` ...
            ``</table></body></html>`` tokens. ``elapse`` is the total wall
            time of this call divided by the number of preprocessed images.
            An empty list is returned when there is nothing to run, i.e.
            ``img_list`` is empty or preprocessing produced no images.
        """
        # Empty input: do not invoke preprocessing or the inference session.
        if len(img_list) == 0:
            return []

        starttime = time.perf_counter()

        preprocessed_images, shape_lists = self.batch_preprocess_op(img_list)
        # Preprocessing may drop entries; never send an empty array to the session.
        if len(preprocessed_images) == 0:
            return []

        batch_input = np.array(preprocessed_images)
        bbox_preds, struct_probs = self.session([batch_input])
        batch_size = batch_input.shape[0]

        decoded = []
        for bbox_pred, struct_prob, shape_list in zip(
            bbox_preds, struct_probs, shape_lists
        ):
            # Post-processing is applied per sample, so restore a leading
            # batch axis of size 1 on every per-sample array.
            preds = {
                "loc_preds": np.expand_dims(bbox_pred, axis=0),
                "structure_probs": np.expand_dims(struct_prob, axis=0),
            }
            post_result = self.postprocess_op(
                preds, [np.expand_dims(shape_list, axis=0)]
            )
            bbox_list = post_result["bbox_batch_list"][0]
            structure_tokens = post_result["structure_batch_list"][0][0]
            structure_str_list = (
                ["<html>", "<body>", "<table>"]
                + structure_tokens
                + ["</table>", "</body>", "</html>"]
            )
            decoded.append((structure_str_list, bbox_list))

        # Measured after post-processing so the average covers the whole call.
        per_image_elapse = (time.perf_counter() - starttime) / batch_size
        return [
            (structure_str_list, bbox_list, per_image_elapse)
            for structure_str_list, bbox_list in decoded
        ]
			
--- a/mineru/model/table/rec/slanet_plus/table_stucture_utils.py
+++ b/mineru/model/table/rec/slanet_plus/table_stucture_utils.py
@@ -443,6 +443,35 @@ class TablePreprocess:
 
				         ]
			
 
				 
			
 
				 
			
 
				+class BatchTablePreprocess:
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        self.preprocess = TablePreprocess()
			
 
				+
			
 
				+    def __call__(
			
 
				+        self, img_list: List[np.ndarray]
			
 
				+    ) -> Tuple[List[np.ndarray], List[List[float]]]:
			
 
				+        """批量处理图像
			
 
				+
			
 
				+        Args:
			
 
				+            img_list: 图像列表
			
 
				+
			
 
				+        Returns:
			
 
				+            预处理后的图像列表和形状信息列表
			
 
				+        """
			
 
				+        processed_imgs = []
			
 
				+        shape_lists = []
			
 
				+
			
 
				+        for img in img_list:
			
 
				+            if img is None:
			
 
				+                continue
			
 
				+            data = {"image": img}
			
 
				+            img_processed, shape_list = self.preprocess(data)
			
 
				+            processed_imgs.append(img_processed)
			
 
				+            shape_lists.append(shape_list)
			
 
				+        return processed_imgs, shape_lists
			
 
				+
			
 
				+
			
 
				 class ResizeTableImage:
			
 
				     def __init__(self, max_len, resize_bboxes=False, infer_mode=False):
			
 
				         super(ResizeTableImage, self).__init__()