fix: refactor image handling to use numpy arrays instead of PIL images

myhloli 2 months ago
parent
commit
2fcffcb0af
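
The gist of the change: each page is converted to a numpy array once, and crops are taken by array slicing instead of PIL's crop(). A minimal standalone sketch of that equivalence (the names below are illustrative, not the project's API):

    import numpy as np
    from PIL import Image

    # Convert a PIL page image to an RGB numpy array once, up front.
    pil_page = Image.new("RGB", (200, 100), color="white")
    np_page = np.asarray(pil_page)                # shape (H, W, 3), RGB channel order

    # PIL crops with a (left, upper, right, lower) box ...
    pil_crop = pil_page.crop((10, 20, 60, 80))

    # ... which maps to numpy slicing: rows (y) first, then columns (x).
    np_crop = np_page[20:80, 10:60]

    assert np_crop.shape == (60, 50, 3)
    assert np.array_equal(np.asarray(pil_crop), np_crop)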

+ 20 - 21
mineru/backend/pipeline/batch_analyze.py

@@ -9,7 +9,7 @@ from .model_list import AtomicModel
 from ...utils.config_reader import get_formula_enable, get_table_enable
 from ...utils.model_utils import crop_img, get_res_list_from_layout_res
 from ...utils.ocr_utils import get_adjusted_mfdetrec_res, get_ocr_result_list, OcrConfidence
-from ...utils.pdf_image_tools import get_crop_img
+from ...utils.pdf_image_tools import get_crop_np_img
 
 YOLO_LAYOUT_BASE_BATCH_SIZE = 1
 MFD_BASE_BATCH_SIZE = 1
@@ -38,29 +38,28 @@ class BatchAnalyze:
         )
         atom_model_manager = AtomModelSingleton()
 
-        images = [image for image, _, _ in images_with_extra_info]
+        np_images = [np.asarray(image) for image, _, _ in images_with_extra_info]
 
         # doclayout_yolo
-        layout_images = images.copy()
 
         images_layout_res += self.model.layout_model.batch_predict(
-            layout_images, YOLO_LAYOUT_BASE_BATCH_SIZE
+            np_images, YOLO_LAYOUT_BASE_BATCH_SIZE
         )
 
         if self.formula_enable:
             # Formula detection
             images_mfd_res = self.model.mfd_model.batch_predict(
-                images, MFD_BASE_BATCH_SIZE
+                np_images, MFD_BASE_BATCH_SIZE
             )
 
             # Formula recognition
             images_formula_list = self.model.mfr_model.batch_predict(
                 images_mfd_res,
-                images,
+                np_images,
                 batch_size=self.batch_ratio * MFR_BASE_BATCH_SIZE,
             )
             mfr_count = 0
-            for image_index in range(len(images)):
+            for image_index in range(len(np_images)):
                 images_layout_res[image_index] += images_formula_list[image_index]
                 mfr_count += len(images_formula_list[image_index])
 
@@ -69,10 +68,10 @@ class BatchAnalyze:
 
         ocr_res_list_all_page = []
         table_res_list_all_page = []
-        for index in range(len(images)):
+        for index in range(len(np_images)):
             _, ocr_enable, _lang = images_with_extra_info[index]
             layout_res = images_layout_res[index]
-            pil_img = images[index]
+            np_img = np_images[index]
 
             ocr_res_list, table_res_list, single_page_mfdetrec_res = (
                 get_res_list_from_layout_res(layout_res)
@@ -81,7 +80,7 @@ class BatchAnalyze:
             ocr_res_list_all_page.append({'ocr_res_list':ocr_res_list,
                                           'lang':_lang,
                                           'ocr_enable':ocr_enable,
-                                          'pil_img':pil_img,
+                                          'np_img':np_img,
                                           'single_page_mfdetrec_res':single_page_mfdetrec_res,
                                           'layout_res':layout_res,
                                           })
@@ -93,7 +92,7 @@ class BatchAnalyze:
                 crop_xmin, crop_ymin = int(table_res['poly'][0]), int(table_res['poly'][1])
                 crop_xmax, crop_ymax = int(table_res['poly'][4]), int(table_res['poly'][5])
                 bbox = (int(crop_xmin/scale), int(crop_ymin/scale), int(crop_xmax/scale), int(crop_ymax/scale))
-                table_img = get_crop_img(bbox, pil_img, scale=scale)
+                table_img = get_crop_np_img(bbox, np_img, scale=scale)
 
                 table_res_list_all_page.append({'table_res':table_res,
                                                 'lang':_lang,
@@ -111,17 +110,17 @@ class BatchAnalyze:
 
                 for res in ocr_res_list_dict['ocr_res_list']:
                     new_image, useful_list = crop_img(
-                        res, ocr_res_list_dict['pil_img'], crop_paste_x=50, crop_paste_y=50
+                        res, ocr_res_list_dict['np_img'], crop_paste_x=50, crop_paste_y=50
                     )
                     adjusted_mfdetrec_res = get_adjusted_mfdetrec_res(
                         ocr_res_list_dict['single_page_mfdetrec_res'], useful_list
                     )
 
                     # Convert to BGR
-                    new_image = cv2.cvtColor(np.asarray(new_image), cv2.COLOR_RGB2BGR)
+                    bgr_image = cv2.cvtColor(new_image, cv2.COLOR_RGB2BGR)
 
                     all_cropped_images_info.append((
-                        new_image, useful_list, ocr_res_list_dict, res, adjusted_mfdetrec_res, _lang
+                        bgr_image, useful_list, ocr_res_list_dict, res, adjusted_mfdetrec_res, _lang
                     ))
 
             # Group by language
@@ -186,7 +185,7 @@ class BatchAnalyze:
 
                     # Process batch results
                     for i, (crop_info, (dt_boxes, elapse)) in enumerate(zip(group_crops, batch_results)):
-                        new_image, useful_list, ocr_res_list_dict, res, adjusted_mfdetrec_res, _lang = crop_info
+                        bgr_image, useful_list, ocr_res_list_dict, res, adjusted_mfdetrec_res, _lang = crop_info
 
                         if dt_boxes is not None and len(dt_boxes) > 0:
                             # Directly apply the key processing steps from the original OCR pipeline
@@ -217,7 +216,7 @@ class BatchAnalyze:
 
                             if ocr_res:
                                 ocr_result_list = get_ocr_result_list(
-                                    ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], new_image, _lang
+                                    ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], bgr_image, _lang
                                 )
 
                                 ocr_res_list_dict['layout_res'].extend(ocr_result_list)
@@ -235,21 +234,21 @@ class BatchAnalyze:
                 )
                 for res in ocr_res_list_dict['ocr_res_list']:
                     new_image, useful_list = crop_img(
-                        res, ocr_res_list_dict['pil_img'], crop_paste_x=50, crop_paste_y=50
+                        res, ocr_res_list_dict['np_img'], crop_paste_x=50, crop_paste_y=50
                     )
                     adjusted_mfdetrec_res = get_adjusted_mfdetrec_res(
                         ocr_res_list_dict['single_page_mfdetrec_res'], useful_list
                     )
                     # OCR-det
-                    new_image = cv2.cvtColor(np.asarray(new_image), cv2.COLOR_RGB2BGR)
+                    bgr_image = cv2.cvtColor(new_image, cv2.COLOR_RGB2BGR)
                     ocr_res = ocr_model.ocr(
-                        new_image, mfd_res=adjusted_mfdetrec_res, rec=False
+                        bgr_image, mfd_res=adjusted_mfdetrec_res, rec=False
                     )[0]
 
                     # Integration results
                     if ocr_res:
                         ocr_result_list = get_ocr_result_list(
-                            ocr_res, useful_list, ocr_res_list_dict['ocr_enable'],new_image, _lang
+                            ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], bgr_image, _lang
                         )
 
                         ocr_res_list_dict['layout_res'].extend(ocr_result_list)
@@ -273,7 +272,7 @@ class BatchAnalyze:
                     )
                     rotate_label = "0"
 
-                np_table_img = np.asarray(table_res_dict["table_img"])
+                np_table_img = table_res_dict["table_img"]
                 if rotate_label == "270":
                     np_table_img = cv2.rotate(np_table_img, cv2.ROTATE_90_CLOCKWISE)
                 elif rotate_label == "90":
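
Because crop_img now yields numpy arrays, a cropped region can be handed to OpenCV directly, without the np.asarray() round-trip the old PIL path needed before cv2.cvtColor and cv2.rotate. A small sketch of that flow (dummy data, not the pipeline's own objects):

    import cv2
    import numpy as np

    rgb_crop = np.zeros((60, 50, 3), dtype=np.uint8)  # stand-in for a cropped RGB page region
    rgb_crop[..., 0] = 255                            # pure red in RGB channel order

    # No PIL-to-array conversion needed: the array goes straight into OpenCV.
    bgr_crop = cv2.cvtColor(rgb_crop, cv2.COLOR_RGB2BGR)
    assert bgr_crop[0, 0].tolist() == [0, 0, 255]     # red now sits in the last channel (BGR)

    # The same array also feeds cv2.rotate directly, as in the table-rotation branch.
    rotated = cv2.rotate(bgr_crop, cv2.ROTATE_90_CLOCKWISE)
    assert rotated.shape[:2] == (50, 60)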

+ 2 - 2
mineru/model/mfr/unimernet/Unimernet.py

@@ -70,7 +70,7 @@ class UnimernetModel(object):
         # Collect images with their original indices
         for image_index in range(len(images_mfd_res)):
             mfd_res = images_mfd_res[image_index]
-            pil_img = images[image_index]
+            image = images[image_index]
             formula_list = []
 
             for idx, (xyxy, conf, cla) in enumerate(zip(
@@ -84,7 +84,7 @@ class UnimernetModel(object):
                     "latex": "",
                 }
                 formula_list.append(new_item)
-                bbox_img = pil_img.crop((xmin, ymin, xmax, ymax))
+                bbox_img = image[ymin:ymax, xmin:xmax]
                 area = (xmax - xmin) * (ymax - ymin)
 
                 curr_idx = len(mf_image_list)
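
One semantic difference to keep in mind with this replacement: PIL's crop() returns a new image, whereas numpy slicing returns a view into the page array, so a later in-place write to the crop would also modify the page. An explicit .copy() restores the old behavior when the crop must be independent; a tiny illustration:

    import numpy as np

    page = np.zeros((100, 200, 3), dtype=np.uint8)
    bbox_img = page[10:20, 30:40]            # basic slicing: a view, no data copied
    bbox_img[:] = 255
    assert page[15, 35, 0] == 255            # the write shows up in the page array

    independent = page[10:20, 30:40].copy()  # .copy() when the crop must not alias the page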

+ 2 - 2
mineru/utils/ocr_utils.py

@@ -330,10 +330,10 @@ def get_adjusted_mfdetrec_res(single_page_mfdetrec_res, useful_list):
     return adjusted_mfdetrec_res
 
 
-def get_ocr_result_list(ocr_res, useful_list, ocr_enable, new_image, lang):
+def get_ocr_result_list(ocr_res, useful_list, ocr_enable, bgr_image, lang):
     paste_x, paste_y, xmin, ymin, xmax, ymax, new_width, new_height = useful_list
     ocr_result_list = []
-    ori_im = new_image.copy()
+    ori_im = bgr_image.copy()
     for box_ocr_res in ocr_res:
 
         if len(box_ocr_res) == 2:

+ 19 - 0
mineru/utils/pdf_image_tools.py

@@ -1,6 +1,7 @@
 # Copyright (c) Opendatalab. All rights reserved.
 from io import BytesIO
 
+import numpy as np
 import pypdfium2 as pdfium
 from loguru import logger
 from PIL import Image
@@ -91,6 +92,24 @@ def get_crop_img(bbox: tuple, pil_img, scale=2):
     return pil_img.crop(scale_bbox)
 
 
+def get_crop_np_img(bbox: tuple, input_img, scale=2):
+
+    if isinstance(input_img, Image.Image):
+        np_img = np.asarray(input_img)
+    elif isinstance(input_img, np.ndarray):
+        np_img = input_img
+    else:
+        raise ValueError("Input must be a PIL Image or a numpy array.")
+
+    scale_bbox = (
+        int(bbox[0] * scale),
+        int(bbox[1] * scale),
+        int(bbox[2] * scale),
+        int(bbox[3] * scale),
+    )
+
+    return np_img[scale_bbox[1]:scale_bbox[3], scale_bbox[0]:scale_bbox[2]]
+
 def images_bytes_to_pdf_bytes(image_bytes):
     # In-memory buffer
     pdf_buffer = BytesIO()
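
A usage sketch for the new get_crop_np_img helper, based on the code above: bbox is given in unscaled page coordinates and is multiplied by scale before slicing, and either a PIL image or a numpy array can be passed in (assuming the package is importable as mineru):

    import numpy as np
    from PIL import Image

    from mineru.utils.pdf_image_tools import get_crop_np_img

    page = Image.new("RGB", (400, 300))                # page rendered at scale=2
    crop = get_crop_np_img((10, 20, 60, 80), page, scale=2)

    # scale_bbox becomes (20, 40, 120, 160), so the slice is 120 rows by 100 columns.
    assert isinstance(crop, np.ndarray)
    assert crop.shape == (120, 100, 3)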