@@ -1,17 +1,15 @@
 from transformers.image_processing_utils import BaseImageProcessor
-from PIL import Image, ImageOps
 import numpy as np
 import cv2
 import albumentations as alb
 from albumentations.pytorch import ToTensorV2
-from torchvision.transforms.functional import resize


 # TODO: dereference cv2 if possible
 class UnimerSwinImageProcessor(BaseImageProcessor):
     def __init__(
         self,
-        image_size = [192, 672],
+        image_size = (192, 672),
     ):
         self.input_size = [int(_) for _ in image_size]
         assert len(self.input_size) == 2
@@ -27,56 +25,90 @@ class UnimerSwinImageProcessor(BaseImageProcessor):

     def __call__(self, item):
         image = self.prepare_input(item)
-        return self.transform(image=np.array(image))['image'][:1]
+        return self.transform(image=image)['image'][:1]

     @staticmethod
-    def crop_margin(img: Image.Image) -> Image.Image:
-        data = np.array(img.convert("L"))
-        data = data.astype(np.uint8)
-        max_val = data.max()
-        min_val = data.min()
-        if max_val == min_val:
+    def crop_margin_numpy(img: np.ndarray) -> np.ndarray:
+        """Crop margins of image using NumPy operations"""
+        # Convert to grayscale if it's a color image
+        if len(img.shape) == 3 and img.shape[2] == 3:
+            gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
+        else:
+            gray = img.copy()
+
+        # Normalize and threshold
+        if gray.max() == gray.min():
             return img
-        data = (data - min_val) / (max_val - min_val) * 255
-        gray = 255 * (data < 200).astype(np.uint8)

-        coords = cv2.findNonZero(gray)  # Find all non-zero points (text)
-        a, b, w, h = cv2.boundingRect(coords)  # Find minimum spanning bounding box
-        return img.crop((a, b, w + a, h + b))
+        normalized = (((gray - gray.min()) / (gray.max() - gray.min())) * 255).astype(np.uint8)
+        binary = 255 * (normalized < 200).astype(np.uint8)
+
+        # Find bounding box
+        coords = cv2.findNonZero(binary)  # Find all non-zero points (text)
+        x, y, w, h = cv2.boundingRect(coords)  # Find minimum spanning bounding box

-    def prepare_input(self, img: Image.Image, random_padding: bool = False):
+        # Return cropped image
+        return img[y:y + h, x:x + w]
+
+    def prepare_input(self, img, random_padding: bool = False):
         """
-        Convert PIL Image to tensor according to specified input_size after following steps below:
-            - resize
-            - rotate (if align_long_axis is True and image is not aligned longer axis with canvas)
-            - pad
+        Convert PIL Image or numpy array to properly sized and padded image after:
+            - crop margins
+            - resize while maintaining aspect ratio
+            - pad to target size
         """
         if img is None:
-            return
-        # crop margins
+            return None
+
         try:
-            img = self.crop_margin(img.convert("RGB"))
-        except OSError:
+            img = self.crop_margin_numpy(img)
+        except Exception:
             # might throw an error for broken files
-            return
+            return None
+
+        if img.shape[0] == 0 or img.shape[1] == 0:
+            return None
+
+        # Resize while preserving aspect ratio
+        h, w = img.shape[:2]
+        scale = min(self.input_size[0] / h, self.input_size[1] / w)
+        new_h, new_w = int(h * scale), int(w * scale)
+        resized_img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
+
+        # Calculate padding
+        pad_width, pad_height = self._get_padding_values(new_w, new_h, random_padding)
+
+        # Create and apply padding
+        channels = 3 if len(img.shape) == 3 else 1
+        padded_img = np.full((self.input_size[0], self.input_size[1], channels), 255, dtype=np.uint8)
+        padded_img[pad_height:pad_height + new_h, pad_width:pad_width + new_w] = resized_img
+
+        return padded_img
+
+    def _calculate_padding(self, new_w, new_h, random_padding):
+        """Calculate padding values for PIL images"""
+        delta_width = self.input_size[1] - new_w
+        delta_height = self.input_size[0] - new_h
+
+        pad_width, pad_height = self._get_padding_values(new_w, new_h, random_padding)

-        if img.height == 0 or img.width == 0:
-            return
+        return (
+            pad_width,
+            pad_height,
+            delta_width - pad_width,
+            delta_height - pad_height,
+        )
+
+    def _get_padding_values(self, new_w, new_h, random_padding):
+        """Get padding values based on image dimensions and padding strategy"""
+        delta_width = self.input_size[1] - new_w
+        delta_height = self.input_size[0] - new_h

-        img = resize(img, min(self.input_size))
-        img.thumbnail((self.input_size[1], self.input_size[0]))
-        delta_width = self.input_size[1] - img.width
-        delta_height = self.input_size[0] - img.height
         if random_padding:
             pad_width = np.random.randint(low=0, high=delta_width + 1)
             pad_height = np.random.randint(low=0, high=delta_height + 1)
         else:
             pad_width = delta_width // 2
             pad_height = delta_height // 2
-        padding = (
-            pad_width,
-            pad_height,
-            delta_width - pad_width,
-            delta_height - pad_height,
-        )
-        return ImageOps.expand(img, padding)
+
+        return pad_width, pad_height
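
For reviewers, here is a minimal, self-contained sketch of the resize-and-pad geometry introduced in `prepare_input` (with `random_padding=False`). The dummy image, the `input_size` value, and the variable names are illustrative only and are not part of the change; it simply mirrors the cv2/NumPy path added in the diff to show that the output canvas always has shape `(input_size[0], input_size[1], channels)` with white padding.

```python
import numpy as np
import cv2

# Target canvas (height, width), matching the default image_size in the diff.
input_size = (192, 672)

# Dummy "document" image: a white page with a dark band of text-like pixels.
img = np.full((300, 500, 3), 255, dtype=np.uint8)
img[100:140, 50:450] = 30

# Resize while preserving aspect ratio, as prepare_input does.
h, w = img.shape[:2]
scale = min(input_size[0] / h, input_size[1] / w)
new_h, new_w = int(h * scale), int(w * scale)
resized = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)

# Center the resized image on a white canvas (the random_padding=False branch).
pad_w = (input_size[1] - new_w) // 2
pad_h = (input_size[0] - new_h) // 2
canvas = np.full((input_size[0], input_size[1], 3), 255, dtype=np.uint8)
canvas[pad_h:pad_h + new_h, pad_w:pad_w + new_w] = resized

print(canvas.shape)  # (192, 672, 3), regardless of the input aspect ratio
```

The albumentations pipeline in `self.transform` (defined outside this diff) then consumes the padded array directly, which is why `__call__` no longer wraps the input in `np.array(...)`.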