Parcourir la source

Merge remote-tracking branch 'origin/master'

myhloli il y a 1 an
Parent
commit
ca737d5c15

+ 5 - 2
README_zh-CN.md

@@ -121,11 +121,14 @@ pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/
 下载后请将models目录移动到空间较大的ssd磁盘目录  
 
 #### 3. 拷贝配置文件并进行配置
-在仓库根目录可以获得 [magic-pdf.template.json](magic-pdf.template.json) 文件
+在仓库根目录可以获得 [magic-pdf.template.json](magic-pdf.template.json) 配置模版文件
+> ❗️务必执行以下命令将配置文件拷贝到用户目录下,否则程序将无法运行
 ```bash
 cp magic-pdf.template.json ~/magic-pdf.json
 ```
-在magic-pdf.json中配置"models-dir"为模型权重文件所在目录
+
+在用户目录中找到magic-pdf.json文件并配置"models-dir"为[2. 下载模型权重文件](#2-下载模型权重文件)中下载的模型权重文件所在目录
+> ❗️务必正确配置模型权重文件所在目录,否则会因为找不到模型文件而导致程序无法运行
 ```json
 {
   "models-dir": "/tmp/models"

+ 17 - 1
docs/FAQ_zh_cn.md

@@ -22,6 +22,7 @@ pip install magic-pdf[full-cpu]
 pip install magic-pdf
 pip install unimernet==0.1.0
 pip install matplotlib ultralytics paddleocr==2.7.3 paddlepaddle
+pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/ 
 ```
 
 ### 4.在部分较新的M芯片macOS设备上,MPS加速开启失败
@@ -82,4 +83,19 @@ pip install paddlepaddle-gpu
 model_json 指的是通过模型分析后生成的一种有特定格式的json文件。  
 如果使用 https://github.com/opendatalab/PDF-Extract-Kit 项目生成,该文件一般在项目的output目录下。  
 如果使用 MinerU 的命令行调用内置的模型分析,该文件一般在输出路径"/tmp/magic-pdf/pdf-name"下。  
-参考:https://github.com/opendatalab/MinerU/issues/128
+参考:https://github.com/opendatalab/MinerU/issues/128
+
+### 10.Linux下报错:Required dependency not installed, please install by "pip install magic-pdf[full-cpu] detectron2 --extra-index-url https://myhloli.github.io/wheels/"
+
+这种情况可以先使用 `pip list` 检查自己的依赖库列表,重点确认以下几个库是否已经安装(版本不必完全一致,已安装即可):
+```text
+opencv-contrib-python     4.6.0.66
+opencv-python             4.6.0.66
+opencv-python-headless    4.10.0.84
+paddleocr                 2.7.3
+paddlepaddle              2.6.1
+torch                     2.2.2
+torchtext                 0.17.2
+torchvision               0.17.2
+```
+如果以上依赖库均已安装,则可能是系统缺少 libgl 库,请参考 https://github.com/opendatalab/MinerU/issues/165#issuecomment-2245202282 安装 libgl 库后重试。

+ 1 - 1
magic_pdf/cli/magicpdf.py

@@ -89,7 +89,6 @@ def do_parse(
     orig_model_list = copy.deepcopy(model_list)
 
     local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
-    logger.info(f"local output dir is {local_md_dir}")
     image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir)
     image_dir = str(os.path.basename(local_image_dir))
 
@@ -163,6 +162,7 @@ def do_parse(
             path=f"{pdf_file_name}_content_list.json",
             mode=AbsReaderWriter.MODE_TXT,
         )
+    logger.info(f"local output dir is '{local_md_dir}', you can found the result in it.")
 
 
 @click.group()

+ 24 - 6
magic_pdf/libs/config_reader.py

@@ -10,14 +10,17 @@ from loguru import logger
 
 from magic_pdf.libs.commons import parse_bucket_key
 
+# 定义配置文件名常量
+CONFIG_FILE_NAME = "magic-pdf.json"
+
 
 def read_config():
     home_dir = os.path.expanduser("~")
 
-    config_file = os.path.join(home_dir, "magic-pdf.json")
+    config_file = os.path.join(home_dir, CONFIG_FILE_NAME)
 
     if not os.path.exists(config_file):
-        raise Exception(f"{config_file} not found")
+        raise FileNotFoundError(f"{config_file} not found")
 
     with open(config_file, "r") as f:
         config = json.load(f)
@@ -37,7 +40,7 @@ def get_s3_config(bucket_name: str):
         access_key, secret_key, storage_endpoint = bucket_info[bucket_name]
 
     if access_key is None or secret_key is None or storage_endpoint is None:
-        raise Exception("ak, sk or endpoint not found in magic-pdf.json")
+        raise Exception(f"ak, sk or endpoint not found in {CONFIG_FILE_NAME}")
 
     # logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}")
 
@@ -56,17 +59,32 @@ def get_bucket_name(path):
 
 def get_local_dir():
     config = read_config()
-    return config.get("temp-output-dir", "/tmp")
+    local_dir = config.get("temp-output-dir")
+    if local_dir is None:
+        logger.warning(f"'temp-output-dir' not found in {CONFIG_FILE_NAME}, use '/tmp' as default")
+        return "/tmp"
+    else:
+        return local_dir
 
 
 def get_local_models_dir():
     config = read_config()
-    return config.get("models-dir", "/tmp/models")
+    models_dir = config.get("models-dir")
+    if models_dir is None:
+        logger.warning(f"'models-dir' not found in {CONFIG_FILE_NAME}, use '/tmp/models' as default")
+        return "/tmp/models"
+    else:
+        return models_dir
 
 
 def get_device():
     config = read_config()
-    return config.get("device-mode", "cpu")
+    device = config.get("device-mode")
+    if device is None:
+        logger.warning(f"'device-mode' not found in {CONFIG_FILE_NAME}, use 'cpu' as default")
+        return "cpu"
+    else:
+        return device
 
 
 if __name__ == "__main__":

+ 41 - 34
magic_pdf/model/pdf_extract_kit.py

@@ -1,6 +1,8 @@
 from loguru import logger
 import os
 import time
+
+os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
 try:
     import cv2
     import yaml
@@ -17,14 +19,17 @@ try:
     import unimernet.tasks as tasks
     from unimernet.processors import load_processor
 
-    from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
-    from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
-    from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
 except ImportError as e:
     logger.exception(e)
-    logger.error('Required dependency not installed, please install by \n"pip install magic-pdf[full] detectron2 --extra-index-url https://myhloli.github.io/wheels/"')
+    logger.error(
+        'Required dependency not installed, please install by \n'
+        '"pip install magic-pdf[full] detectron2 --extra-index-url https://myhloli.github.io/wheels/"')
     exit(1)
 
+from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
+from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
+from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
+
 
 def mfd_model_init(weight):
     mfd_model = YOLO(weight)
@@ -100,6 +105,7 @@ class CustomPEKModel:
         self.device = kwargs.get("device", self.configs["config"]["device"])
         logger.info("using device: {}".format(self.device))
         models_dir = kwargs.get("models_dir", os.path.join(root_dir, "resources", "models"))
+        logger.info("using models_dir: {}".format(models_dir))
 
         # 初始化公式识别
         if self.apply_formula:
@@ -135,34 +141,35 @@ class CustomPEKModel:
         layout_cost = round(time.time() - layout_start, 2)
         logger.info(f"layout detection cost: {layout_cost}")
 
-        # 公式检测
-        mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True)[0]
-        for xyxy, conf, cla in zip(mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()):
-            xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
-            new_item = {
-                'category_id': 13 + int(cla.item()),
-                'poly': [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
-                'score': round(float(conf.item()), 2),
-                'latex': '',
-            }
-            layout_res.append(new_item)
-            latex_filling_list.append(new_item)
-            bbox_img = get_croped_image(Image.fromarray(image), [xmin, ymin, xmax, ymax])
-            mf_image_list.append(bbox_img)
-
-        # 公式识别
-        mfr_start = time.time()
-        dataset = MathDataset(mf_image_list, transform=self.mfr_transform)
-        dataloader = DataLoader(dataset, batch_size=64, num_workers=0)
-        mfr_res = []
-        for mf_img in dataloader:
-            mf_img = mf_img.to(self.device)
-            output = self.mfr_model.generate({'image': mf_img})
-            mfr_res.extend(output['pred_str'])
-        for res, latex in zip(latex_filling_list, mfr_res):
-            res['latex'] = latex_rm_whitespace(latex)
-        mfr_cost = round(time.time() - mfr_start, 2)
-        logger.info(f"formula nums: {len(mf_image_list)}, mfr time: {mfr_cost}")
+        if self.apply_formula:
+            # 公式检测
+            mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True)[0]
+            for xyxy, conf, cla in zip(mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()):
+                xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
+                new_item = {
+                    'category_id': 13 + int(cla.item()),
+                    'poly': [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
+                    'score': round(float(conf.item()), 2),
+                    'latex': '',
+                }
+                layout_res.append(new_item)
+                latex_filling_list.append(new_item)
+                bbox_img = get_croped_image(Image.fromarray(image), [xmin, ymin, xmax, ymax])
+                mf_image_list.append(bbox_img)
+
+            # 公式识别
+            mfr_start = time.time()
+            dataset = MathDataset(mf_image_list, transform=self.mfr_transform)
+            dataloader = DataLoader(dataset, batch_size=64, num_workers=0)
+            mfr_res = []
+            for mf_img in dataloader:
+                mf_img = mf_img.to(self.device)
+                output = self.mfr_model.generate({'image': mf_img})
+                mfr_res.extend(output['pred_str'])
+            for res, latex in zip(latex_filling_list, mfr_res):
+                res['latex'] = latex_rm_whitespace(latex)
+            mfr_cost = round(time.time() - mfr_start, 2)
+            logger.info(f"formula nums: {len(mf_image_list)}, mfr time: {mfr_cost}")
 
         # ocr识别
         if self.apply_ocr:
@@ -189,8 +196,8 @@ class CustomPEKModel:
                 paste_x = 50
                 paste_y = 50
                 # 创建一个宽高各多50的白色背景
-                new_width = xmax - xmin + paste_x*2
-                new_height = ymax - ymin + paste_y*2
+                new_width = xmax - xmin + paste_x * 2
+                new_height = ymax - ymin + paste_y * 2
                 new_image = Image.new('RGB', (new_width, new_height), 'white')
 
                 # 裁剪图像