il y a 1 an · ca737d5c15
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -121,11 +121,14 @@ pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/
 
				 下载后请将models目录移动到空间较大的ssd磁盘目录  
			
 
				 
			
 
				 #### 3. 拷贝配置文件并进行配置
			
 
				-在仓库根目录可以获得 [magic-pdf.template.json](magic-pdf.template.json) 文件
			
 
				+在仓库根目录可以获得 [magic-pdf.template.json](magic-pdf.template.json) 配置模版文件
			
 
				+> ❗️务必执行以下命令将配置文件拷贝到用户目录下，否则程序将无法运行
			
 
				 ```bash
			
 
				 cp magic-pdf.template.json ~/magic-pdf.json
			
 
				 ```
			
 
				-在magic-pdf.json中配置"models-dir"为模型权重文件所在目录
			
 
				+
			
 
				+在用户目录中找到magic-pdf.json文件并配置"models-dir"为[2. 下载模型权重文件](#2-下载模型权重文件)中下载的模型权重文件所在目录
			
 
				+> ❗️务必正确配置模型权重文件所在目录，否则会因为找不到模型文件而导致程序无法运行
			
 
				 ```json
			
 
				 {
			
 
				   "models-dir": "/tmp/models"
			
--- a/docs/FAQ_zh_cn.md
+++ b/docs/FAQ_zh_cn.md
@@ -22,6 +22,7 @@ pip install magic-pdf[full-cpu]
 
				 pip install magic-pdf
			
 
				 pip install unimernet==0.1.0
			
 
				 pip install matplotlib ultralytics paddleocr==2.7.3 paddlepaddle
			
 
				+pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/ 
			
 
				 ```
			
 
				 
			
 
				 ### 4.在部分较新的M芯片macOS设备上，MPS加速开启失败
			
@@ -82,4 +83,19 @@ pip install paddlepaddle-gpu
 
				 model_json 指的是通过模型分析后生成的一种有特定格式的json文件。  
			
 
				 如果使用 https://github.com/opendatalab/PDF-Extract-Kit 项目生成，该文件一般在项目的output目录下。  
			
 
				 如果使用 MinerU 的命令行调用内置的模型分析，该文件一般在输出路径"/tmp/magic-pdf/pdf-name"下。  
			
 
				-参考：https://github.com/opendatalab/MinerU/issues/128
			
 
				+参考：https://github.com/opendatalab/MinerU/issues/128
			
 
				+
			
 
				+### 10.Linux下报错：Required dependency not installed, please install by "pip install magic-pdf[full-cpu] detectron2 --extra-index-url https://myhloli.github.io/wheels/"
			
 
				+
			
 
				+这种情况可以先使用 `pip list` 检查一下自己的依赖库列表，重点确认以下几个库是否已经安装（版本不必完全一致，已安装即可）：
			
 
				+```bash
			
 
				+opencv-contrib-python     4.6.0.66
			
 
				+opencv-python             4.6.0.66
			
 
				+opencv-python-headless    4.10.0.84
			
 
				+paddleocr                 2.7.3
			
 
				+paddlepaddle              2.6.1
			
 
				+torch                     2.2.2
			
 
				+torchtext                 0.17.2
			
 
				+torchvision               0.17.2
			
 
				+```
			
 
				+如果都有的话，可能是libgl库没有安装，参考 https://github.com/opendatalab/MinerU/issues/165#issuecomment-2245202282 安装libgl库后再试试能不能正常使用。
			
--- a/magic_pdf/cli/magicpdf.py
+++ b/magic_pdf/cli/magicpdf.py
@@ -89,7 +89,6 @@ def do_parse(
 
				     orig_model_list = copy.deepcopy(model_list)
			
 
				 
			
 
				     local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
			
 
				-    logger.info(f"local output dir is {local_md_dir}")
			
 
				     image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir)
			
 
				     image_dir = str(os.path.basename(local_image_dir))
			
 
				 
			
@@ -163,6 +162,7 @@ def do_parse(
 
				             path=f"{pdf_file_name}_content_list.json",
			
 
				             mode=AbsReaderWriter.MODE_TXT,
			
 
				         )
			
 
				+    logger.info(f"local output dir is '{local_md_dir}', you can found the result in it.")
			
 
				 
			
 
				 
			
 
				 @click.group()
			
--- a/magic_pdf/libs/config_reader.py
+++ b/magic_pdf/libs/config_reader.py
@@ -10,14 +10,17 @@ from loguru import logger
 
				 
			
 
				 from magic_pdf.libs.commons import parse_bucket_key
			
 
				 
			
 
				+# 定义配置文件名常量
			
 
				+CONFIG_FILE_NAME = "magic-pdf.json"
			
 
				+
			
 
				 
			
 
				 def read_config():
			
 
				     home_dir = os.path.expanduser("~")
			
 
				 
			
 
				-    config_file = os.path.join(home_dir, "magic-pdf.json")
			
 
				+    config_file = os.path.join(home_dir, CONFIG_FILE_NAME)
			
 
				 
			
 
				     if not os.path.exists(config_file):
			
 
				-        raise Exception(f"{config_file} not found")
			
 
				+        raise FileNotFoundError(f"{config_file} not found")
			
 
				 
			
 
				     with open(config_file, "r") as f:
			
 
				         config = json.load(f)
			
@@ -37,7 +40,7 @@ def get_s3_config(bucket_name: str):
 
				         access_key, secret_key, storage_endpoint = bucket_info[bucket_name]
			
 
				 
			
 
				     if access_key is None or secret_key is None or storage_endpoint is None:
			
 
				-        raise Exception("ak, sk or endpoint not found in magic-pdf.json")
			
 
				+        raise Exception(f"ak, sk or endpoint not found in {CONFIG_FILE_NAME}")
			
 
				 
			
 
				     # logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}")
			
 
				 
			
@@ -56,17 +59,32 @@ def get_bucket_name(path):
 
				 
			
 
				 def get_local_dir():
			
 
				     config = read_config()
			
 
				-    return config.get("temp-output-dir", "/tmp")
			
 
				+    local_dir = config.get("temp-output-dir")
			
 
				+    if local_dir is None:
			
 
				+        logger.warning(f"'temp-output-dir' not found in {CONFIG_FILE_NAME}, use '/tmp' as default")
			
 
				+        return "/tmp"
			
 
				+    else:
			
 
				+        return local_dir
			
 
				 
			
 
				 
			
 
				 def get_local_models_dir():
			
 
				     config = read_config()
			
 
				-    return config.get("models-dir", "/tmp/models")
			
 
				+    models_dir = config.get("models-dir")
			
 
				+    if models_dir is None:
			
 
				+        logger.warning(f"'models-dir' not found in {CONFIG_FILE_NAME}, use '/tmp/models' as default")
			
 
				+        return "/tmp/models"
			
 
				+    else:
			
 
				+        return models_dir
			
 
				 
			
 
				 
			
 
				 def get_device():
			
 
				     config = read_config()
			
 
				-    return config.get("device-mode", "cpu")
			
 
				+    device = config.get("device-mode")
			
 
				+    if device is None:
			
 
				+        logger.warning(f"'device-mode' not found in {CONFIG_FILE_NAME}, use 'cpu' as default")
			
 
				+        return "cpu"
			
 
				+    else:
			
 
				+        return device
			
 
				 
			
 
				 
			
 
				 if __name__ == "__main__":
			
--- a/magic_pdf/model/pdf_extract_kit.py
+++ b/magic_pdf/model/pdf_extract_kit.py
@@ -1,6 +1,8 @@
 
				 from loguru import logger
			
 
				 import os
			
 
				 import time
			
 
				+
			
 
				+os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
			
 
				 try:
			
 
				     import cv2
			
 
				     import yaml
			
@@ -17,14 +19,17 @@ try:
 
				     import unimernet.tasks as tasks
			
 
				     from unimernet.processors import load_processor
			
 
				 
			
 
				-    from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
			
 
				-    from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
			
 
				-    from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
			
 
				 except ImportError as e:
			
 
				     logger.exception(e)
			
 
				-    logger.error('Required dependency not installed, please install by \n"pip install magic-pdf[full] detectron2 --extra-index-url https://myhloli.github.io/wheels/"')
			
 
				+    logger.error(
			
 
				+        'Required dependency not installed, please install by \n'
			
 
				+        '"pip install magic-pdf[full] detectron2 --extra-index-url https://myhloli.github.io/wheels/"')
			
 
				     exit(1)
			
 
				 
			
 
				+from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
			
 
				+from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
			
 
				+from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
			
 
				+
			
 
				 
			
 
				 def mfd_model_init(weight):
			
 
				     mfd_model = YOLO(weight)
			
@@ -100,6 +105,7 @@ class CustomPEKModel:
 
				         self.device = kwargs.get("device", self.configs["config"]["device"])
			
 
				         logger.info("using device: {}".format(self.device))
			
 
				         models_dir = kwargs.get("models_dir", os.path.join(root_dir, "resources", "models"))
			
 
				+        logger.info("using models_dir: {}".format(models_dir))
			
 
				 
			
 
				         # 初始化公式识别
			
 
				         if self.apply_formula:
			
@@ -135,34 +141,35 @@ class CustomPEKModel:
 
				         layout_cost = round(time.time() - layout_start, 2)
			
 
				         logger.info(f"layout detection cost: {layout_cost}")
			
 
				 
			
 
				-        # 公式检测
			
 
				-        mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True)[0]
			
 
				-        for xyxy, conf, cla in zip(mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()):
			
 
				-            xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
			
 
				-            new_item = {
			
 
				-                'category_id': 13 + int(cla.item()),
			
 
				-                'poly': [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
			
 
				-                'score': round(float(conf.item()), 2),
			
 
				-                'latex': '',
			
 
				-            }
			
 
				-            layout_res.append(new_item)
			
 
				-            latex_filling_list.append(new_item)
			
 
				-            bbox_img = get_croped_image(Image.fromarray(image), [xmin, ymin, xmax, ymax])
			
 
				-            mf_image_list.append(bbox_img)
			
 
				-
			
 
				-        # 公式识别
			
 
				-        mfr_start = time.time()
			
 
				-        dataset = MathDataset(mf_image_list, transform=self.mfr_transform)
			
 
				-        dataloader = DataLoader(dataset, batch_size=64, num_workers=0)
			
 
				-        mfr_res = []
			
 
				-        for mf_img in dataloader:
			
 
				-            mf_img = mf_img.to(self.device)
			
 
				-            output = self.mfr_model.generate({'image': mf_img})
			
 
				-            mfr_res.extend(output['pred_str'])
			
 
				-        for res, latex in zip(latex_filling_list, mfr_res):
			
 
				-            res['latex'] = latex_rm_whitespace(latex)
			
 
				-        mfr_cost = round(time.time() - mfr_start, 2)
			
 
				-        logger.info(f"formula nums: {len(mf_image_list)}, mfr time: {mfr_cost}")
			
 
				+        if self.apply_formula:
			
 
				+            # 公式检测
			
 
				+            mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True)[0]
			
 
				+            for xyxy, conf, cla in zip(mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()):
			
 
				+                xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
			
 
				+                new_item = {
			
 
				+                    'category_id': 13 + int(cla.item()),
			
 
				+                    'poly': [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
			
 
				+                    'score': round(float(conf.item()), 2),
			
 
				+                    'latex': '',
			
 
				+                }
			
 
				+                layout_res.append(new_item)
			
 
				+                latex_filling_list.append(new_item)
			
 
				+                bbox_img = get_croped_image(Image.fromarray(image), [xmin, ymin, xmax, ymax])
			
 
				+                mf_image_list.append(bbox_img)
			
 
				+
			
 
				+            # 公式识别
			
 
				+            mfr_start = time.time()
			
 
				+            dataset = MathDataset(mf_image_list, transform=self.mfr_transform)
			
 
				+            dataloader = DataLoader(dataset, batch_size=64, num_workers=0)
			
 
				+            mfr_res = []
			
 
				+            for mf_img in dataloader:
			
 
				+                mf_img = mf_img.to(self.device)
			
 
				+                output = self.mfr_model.generate({'image': mf_img})
			
 
				+                mfr_res.extend(output['pred_str'])
			
 
				+            for res, latex in zip(latex_filling_list, mfr_res):
			
 
				+                res['latex'] = latex_rm_whitespace(latex)
			
 
				+            mfr_cost = round(time.time() - mfr_start, 2)
			
 
				+            logger.info(f"formula nums: {len(mf_image_list)}, mfr time: {mfr_cost}")
			
 
				 
			
 
				         # ocr识别
			
 
				         if self.apply_ocr:
			
@@ -189,8 +196,8 @@ class CustomPEKModel:
 
				                 paste_x = 50
			
 
				                 paste_y = 50
			
 
				                 # 创建一个宽高各多50的白色背景
			
 
				-                new_width = xmax - xmin + paste_x*2
			
 
				-                new_height = ymax - ymin + paste_y*2
			
 
				+                new_width = xmax - xmin + paste_x * 2
			
 
				+                new_height = ymax - ymin + paste_y * 2
			
 
				                 new_image = Image.new('RGB', (new_width, new_height), 'white')
			
 
				 
			
 
				                 # 裁剪图像