Sfoglia il codice sorgente

feat(model): add model mode selection for PDF analysis

Introduce a new feature that allows users to choose between a "lite" and a "full"
model mode for PDF document analysis. The "lite" mode (the default) uses a faster,
less accurate model, while the "full" mode employs a higher-precision model at the
cost of speed. The mode can be selected with the `--model_mode` CLI option or,
through the API, by setting the module-level `__model_mode__` value, providing
flexibility for different use cases.
myhloli 1 anno fa
parent
commit
bc0f69321a

+ 9 - 3
magic_pdf/cli/magicpdf.py

@@ -178,8 +178,10 @@ def cli():
     default="auto",
 )
 @click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
-def json_command(json, method, inside_model):
+@click.option("--model_mode", type=click.STRING, default="lite", help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
+def json_command(json, method, inside_model, model_mode):
     model_config.__use_inside_model__ = inside_model
+    model_config.__model_mode__ = model_mode
 
     if not json.startswith("s3://"):
         logger.error("usage: magic-pdf json-command --json s3://some_bucket/some_path")
@@ -229,8 +231,10 @@ def json_command(json, method, inside_model):
     default="auto",
 )
 @click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
-def local_json_command(local_json, method, inside_model):
+@click.option("--model_mode", type=click.STRING, default="lite", help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
+def local_json_command(local_json, method, inside_model, model_mode):
     model_config.__use_inside_model__ = inside_model
+    model_config.__model_mode__ = model_mode
 
     def read_s3_path(s3path):
         bucket, key = parse_s3path(s3path)
@@ -281,8 +285,10 @@ def local_json_command(local_json, method, inside_model):
     default="auto",
 )
 @click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
-def pdf_command(pdf, model, method, inside_model):
+@click.option("--model_mode", type=click.STRING, default="lite", help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
+def pdf_command(pdf, model, method, inside_model, model_mode):
     model_config.__use_inside_model__ = inside_model
+    model_config.__model_mode__ = model_mode
 
     def read_fn(path):
         disk_rw = DiskReaderWriter(os.path.dirname(path))

+ 1 - 0
magic_pdf/model/__init__.py

@@ -1 +1,2 @@
 __use_inside_model__ = False
+__model_mode__ = "lite"

+ 8 - 1
magic_pdf/model/doc_analyze_by_custom_model.py

@@ -46,7 +46,14 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
     return images
 
 
-def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, model=MODEL.PEK):
+def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
+    model = None
+
+    if model_config.__model_mode__ == "lite":
+        model = MODEL.Paddle
+    elif model_config.__model_mode__ == "full":
+        model = MODEL.PEK
+
     if model_config.__use_inside_model__:
         model_init_start = time.time()
         if model == MODEL.Paddle:

+ 1 - 0
setup.py

@@ -31,6 +31,7 @@ if __name__ == '__main__':
         extras_require={
             "gpu": ["paddleocr", "paddlepaddle-gpu"],
             "cpu": ["paddleocr", "paddlepaddle"],
+            "full-cpu": ["unimernet", "matplotlib", "ultralytics", "paddleocr", "paddlepaddle"],
         },
         description="A practical tool for converting PDF to Markdown",  # 简短描述
         long_description=long_description,  # 详细描述