Jelajahi Sumber

refactor: reorganize imports to align with backend structure and improve clarity

myhloli 5 bulan lalu
induk
komit
9bb257769c

+ 0 - 1
mineru/api/__init__.py

@@ -1 +0,0 @@
-# Copyright (c) Opendatalab. All rights reserved.

+ 1 - 1
mineru/backend/pipeline/model_json_to_middle_json.py

@@ -12,7 +12,7 @@ from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
 from mineru.utils.cut_image import cut_image_and_table
 from mineru.utils.llm_aided import llm_aided_title
 from mineru.utils.model_utils import clean_memory
-from mineru.utils.pipeline_magic_model import MagicModel
+from mineru.backend.pipeline.pipeline_magic_model import MagicModel
 from mineru.utils.span_block_fix import fill_spans_in_blocks, fix_discarded_block, fix_block_spans
 from mineru.utils.span_pre_proc import remove_outside_spans, remove_overlaps_low_confidence_spans, \
     remove_overlaps_min_spans, txt_spans_extract

+ 0 - 0
mineru/utils/pipeline_magic_model.py → mineru/backend/pipeline/pipeline_magic_model.py


+ 0 - 0
mineru/api/pipeline_middle_json_mkcontent.py → mineru/backend/pipeline/pipeline_middle_json_mkcontent.py


+ 1 - 2
mineru/backend/vlm/token_to_middle_json.py

@@ -1,10 +1,9 @@
 import re
 
-from mineru.utils.block_pre_proc import fix_text_overlap_title_blocks
 from mineru.utils.cut_image import cut_image_and_table
 from mineru.utils.enum_class import BlockType, ContentType
 from mineru.utils.hash_utils import str_md5
-from mineru.utils.vlm_magic_model import fix_two_layer_blocks, fix_title_blocks
+from mineru.backend.vlm.vlm_magic_model import fix_two_layer_blocks, fix_title_blocks
 from mineru.version import __version__
 
 

+ 3 - 3
mineru/utils/vlm_magic_model.py → mineru/backend/vlm/vlm_magic_model.py

@@ -1,9 +1,9 @@
 import re
 from typing import Literal
 
-from .boxbase import bbox_distance, is_in
-from .enum_class import BlockType
-from ..api.vlm_middle_json_mkcontent import merge_para_with_text
+from mineru.utils.boxbase import bbox_distance, is_in
+from mineru.utils.enum_class import BlockType
+from mineru.backend.vlm.vlm_middle_json_mkcontent import merge_para_with_text
 
 
 def __reduct_overlap(bboxes):

+ 1 - 1
mineru/api/vlm_middle_json_mkcontent.py → mineru/backend/vlm/vlm_middle_json_mkcontent.py

@@ -1,5 +1,5 @@
 import re
-from ..utils.enum_class import MakeMode, BlockType, ContentType
+from mineru.utils.enum_class import MakeMode, BlockType, ContentType
 
 
 def merge_para_with_text(para_block):

+ 4 - 4
mineru/cli/common.py

@@ -8,9 +8,9 @@ from pathlib import Path
 import pypdfium2 as pdfium
 from loguru import logger
 
-from mineru.api.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
+from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
 from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
-from mineru.api.vlm_middle_json_mkcontent import union_make as vlm_union_make
+from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
 from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
 from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
 from mineru.data.data_reader_writer import FileBasedDataWriter
@@ -215,8 +215,8 @@ def do_parse(
 
 
 if __name__ == "__main__":
-    pdf_path = "../../demo/pdfs/hello-algo-1.1.0-zh-c-word转换的span有问题.pdf"
-    # pdf_path = "C:/Users/zhaoxiaomeng/Downloads/input_img_0.jpg"
+    # pdf_path = "../../demo/pdfs/hello-algo-1.1.0-zh-c-word转换的span有问题.pdf"
+    pdf_path = "C:/Users/zhaoxiaomeng/Downloads/数学新星问题征解第一期(2014.03).pdf"
 
     try:
        do_parse("./output", [Path(pdf_path).stem], [read_fn(Path(pdf_path))],["ch"], end_page_id=20,)

+ 1 - 1
mineru/utils/llm_aided.py

@@ -3,7 +3,7 @@ from loguru import logger
 from openai import OpenAI
 import ast
 
-from mineru.api.pipeline_middle_json_mkcontent import merge_para_with_text
+from mineru.backend.pipeline.pipeline_middle_json_mkcontent import merge_para_with_text
 
 
 def llm_aided_title(page_info_list, title_aided_config):