Explorar o código

feat(api): simplify markdown and content list generation

- Remove DropMode and MakeMode imports from user code
- Set default drop_mode to DropMode.NONE in get_markdown and dump_md methods (get_content_list and dump_content_list already defaulted to DropMode.NONE)
- Remove md_make_mode parameter from get_content_list and dump_content_list methods (they now always use MakeMode.STANDARD_FORMAT)
- Add dump_middle_json method to PipeResult
- Update examples in API documentation and demo script
myhloli hai 10 meses
pai
achega
52efe94da8
Modificáronse 6 ficheiros con 28 adicións e 28 borrados
  1. 9 7
      demo/demo.py
  2. 0 0
      demo/demo1.json
  3. 0 0
      demo/demo2.json
  4. 0 0
      demo/small_ocr.json
  5. 6 10
      magic_pdf/operators/pipes.py
  6. 13 11
      next_docs/en/user_guide/usage/api.rst

+ 9 - 7
demo/demo.py

@@ -5,7 +5,6 @@ from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedData
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.config.enums import SupportedPdfParseMethod
-from magic_pdf.config.make_content_config import DropMode, MakeMode
 
 # args
 pdf_file_name = "demo1.pdf"  # replace with the real pdf path
@@ -54,17 +53,20 @@ pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.
 ### draw spans result on each page
 pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
 
+### get markdown content
+md_content = pipe_result.get_markdown(image_dir)
+
 ### dump markdown
 pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
 
+### get content list content
+content_list_content = pipe_result.get_content_list(image_dir)
+
 ### dump content list
 pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
 
-### get markdown content
-md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD)
-
-### get content list content
-content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT)
-
 ### get middle json
 middle_json_content = pipe_result.get_middle_json()
+
+### dump middle json
+pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')

A diferenza do arquivo foi suprimida porque é demasiado grande
+ 0 - 0
demo/demo1.json


A diferenza do arquivo foi suprimida porque é demasiado grande
+ 0 - 0
demo/demo2.json


A diferenza do arquivo foi suprimida porque é demasiado grande
+ 0 - 0
demo/small_ocr.json


+ 6 - 10
magic_pdf/operators/pipes.py

@@ -26,14 +26,14 @@ class PipeResult:
     def get_markdown(
         self,
         img_dir_or_bucket_prefix: str,
-        drop_mode=DropMode.WHOLE_PDF,
+        drop_mode=DropMode.NONE,
         md_make_mode=MakeMode.MM_MD,
     ) -> str:
         """Get markdown content.
 
         Args:
             img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
-            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
+            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
             md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
 
         Returns:
@@ -50,7 +50,7 @@ class PipeResult:
         writer: DataWriter,
         file_path: str,
         img_dir_or_bucket_prefix: str,
-        drop_mode=DropMode.WHOLE_PDF,
+        drop_mode=DropMode.NONE,
         md_make_mode=MakeMode.MM_MD,
     ):
         """Dump The Markdown.
@@ -59,7 +59,7 @@ class PipeResult:
             writer (DataWriter): File writer handle
             file_path (str): The file location of markdown
             img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
-            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
+            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
             md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
         """
 
@@ -72,14 +72,12 @@ class PipeResult:
         self,
         image_dir_or_bucket_prefix: str,
         drop_mode=DropMode.NONE,
-        md_make_mode=MakeMode.STANDARD_FORMAT,
     ) -> str:
         """Get Content List.
 
         Args:
             image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
             drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
-            md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.STANDARD_FORMAT.
 
         Returns:
             str: content list content
@@ -87,7 +85,7 @@ class PipeResult:
         pdf_info_list = self._pipe_res['pdf_info']
         content_list = union_make(
             pdf_info_list,
-            md_make_mode,
+            MakeMode.STANDARD_FORMAT,
             drop_mode,
             image_dir_or_bucket_prefix,
         )
@@ -99,7 +97,6 @@ class PipeResult:
         file_path: str,
         image_dir_or_bucket_prefix: str,
         drop_mode=DropMode.NONE,
-        md_make_mode=MakeMode.STANDARD_FORMAT,
     ):
         """Dump Content List.
 
@@ -108,10 +105,9 @@ class PipeResult:
             file_path (str): The file location of content list
             image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
             drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
-            md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.STANDARD_FORMAT.
         """
         content_list = self.get_content_list(
-            image_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode
+            image_dir_or_bucket_prefix, drop_mode=drop_mode,
         )
         writer.write_string(
             file_path, json.dumps(content_list, ensure_ascii=False, indent=4)

+ 13 - 11
next_docs/en/user_guide/usage/api.rst

@@ -17,7 +17,6 @@ Local File Example
     from magic_pdf.data.dataset import PymuDocDataset
     from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
     from magic_pdf.config.enums import SupportedPdfParseMethod
-    from magic_pdf.config.make_content_config import DropMode, MakeMode
 
     # args
     pdf_file_name = "abc.pdf"  # replace with the real pdf path
@@ -66,21 +65,24 @@ Local File Example
     ### draw spans result on each page
     pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
 
+    ### get markdown content
+    md_content = pipe_result.get_markdown(image_dir)
+
     ### dump markdown
     pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
 
+    ### get content list content
+    content_list_content = pipe_result.get_content_list(image_dir)
+
     ### dump content list
     pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
 
-    ### get markdown content
-    md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD)
-
-    ### get content list content
-    content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT) 
-
     ### get middle json
     middle_json_content = pipe_result.get_middle_json()
 
+    ### dump middle json
+    pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
+
 
 
 S3 File Example
@@ -93,7 +95,6 @@ S3 File Example
     from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
     from magic_pdf.data.dataset import PymuDocDataset
     from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-    from magic_pdf.config.make_content_config import DropMode, MakeMode
     from magic_pdf.config.enums import SupportedPdfParseMethod
 
     bucket_name = "{Your S3 Bucket Name}"  # replace with real bucket name
@@ -157,15 +158,16 @@ S3 File Example
     pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
 
     ### get markdown content
-    md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD)
+    md_content = pipe_result.get_markdown(image_dir)
 
     ### get content list content
-    content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT) 
+    content_list_content = pipe_result.get_content_list(image_dir)
 
     ### get middle json
     middle_json_content = pipe_result.get_middle_json()
 
-
+    ### dump middle json
+    pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
 
 MS-Office
 ----------

Algúns arquivos non se mostraron porque demasiados arquivos cambiaron neste cambio