hai 10 meses · 52efe94da8
--- a/demo/demo.py
+++ b/demo/demo.py
@@ -5,7 +5,6 @@ from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedData
 
				 from magic_pdf.data.dataset import PymuDocDataset
			
 
				 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
			
 
				 from magic_pdf.config.enums import SupportedPdfParseMethod
			
 
				-from magic_pdf.config.make_content_config import DropMode, MakeMode
			
 
				 
			
 
				 # args
			
 
				 pdf_file_name = "demo1.pdf"  # replace with the real pdf path
			
@@ -54,17 +53,20 @@ pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.
 
				 ### draw spans result on each page
			
 
				 pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
			
 
				 
			
 
				+### get markdown content
			
 
				+md_content = pipe_result.get_markdown(image_dir)
			
 
				+
			
 
				 ### dump markdown
			
 
				 pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
			
 
				 
			
 
				+### get content list content
			
 
				+content_list_content = pipe_result.get_content_list(image_dir)
			
 
				+
			
 
				 ### dump content list
			
 
				 pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
			
 
				 
			
 
				-### get markdown content
			
 
				-md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD)
			
 
				-
			
 
				-### get content list content
			
 
				-content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT)
			
 
				-
			
 
				 ### get middle json
			
 
				 middle_json_content = pipe_result.get_middle_json()
			
 
				+
			
 
				+### dump middle json
			
 
				+pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
			
--- a/demo/demo1.json
+++ b/demo/demo1.json
--- a/demo/demo2.json
+++ b/demo/demo2.json
--- a/demo/small_ocr.json
+++ b/demo/small_ocr.json
--- a/magic_pdf/operators/pipes.py
+++ b/magic_pdf/operators/pipes.py
@@ -26,14 +26,14 @@ class PipeResult:
 
				     def get_markdown(
			
 
				         self,
			
 
				         img_dir_or_bucket_prefix: str,
			
 
				-        drop_mode=DropMode.WHOLE_PDF,
			
 
				+        drop_mode=DropMode.NONE,
			
 
				         md_make_mode=MakeMode.MM_MD,
			
 
				     ) -> str:
			
 
				         """Get markdown content.
			
 
				 
			
 
				         Args:
			
 
				             img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
			
 
				-            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
			
 
				+            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
			
 
				             md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
			
 
				 
			
 
				         Returns:
			
@@ -50,7 +50,7 @@ class PipeResult:
 
				         writer: DataWriter,
			
 
				         file_path: str,
			
 
				         img_dir_or_bucket_prefix: str,
			
 
				-        drop_mode=DropMode.WHOLE_PDF,
			
 
				+        drop_mode=DropMode.NONE,
			
 
				         md_make_mode=MakeMode.MM_MD,
			
 
				     ):
			
 
				         """Dump The Markdown.
			
@@ -59,7 +59,7 @@ class PipeResult:
 
				             writer (DataWriter): File writer handle
			
 
				             file_path (str): The file location of markdown
			
 
				             img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
			
 
				-            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
			
 
				+            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
			
 
				             md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
			
 
				         """
			
 
				 
			
@@ -72,14 +72,12 @@ class PipeResult:
 
				         self,
			
 
				         image_dir_or_bucket_prefix: str,
			
 
				         drop_mode=DropMode.NONE,
			
 
				-        md_make_mode=MakeMode.STANDARD_FORMAT,
			
 
				     ) -> str:
			
 
				         """Get Content List.
			
 
				 
			
 
				         Args:
			
 
				             image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
			
 
				             drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
			
 
				-            md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.STANDARD_FORMAT.
			
 
				 
			
 
				         Returns:
			
 
				             str: content list content
			
@@ -87,7 +85,7 @@ class PipeResult:
 
				         pdf_info_list = self._pipe_res['pdf_info']
			
 
				         content_list = union_make(
			
 
				             pdf_info_list,
			
 
				-            md_make_mode,
			
 
				+            MakeMode.STANDARD_FORMAT,
			
 
				             drop_mode,
			
 
				             image_dir_or_bucket_prefix,
			
 
				         )
			
@@ -99,7 +97,6 @@ class PipeResult:
 
				         file_path: str,
			
 
				         image_dir_or_bucket_prefix: str,
			
 
				         drop_mode=DropMode.NONE,
			
 
				-        md_make_mode=MakeMode.STANDARD_FORMAT,
			
 
				     ):
			
 
				         """Dump Content List.
			
 
				 
			
@@ -108,10 +105,9 @@ class PipeResult:
 
				             file_path (str): The file location of content list
			
 
				             image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
			
 
				             drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
			
 
				-            md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.STANDARD_FORMAT.
			
 
				         """
			
 
				         content_list = self.get_content_list(
			
 
				-            image_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode
			
 
				+            image_dir_or_bucket_prefix, drop_mode=drop_mode,
			
 
				         )
			
 
				         writer.write_string(
			
 
				             file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
			
--- a/next_docs/en/user_guide/usage/api.rst
+++ b/next_docs/en/user_guide/usage/api.rst
@@ -17,7 +17,6 @@ Local File Example
 
				     from magic_pdf.data.dataset import PymuDocDataset
			
 
				     from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
			
 
				     from magic_pdf.config.enums import SupportedPdfParseMethod
			
 
				-    from magic_pdf.config.make_content_config import DropMode, MakeMode
			
 
				 
			
 
				     # args
			
 
				     pdf_file_name = "abc.pdf"  # replace with the real pdf path
			
@@ -66,21 +65,24 @@ Local File Example
 
				     ### draw spans result on each page
			
 
				     pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
			
 
				 
			
 
				+    ### get markdown content
			
 
				+    md_content = pipe_result.get_markdown(image_dir)
			
 
				+
			
 
				     ### dump markdown
			
 
				     pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
			
 
				 
			
 
				+    ### get content list content
			
 
				+    content_list_content = pipe_result.get_content_list(image_dir)
			
 
				+
			
 
				     ### dump content list
			
 
				     pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
			
 
				 
			
 
				-    ### get markdown content
			
 
				-    md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD)
			
 
				-
			
 
				-    ### get content list content
			
 
				-    content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT) 
			
 
				-
			
 
				     ### get middle json
			
 
				     middle_json_content = pipe_result.get_middle_json()
			
 
				 
			
 
				+    ### dump middle json
			
 
				+    pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
			
 
				+
			
 
				 
			
 
				 
			
 
				 S3 File Example
			
@@ -93,7 +95,6 @@ S3 File Example
 
				     from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
			
 
				     from magic_pdf.data.dataset import PymuDocDataset
			
 
				     from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
			
 
				-    from magic_pdf.config.make_content_config import DropMode, MakeMode
			
 
				     from magic_pdf.config.enums import SupportedPdfParseMethod
			
 
				 
			
 
				     bucket_name = "{Your S3 Bucket Name}"  # replace with real bucket name
			
@@ -157,15 +158,16 @@ S3 File Example
 
				     pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
			
 
				 
			
 
				     ### get markdown content
			
 
				-    md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD)
			
 
				+    md_content = pipe_result.get_markdown(image_dir)
			
 
				 
			
 
				     ### get content list content
			
 
				-    content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT) 
			
 
				+    content_list_content = pipe_result.get_content_list(image_dir)
			
 
				 
			
 
				     ### get middle json
			
 
				     middle_json_content = pipe_result.get_middle_json()
			
 
				 
			
 
				-
			
 
				+    ### dump middle json
			
 
				+    pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
			
 
				 
			
 
				 MS-Office
			
 
				 ----------