11 months ago · 7dc3b0a9a2
--- a/magic_pdf/data/read_api.py
+++ b/magic_pdf/data/read_api.py
@@ -104,7 +104,7 @@ def read_local_office(path: str) -> list[PymuDocDataset]:
 
				     shutil.rmtree(temp_dir)
			
 
				     return ret
			
 
				 
			
 
				-def read_local_images(path: str, suffixes: list[str]=[]) -> list[ImageDataset]:
			
 
				+def read_local_images(path: str, suffixes: list[str]=['png', 'jpg']) -> list[ImageDataset]:
			
 
				     """Read images from path or directory.
			
 
				 
			
 
				     Args:
			
--- a/next_docs/en/user_guide/install/install.rst
+++ b/next_docs/en/user_guide/install/install.rst
@@ -112,7 +112,7 @@ Download model weight files
 
				 Install LibreOffice[Optional]
			
 
				 ----------------------------------
			
 
				 
			
 
				-This section is required for handle **doc**, **docx**, **ppt**, **pptx** filetype, You can Skip this section if no need for those filetype processing.
			
 
				+This section is required for handling **doc**, **docx**, **ppt**, and **pptx** files. You can **skip** this section if you do not need to process those file types.
			
 
				 
			
 
				 
			
 
				 Linux/Macos Platform
			
--- a/next_docs/en/user_guide/quick_start.rst
+++ b/next_docs/en/user_guide/quick_start.rst
@@ -8,8 +8,8 @@ Want to learn about the usage methods under different scenarios ? This page give
 
				     :maxdepth: 1
			
 
				 
			
 
				     quick_start/convert_pdf 
			
 
				-    quick_start/convert_images
			
 
				+    quick_start/convert_image
			
 
				     quick_start/convert_ppt
			
 
				-    quick_start/convert_word 
			
 
				-    quick_start/convert_directory
			
 
				-
			
 
				+    quick_start/convert_pptx
			
 
				+    quick_start/convert_doc
			
 
				+    quick_start/convert_docx
			
--- a/next_docs/en/user_guide/quick_start/convert_doc.rst
+++ b/next_docs/en/user_guide/quick_start/convert_doc.rst
@@ -0,0 +1,43 @@
 
				+
			
 
				+
			
 
				+Convert Word 
			
 
				+=============
			
 
				+
			
 
				+.. admonition:: Warning
			
 
				+    :class: tip
			
 
				+
			
 
				+    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
			
 
				+    
			
 
				+    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
			
 
				+
			
 
				+.. code:: python 
			
 
				+
			
 
				+    import os
			
 
				+
			
 
				+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
			
 
				+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
			
 
				+    from magic_pdf.data.read_api import read_local_office
			
 
				+
			
 
				+    # prepare env
			
 
				+    local_image_dir, local_md_dir = "output/images", "output"
			
 
				+    image_dir = str(os.path.basename(local_image_dir))
			
 
				+
			
 
				+    os.makedirs(local_image_dir, exist_ok=True)
			
 
				+
			
 
				+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
			
 
				+        local_md_dir
			
 
				+    )
			
 
				+
			
 
				+    # proc
			
 
				+    ## Create Dataset Instance
			
 
				+    input_file = "some_doc.doc"     # replace with real ms-office file
			
 
				+    
			
 
				+    input_file_name = input_file.split(".")[0]
			
 
				+    ds = read_local_office(input_file)[0]
			
 
				+
			
 
				+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
			
 
				+        md_writer, f"{input_file_name}.md", image_dir
			
 
				+    )
			
 
				+
			
 
				+
			
 
				+
			
--- a/next_docs/en/user_guide/quick_start/convert_docx.rst
+++ b/next_docs/en/user_guide/quick_start/convert_docx.rst
@@ -0,0 +1,41 @@
 
				+
			
 
				+Convert DocX
			
 
				+=============
			
 
				+
			
 
				+.. admonition:: Warning
			
 
				+    :class: tip
			
 
				+
			
 
				+    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
			
 
				+    
			
 
				+    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
			
 
				+
			
 
				+
			
 
				+.. code:: python 
			
 
				+
			
 
				+    import os
			
 
				+
			
 
				+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
			
 
				+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
			
 
				+    from magic_pdf.data.read_api import read_local_office
			
 
				+
			
 
				+    # prepare env
			
 
				+    local_image_dir, local_md_dir = "output/images", "output"
			
 
				+    image_dir = str(os.path.basename(local_image_dir))
			
 
				+
			
 
				+    os.makedirs(local_image_dir, exist_ok=True)
			
 
				+
			
 
				+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
			
 
				+        local_md_dir
			
 
				+    )
			
 
				+
			
 
				+    # proc
			
 
				+    ## Create Dataset Instance
			
 
				+    input_file = "some_docx.docx"     # replace with real ms-office file
			
 
				+    
			
 
				+    input_file_name = input_file.split(".")[0]
			
 
				+    ds = read_local_office(input_file)[0]
			
 
				+
			
 
				+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
			
 
				+        md_writer, f"{input_file_name}.md", image_dir
			
 
				+    )
			
 
				+
			
--- a/next_docs/en/user_guide/quick_start/convert_image.rst
+++ b/next_docs/en/user_guide/quick_start/convert_image.rst
@@ -0,0 +1,33 @@
 
				+
			
 
				+
			
 
				+Convert Image
			
 
				+===============
			
 
				+
			
 
				+.. code:: python
			
 
				+
			
 
				+    import os
			
 
				+
			
 
				+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
			
 
				+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
			
 
				+    from magic_pdf.data.read_api import read_local_images
			
 
				+
			
 
				+    # prepare env
			
 
				+    local_image_dir, local_md_dir = "output/images", "output"
			
 
				+    image_dir = str(os.path.basename(local_image_dir))
			
 
				+
			
 
				+    os.makedirs(local_image_dir, exist_ok=True)
			
 
				+
			
 
				+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
			
 
				+        local_md_dir
			
 
				+    )
			
 
				+
			
 
				+    # proc
			
 
				+    ## Create Dataset Instance
			
 
				+    input_file = "some_image.jpg"       # replace with real image file
			
 
				+
			
 
				+    input_file_name = input_file.split(".")[0]
			
 
				+    ds = read_local_images(input_file)[0]
			
 
				+
			
 
				+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
			
 
				+        md_writer, f"{input_file_name}.md", image_dir
			
 
				+    )
			
--- a/next_docs/en/user_guide/quick_start/convert_images.rst
+++ b/next_docs/en/user_guide/quick_start/convert_images.rst
@@ -1,5 +0,0 @@
 
				-
			
 
				-
			
 
				-Convert Images 
			
 
				-================
			
 
				-
			
--- a/next_docs/en/user_guide/quick_start/convert_pdf.rst
+++ b/next_docs/en/user_guide/quick_start/convert_pdf.rst
@@ -3,3 +3,36 @@
 
				 Convert PDF 
			
 
				 ============
			
 
				 
			
 
				+.. code:: python
			
 
				+
			
 
				+    import os
			
 
				+
			
 
				+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
			
 
				+    from magic_pdf.data.dataset import PymuDocDataset
			
 
				+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
			
 
				+
			
 
				+    # args
			
 
				+    pdf_file_name = "abc.pdf"  # replace with the real pdf path
			
 
				+    name_without_suff = pdf_file_name.split(".")[0]
			
 
				+
			
 
				+    # prepare env
			
 
				+    local_image_dir, local_md_dir = "output/images", "output"
			
 
				+    image_dir = str(os.path.basename(local_image_dir))
			
 
				+
			
 
				+    os.makedirs(local_image_dir, exist_ok=True)
			
 
				+
			
 
				+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
			
 
				+        local_md_dir
			
 
				+    )
			
 
				+
			
 
				+    # read bytes
			
 
				+    reader1 = FileBasedDataReader("")
			
 
				+    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content
			
 
				+
			
 
				+    # proc
			
 
				+    ## Create Dataset Instance
			
 
				+    ds = PymuDocDataset(pdf_bytes)
			
 
				+
			
 
				+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
			
 
				+
			
 
				+
			
--- a/next_docs/en/user_guide/quick_start/convert_ppt.rst
+++ b/next_docs/en/user_guide/quick_start/convert_ppt.rst
@@ -3,3 +3,39 @@
 
				 Convert PPT 
			
 
				 ============
			
 
				 
			
 
				+.. admonition:: Warning
			
 
				+    :class: tip
			
 
				+
			
 
				+    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
			
 
				+    
			
 
				+    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
			
 
				+
			
 
				+
			
 
				+.. code:: python 
			
 
				+
			
 
				+    import os
			
 
				+
			
 
				+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
			
 
				+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
			
 
				+    from magic_pdf.data.read_api import read_local_office
			
 
				+
			
 
				+    # prepare env
			
 
				+    local_image_dir, local_md_dir = "output/images", "output"
			
 
				+    image_dir = str(os.path.basename(local_image_dir))
			
 
				+
			
 
				+    os.makedirs(local_image_dir, exist_ok=True)
			
 
				+
			
 
				+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
			
 
				+        local_md_dir
			
 
				+    )
			
 
				+
			
 
				+    # proc
			
 
				+    ## Create Dataset Instance
			
 
				+    input_file = "some_ppt.ppt"     # replace with real ms-office file
			
 
				+    
			
 
				+    input_file_name = input_file.split(".")[0]
			
 
				+    ds = read_local_office(input_file)[0]
			
 
				+
			
 
				+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
			
 
				+        md_writer, f"{input_file_name}.md", image_dir
			
 
				+    )
			
--- a/next_docs/en/user_guide/quick_start/convert_pptx.rst
+++ b/next_docs/en/user_guide/quick_start/convert_pptx.rst
@@ -0,0 +1,42 @@
 
				+
			
 
				+
			
 
				+Convert PPTX
			
 
				+=================
			
 
				+
			
 
				+.. admonition:: Warning
			
 
				+    :class: tip
			
 
				+
			
 
				+    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
			
 
				+    
			
 
				+    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
			
 
				+
			
 
				+
			
 
				+
			
 
				+.. code:: python 
			
 
				+
			
 
				+    import os
			
 
				+
			
 
				+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
			
 
				+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
			
 
				+    from magic_pdf.data.read_api import read_local_office
			
 
				+
			
 
				+    # prepare env
			
 
				+    local_image_dir, local_md_dir = "output/images", "output"
			
 
				+    image_dir = str(os.path.basename(local_image_dir))
			
 
				+
			
 
				+    os.makedirs(local_image_dir, exist_ok=True)
			
 
				+
			
 
				+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
			
 
				+        local_md_dir
			
 
				+    )
			
 
				+
			
 
				+    # proc
			
 
				+    ## Create Dataset Instance
			
 
				+    input_file = "some_pptx.pptx"     # replace with real ms-office file
			
 
				+    
			
 
				+    input_file_name = input_file.split(".")[0]
			
 
				+    ds = read_local_office(input_file)[0]
			
 
				+
			
 
				+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
			
 
				+        md_writer, f"{input_file_name}.md", image_dir
			
 
				+    )
			
--- a/next_docs/en/user_guide/quick_start/convert_word.rst
+++ b/next_docs/en/user_guide/quick_start/convert_word.rst
@@ -1,6 +0,0 @@
 
				-
			
 
				-
			
 
				-Convert Word 
			
 
				-=============
			
 
				-
			
 
				-
			
--- a/next_docs/en/user_guide/tutorial/pipeline.rst
+++ b/next_docs/en/user_guide/tutorial/pipeline.rst
@@ -28,7 +28,6 @@ Minimal Example
 
				     image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
			
 
				         local_md_dir
			
 
				     )
			
 
				-    image_dir = str(os.path.basename(local_image_dir))
			
 
				 
			
 
				     # read bytes
			
 
				     reader1 = FileBasedDataReader("")