hai 11 meses · b04867f90a
--- a/magic_pdf/data/read_api.py
+++ b/magic_pdf/data/read_api.py
@@ -87,14 +87,14 @@ def read_local_office(path: str) -> list[PymuDocDataset]:
 
				         FileNotFoundError: File not Found
			
 
				         Exception: Unknown Exception raised
			
 
				     """
			
 
				-    suffixes = ['ppt', 'pptx', 'doc', 'docx']
			
 
				+    suffixes = ['.ppt', '.pptx', '.doc', '.docx']
			
 
				     fns = []
			
 
				     ret = []
			
 
				     if os.path.isdir(path):
			
 
				         for root, _, files in os.walk(path):
			
 
				             for file in files:
			
 
				-                suffix = file.split('.')
			
 
				-                if suffix[-1] in suffixes:
			
 
				+                suffix = Path(file).suffix
			
 
				+                if suffix in suffixes:
			
 
				                     fns.append((os.path.join(root, file)))
			
 
				     else:
			
 
				         fns.append(path)
			
@@ -116,12 +116,12 @@ def read_local_office(path: str) -> list[PymuDocDataset]:
 
				     shutil.rmtree(temp_dir)
			
 
				     return ret
			
 
				 
			
 
				-def read_local_images(path: str, suffixes: list[str]=['png', 'jpg']) -> list[ImageDataset]:
			
 
				+def read_local_images(path: str, suffixes: list[str]=['.png', '.jpg']) -> list[ImageDataset]:
			
 
				     """Read images from path or directory.
			
 
				 
			
 
				     Args:
			
 
				         path (str): image file path or directory that contains image files
			
 
				-        suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['jpg', 'png']
			
 
				+        suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['.jpg', '.png']
			
 
				 
			
 
				     Returns:
			
 
				         list[ImageDataset]: each image file will converted to a ImageDataset
			
@@ -132,8 +132,8 @@ def read_local_images(path: str, suffixes: list[str]=['png', 'jpg']) -> list[Ima
 
				         reader = FileBasedDataReader()
			
 
				         for root, _, files in os.walk(path):
			
 
				             for file in files:
			
 
				-                suffix = file.split('.')
			
 
				-                if suffix[-1] in s_suffixes:
			
 
				+                suffix = Path(file).suffix
			
 
				+                if suffix in s_suffixes:
			
 
				                     imgs_bits.append(reader.read(os.path.join(root, file)))
			
 
				         return [ImageDataset(bits) for bits in imgs_bits]
			
 
				     else:
			
--- a/magic_pdf/tools/cli.py
+++ b/magic_pdf/tools/cli.py
@@ -97,7 +97,7 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
 
				             fn = os.path.join(temp_dir, f"{path.stem}.pdf")
			
 
				         elif path.suffix in image_suffixes:
			
 
				             with open(str(path), 'rb') as f:
			
 
				-                bits = f.read(_)
			
 
				+                bits = f.read()
			
 
				             pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
			
 
				             fn = os.path.join(temp_dir, f"{path.stem}.pdf")
			
 
				             with open(fn, 'wb') as f:
			
@@ -134,7 +134,7 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
 
				             if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
			
 
				                 parse_doc(doc_path)
			
 
				     else:
			
 
				-        parse_doc(path)
			
 
				+        parse_doc(Path(path))
			
 
				 
			
 
				     shutil.rmtree(temp_dir)
			
 
				 
			
--- a/next_docs/en/additional_notes/glossary.rst
+++ b/next_docs/en/additional_notes/glossary.rst
@@ -4,8 +4,11 @@ Glossary
 
				 ===========
			
 
				 
			
 
				 1. jsonl 
			
 
				-    TODO: add description
			
 
				+    Newline-delimited (\n), and each line must be a valid, independent JSON object. 
			
 
				+    Currently, all the functions shipped with **MinerU** assume that each JSON object contains a field named either **path** or **file_location**.
			
 
				+
			
 
				+
			
 
				+2. magic-pdf.json 
			
 
				+    TODO
			
 
				 
			
 
				-2. magic-pdf.json
			
 
				-    TODO: add description
			
 
				 
			
--- a/next_docs/en/user_guide/install/install.rst
+++ b/next_docs/en/user_guide/install/install.rst
@@ -134,6 +134,6 @@ Windows Platform
 
				 
			
 
				 .. tip::
			
 
				 
			
 
				-    The MinerU is installed, Check out :doc:`../quick_start/command_line` to convert your first pdf **or** reading the following sections for more details about install
			
 
				+    MinerU is now installed. Check out :doc:`../usage/command_line` to convert your first PDF, **or** read the following sections for more details about installation.
			
 
				 
			
 
				 
			
--- a/next_docs/en/user_guide/quick_start/convert_directory.rst
+++ b/next_docs/en/user_guide/quick_start/convert_directory.rst
@@ -1,8 +0,0 @@
 
				-
			
 
				-
			
 
				-Convert Files Under Directory 
			
 
				-=================================
			
 
				-
			
 
				-.. code:: python 
			
 
				-
			
 
				-    
			
--- a/next_docs/en/user_guide/quick_start/convert_doc.rst
+++ b/next_docs/en/user_guide/quick_start/convert_doc.rst
@@ -10,6 +10,19 @@ Convert Doc
 
				     
			
 
				     For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
			
 
				 
			
 
				+
			
 
				+
			
 
				+Command Line
			
 
				+^^^^^^^^^^^^^
			
 
				+
			
 
				+.. code:: python 
			
 
				+
			
 
				+    # make sure the file has the correct suffix
			
 
				+    magic-pdf -p a.doc -o output -m auto
			
 
				+
			
 
				+
			
 
				+API 
			
 
				+^^^^^^^^
			
 
				 .. code:: python 
			
 
				 
			
 
				     import os
			
--- a/next_docs/en/user_guide/quick_start/convert_docx.rst
+++ b/next_docs/en/user_guide/quick_start/convert_docx.rst
@@ -10,6 +10,18 @@ Convert DocX
 
				     For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
			
 
				 
			
 
				 
			
 
				+Command Line
			
 
				+^^^^^^^^^^^^^
			
 
				+
			
 
				+.. code:: python 
			
 
				+
			
 
				+    # make sure the file has the correct suffix
			
 
				+    magic-pdf -p a.docx -o output -m auto
			
 
				+
			
 
				+
			
 
				+API 
			
 
				+^^^^^
			
 
				+
			
 
				 .. code:: python 
			
 
				 
			
 
				     import os
			
--- a/next_docs/en/user_guide/quick_start/convert_image.rst
+++ b/next_docs/en/user_guide/quick_start/convert_image.rst
@@ -3,6 +3,19 @@
 
				 Convert Image
			
 
				 ===============
			
 
				 
			
 
				+
			
 
				+Command Line
			
 
				+^^^^^^^^^^^^^
			
 
				+
			
 
				+.. code:: python 
			
 
				+
			
 
				+    # make sure the file has the correct suffix
			
 
				+    magic-pdf -p a.png -o output -m auto
			
 
				+
			
 
				+
			
 
				+API 
			
 
				+^^^^^^
			
 
				+
			
 
				 .. code:: python
			
 
				 
			
 
				     import os
			
--- a/next_docs/en/user_guide/quick_start/convert_pdf.rst
+++ b/next_docs/en/user_guide/quick_start/convert_pdf.rst
@@ -3,6 +3,17 @@
 
				 Convert PDF 
			
 
				 ============
			
 
				 
			
 
				+Command Line
			
 
				+^^^^^^^^^^^^^
			
 
				+
			
 
				+.. code:: python 
			
 
				+
			
 
				+    # make sure the file has the correct suffix
			
 
				+    magic-pdf -p a.pdf -o output -m auto
			
 
				+
			
 
				+
			
 
				+API
			
 
				+^^^^^^
			
 
				 .. code:: python
			
 
				 
			
 
				     import os
			
--- a/next_docs/en/user_guide/quick_start/convert_ppt.rst
+++ b/next_docs/en/user_guide/quick_start/convert_ppt.rst
@@ -10,6 +10,17 @@ Convert PPT
 
				     
			
 
				     For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
			
 
				 
			
 
				+Command Line
			
 
				+^^^^^^^^^^^^^
			
 
				+
			
 
				+.. code:: python 
			
 
				+
			
 
				+    # make sure the file has the correct suffix
			
 
				+    magic-pdf -p a.ppt -o output -m auto
			
 
				+
			
 
				+
			
 
				+API 
			
 
				+^^^^^
			
 
				 
			
 
				 .. code:: python 
			
 
				 
			
--- a/next_docs/en/user_guide/quick_start/convert_pptx.rst
+++ b/next_docs/en/user_guide/quick_start/convert_pptx.rst
@@ -11,6 +11,19 @@ Convert PPTX
 
				     For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
			
 
				 
			
 
				 
			
 
				+Command Line
			
 
				+^^^^^^^^^^^^^
			
 
				+
			
 
				+.. code:: python 
			
 
				+
			
 
				+    # make sure the file has the correct suffix
			
 
				+    magic-pdf -p a.pptx -o output -m auto
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+API 
			
 
				+^^^^^^
			
 
				 
			
 
				 .. code:: python 
			
 
				 
			
--- a/next_docs/en/user_guide/tutorial.rst
+++ b/next_docs/en/user_guide/tutorial.rst
@@ -7,6 +7,5 @@ From the beginning to the end, Show how to using mineru via a minimal project
 
				 .. toctree::
			
 
				     :maxdepth: 1
			
 
				 
			
 
				-    tutorial/output_file_description
			
 
				     tutorial/pipeline
			
 
				 
			
--- a/next_docs/en/user_guide/usage/api.rst
+++ b/next_docs/en/user_guide/usage/api.rst
@@ -2,6 +2,10 @@
 
				 Api Usage 
			
 
				 ===========
			
 
				 
			
 
				+
			
 
				+PDF
			
 
				+----
			
 
				+
			
 
				 Local File Example
			
 
				 ^^^^^^^^^^^^^^^^^^
			
 
				 
			
@@ -111,4 +115,112 @@ S3 File Example
 
				     pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images")    # dump to remote s3
			
 
				 
			
 
				 
			
 
				+
			
 
				+MS-Office 
			
 
				+----------
			
 
				+
			
 
				+.. code:: python 
			
 
				+
			
 
				+    import os
			
 
				+
			
 
				+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
			
 
				+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
			
 
				+    from magic_pdf.data.read_api import read_local_office
			
 
				+
			
 
				+    # prepare env
			
 
				+    local_image_dir, local_md_dir = "output/images", "output"
			
 
				+    image_dir = str(os.path.basename(local_image_dir))
			
 
				+
			
 
				+    os.makedirs(local_image_dir, exist_ok=True)
			
 
				+
			
 
				+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
			
 
				+        local_md_dir
			
 
				+    )
			
 
				+
			
 
				+    # proc
			
 
				+    ## Create Dataset Instance
			
 
				+    input_file = "some_ppt.ppt"     # replace with real ms-office file
			
 
				+
			
 
				+    input_file_name = input_file.split(".")[0]
			
 
				+    ds = read_local_office(input_file)[0]
			
 
				+
			
 
				+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
			
 
				+        md_writer, f"{input_file_name}.md", image_dir
			
 
				+    )
			
 
				+
			
 
				+This code snippet can be used to process **ppt**, **pptx**, **doc**, and **docx** files.
			
 
				+
			
 
				+
			
 
				+Image
			
 
				+---------
			
 
				+
			
 
				+Single Image File 
			
 
				+^^^^^^^^^^^^^^^^^^^
			
 
				+
			
 
				+.. code:: python
			
 
				+
			
 
				+    import os
			
 
				+
			
 
				+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
			
 
				+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
			
 
				+    from magic_pdf.data.read_api import read_local_images
			
 
				+
			
 
				+    # prepare env
			
 
				+    local_image_dir, local_md_dir = "output/images", "output"
			
 
				+    image_dir = str(os.path.basename(local_image_dir))
			
 
				+
			
 
				+    os.makedirs(local_image_dir, exist_ok=True)
			
 
				+
			
 
				+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
			
 
				+        local_md_dir
			
 
				+    )
			
 
				+
			
 
				+    # proc
			
 
				+    ## Create Dataset Instance
			
 
				+    input_file = "some_image.jpg"       # replace with real image file
			
 
				+
			
 
				+    input_file_name = input_file.split(".")[0]
			
 
				+    ds = read_local_images(input_file)[0]
			
 
				+
			
 
				+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
			
 
				+        md_writer, f"{input_file_name}.md", image_dir
			
 
				+    )
			
 
				+
			
 
				+
			
 
				+Directory That Contains Images 
			
 
				+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
			
 
				+
			
 
				+.. code:: python
			
 
				+
			
 
				+    import os
			
 
				+
			
 
				+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
			
 
				+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
			
 
				+    from magic_pdf.data.read_api import read_local_images
			
 
				+
			
 
				+    # prepare env
			
 
				+    local_image_dir, local_md_dir = "output/images", "output"
			
 
				+    image_dir = str(os.path.basename(local_image_dir))
			
 
				+
			
 
				+    os.makedirs(local_image_dir, exist_ok=True)
			
 
				+
			
 
				+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
			
 
				+        local_md_dir
			
 
				+    )
			
 
				+
			
 
				+    # proc
			
 
				+    ## Create Dataset Instance
			
 
				+    input_directory = "some_image_dir/"       # replace with real directory that contains images
			
 
				+
			
 
				+
			
 
				+    dss = read_local_images(input_directory, suffixes=['.png', '.jpg'])[0]  
			
 
				+
			
 
				+    count = 0
			
 
				+    for ds in dss:
			
 
				+        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
			
 
				+            md_writer, f"{count}.md", image_dir
			
 
				+        )
			
 
				+        count += 1
			
 
				+
			
 
				+
			
 
				 Check :doc:`../data/data_reader_writer` for more [reader | writer] examples and check :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for api details
			
--- a/next_docs/en/user_guide/usage/command_line.rst
+++ b/next_docs/en/user_guide/usage/command_line.rst
@@ -10,7 +10,8 @@ Command Line
 
				 
			
 
				    Options:
			
 
				      -v, --version                display the version and exit
			
 
				-     -p, --path PATH              local pdf filepath or directory  [required]
			
 
				+     -p, --path PATH              local filepath or directory. support PDF, PPT,
			
 
				+                                  PPTX, DOC, DOCX, PNG, JPG files  [required]
			
 
				      -o, --output-dir PATH        output local directory  [required]
			
 
				      -m, --method [ocr|txt|auto]  the method for parsing pdf. ocr: using ocr
			
 
				                                   technique to extract information from pdf. txt:
			
@@ -40,6 +41,20 @@ Command Line
 
				    ## command line example
			
 
				    magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
			
 
				 
			
 
				+
			
 
				+.. admonition:: Important
			
 
				+    :class: tip
			
 
				+
			
 
				+    The file name must end with one of the following suffixes:
			
 
				+       .pdf 
			
 
				+       .png
			
 
				+       .jpg
			
 
				+       .ppt
			
 
				+       .pptx
			
 
				+       .doc
			
 
				+       .docx
			
 
				+
			
 
				+
			
 
				 ``{some_pdf}`` can be a single PDF file or a directory containing
			
 
				 multiple PDFs. The results will be saved in the ``{some_output_dir}``
			
 
				 directory. The output file list is as follows:
			
@@ -59,4 +74,4 @@ directory. The output file list is as follows:
 
				    :class: tip
			
 
				    
			
 
				 
			
 
				-   For more information about the output files, please refer to the :doc:`TODO: modify link <../tutorial/output_file_description>`
			
 
				+   For more information about the output files, please refer to the :doc:`../inference_result` or :doc:`../pipe_result`