Explorar o código

docs: check links in doc

xu rui hai 11 meses
pai
achega
b04867f90a

+ 7 - 7
magic_pdf/data/read_api.py

@@ -87,14 +87,14 @@ def read_local_office(path: str) -> list[PymuDocDataset]:
         FileNotFoundError: File not Found
         Exception: Unknown Exception raised
     """
-    suffixes = ['ppt', 'pptx', 'doc', 'docx']
+    suffixes = ['.ppt', '.pptx', '.doc', '.docx']
     fns = []
     ret = []
     if os.path.isdir(path):
         for root, _, files in os.walk(path):
             for file in files:
-                suffix = file.split('.')
-                if suffix[-1] in suffixes:
+                suffix = Path(file).suffix
+                if suffix in suffixes:
                     fns.append((os.path.join(root, file)))
     else:
         fns.append(path)
@@ -116,12 +116,12 @@ def read_local_office(path: str) -> list[PymuDocDataset]:
     shutil.rmtree(temp_dir)
     return ret
 
-def read_local_images(path: str, suffixes: list[str]=['png', 'jpg']) -> list[ImageDataset]:
+def read_local_images(path: str, suffixes: list[str]=['.png', '.jpg']) -> list[ImageDataset]:
     """Read images from path or directory.
 
     Args:
         path (str): image file path or directory that contains image files
-        suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['jpg', 'png']
+        suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['.jpg', '.png']
 
     Returns:
         list[ImageDataset]: each image file will converted to a ImageDataset
@@ -132,8 +132,8 @@ def read_local_images(path: str, suffixes: list[str]=['png', 'jpg']) -> list[Ima
         reader = FileBasedDataReader()
         for root, _, files in os.walk(path):
             for file in files:
-                suffix = file.split('.')
-                if suffix[-1] in s_suffixes:
+                suffix = Path(file).suffix
+                if suffix in s_suffixes:
                     imgs_bits.append(reader.read(os.path.join(root, file)))
         return [ImageDataset(bits) for bits in imgs_bits]
     else:

+ 2 - 2
magic_pdf/tools/cli.py

@@ -97,7 +97,7 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
             fn = os.path.join(temp_dir, f"{path.stem}.pdf")
         elif path.suffix in image_suffixes:
             with open(str(path), 'rb') as f:
-                bits = f.read(_)
+                bits = f.read()
             pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
             fn = os.path.join(temp_dir, f"{path.stem}.pdf")
             with open(fn, 'wb') as f:
@@ -134,7 +134,7 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
             if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
                 parse_doc(doc_path)
     else:
-        parse_doc(path)
+        parse_doc(Path(path))
 
     shutil.rmtree(temp_dir)
 

+ 6 - 3
next_docs/en/additional_notes/glossary.rst

@@ -4,8 +4,11 @@ Glossary
 ===========
 
 1. jsonl 
-    TODO: add description
+    Newline-delimited (``\n``) text in which each line must be a valid, independent JSON object.
+    Currently, all functions shipped with **MinerU** assume that each JSON object contains a field named either **path** or **file_location**.
+
+
+2. magic-pdf.json 
+    TODO
 
-2. magic-pdf.json
-    TODO: add description
 

+ 1 - 1
next_docs/en/user_guide/install/install.rst

@@ -134,6 +134,6 @@ Windows Platform
 
 .. tip::
 
-    The MinerU is installed, Check out :doc:`../quick_start/command_line` to convert your first pdf **or** reading the following sections for more details about install
+    MinerU is now installed. Check out :doc:`../usage/command_line` to convert your first PDF, **or** read the following sections for more details about installation.
 
 

+ 0 - 8
next_docs/en/user_guide/quick_start/convert_directory.rst

@@ -1,8 +0,0 @@
-
-
-Convert Files Under Directory 
-=================================
-
-.. code:: python 
-
-    

+ 13 - 0
next_docs/en/user_guide/quick_start/convert_doc.rst

@@ -10,6 +10,19 @@ Convert Doc
     
     For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
 
+
+
+Command Line
+^^^^^^^^^^^^^
+
+.. code:: python 
+
+    # make sure the file has the correct suffix
+    magic-pdf -p a.doc -o output -m auto
+
+
+API 
+^^^^^^^^
 .. code:: python 
 
     import os

+ 12 - 0
next_docs/en/user_guide/quick_start/convert_docx.rst

@@ -10,6 +10,18 @@ Convert DocX
     For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
 
 
+Command Line
+^^^^^^^^^^^^^
+
+.. code:: python 
+
+    # make sure the file has the correct suffix
+    magic-pdf -p a.docx -o output -m auto
+
+
+API 
+^^^^^
+
 .. code:: python 
 
     import os

+ 13 - 0
next_docs/en/user_guide/quick_start/convert_image.rst

@@ -3,6 +3,19 @@
 Convert Image
 ===============
 
+
+Command Line
+^^^^^^^^^^^^^
+
+.. code:: python 
+
+    # make sure the file has the correct suffix
+    magic-pdf -p a.png -o output -m auto
+
+
+API 
+^^^^^^
+
 .. code:: python
 
     import os

+ 11 - 0
next_docs/en/user_guide/quick_start/convert_pdf.rst

@@ -3,6 +3,17 @@
 Convert PDF 
 ============
 
+Command Line
+^^^^^^^^^^^^^
+
+.. code:: python 
+
+    # make sure the file has the correct suffix
+    magic-pdf -p a.pdf -o output -m auto
+
+
+API
+^^^^^^
 .. code:: python
 
     import os

+ 11 - 0
next_docs/en/user_guide/quick_start/convert_ppt.rst

@@ -10,6 +10,17 @@ Convert PPT
     
     For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
 
+Command Line
+^^^^^^^^^^^^^
+
+.. code:: python 
+
+    # make sure the file has the correct suffix
+    magic-pdf -p a.ppt -o output -m auto
+
+
+API 
+^^^^^
 
 .. code:: python 
 

+ 13 - 0
next_docs/en/user_guide/quick_start/convert_pptx.rst

@@ -11,6 +11,19 @@ Convert PPTX
     For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
 
 
+Command Line
+^^^^^^^^^^^^^
+
+.. code:: python 
+
+    # make sure the file has the correct suffix
+    magic-pdf -p a.pptx -o output -m auto
+
+
+
+
+API 
+^^^^^^
 
 .. code:: python 
 

+ 0 - 1
next_docs/en/user_guide/tutorial.rst

@@ -7,6 +7,5 @@ From the beginning to the end, Show how to using mineru via a minimal project
 .. toctree::
     :maxdepth: 1
 
-    tutorial/output_file_description
     tutorial/pipeline
 

+ 112 - 0
next_docs/en/user_guide/usage/api.rst

@@ -2,6 +2,10 @@
 Api Usage 
 ===========
 
+
+PDF
+----
+
 Local File Example
 ^^^^^^^^^^^^^^^^^^
 
@@ -111,4 +115,112 @@ S3 File Example
     pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images")    # dump to remote s3
 
 
+
+MS-Office 
+----------
+
+.. code:: python 
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_office
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_ppt.ppt"     # replace with real ms-office file
+
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_office(input_file)[0]
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
+
+This code snippet can be used to manipulate **ppt**, **pptx**, **doc**, and **docx** files.
+
+
+Image
+---------
+
+Single Image File 
+^^^^^^^^^^^^^^^^^^^
+
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_images
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_image.jpg"       # replace with real image file
+
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_images(input_file)[0]
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
+
+
+Directory That Contains Images 
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_images
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_directory = "some_image_dir/"       # replace with real directory that contains images
+
+
+    dss = read_local_images(input_directory, suffixes=['.png', '.jpg'])
+
+    count = 0
+    for ds in dss:
+        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+            md_writer, f"{count}.md", image_dir
+        )
+        count += 1
+
+
 Check :doc:`../data/data_reader_writer` for more [reader | writer] examples and check :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for api details

+ 17 - 2
next_docs/en/user_guide/usage/command_line.rst

@@ -10,7 +10,8 @@ Command Line
 
    Options:
      -v, --version                display the version and exit
-     -p, --path PATH              local pdf filepath or directory  [required]
+     -p, --path PATH              local filepath or directory. support PDF, PPT,
+                                  PPTX, DOC, DOCX, PNG, JPG files  [required]
      -o, --output-dir PATH        output local directory  [required]
      -m, --method [ocr|txt|auto]  the method for parsing pdf. ocr: using ocr
                                   technique to extract information from pdf. txt:
@@ -40,6 +41,20 @@ Command Line
    ## command line example
    magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
 
+
+.. admonition:: Important
+    :class: tip
+
+    The file name must end with one of the following suffixes:
+       .pdf 
+       .png
+       .jpg
+       .ppt
+       .pptx
+       .doc
+       .docx
+
+
 ``{some_pdf}`` can be a single PDF file or a directory containing
 multiple PDFs. The results will be saved in the ``{some_output_dir}``
 directory. The output file list is as follows:
@@ -59,4 +74,4 @@ directory. The output file list is as follows:
    :class: tip
    
 
-   For more information about the output files, please refer to the :doc:`TODO: modify link <../tutorial/output_file_description>`
+   For more information about the output files, please refer to the :doc:`../inference_result` or :doc:`../pipe_result`