Ver Fonte

docs/update_docs

icecraft há 10 meses atrás
pai
commit
87a6c51c77

+ 0 - 58
next_docs/en/user_guide/quick_start/convert_docx.rst

@@ -1,58 +0,0 @@
-
-Convert DocX
-=============
-
-.. admonition:: Warning
-    :class: tip
-
-    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
-
-    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
-
-
-Command Line
-^^^^^^^^^^^^^
-
-.. code:: python
-
-    # make sure the file have correct suffix
-    magic-pdf -p a.docx -o output -m auto
-
-
-API
-^^^^^
-
-.. code:: python
-
-    import os
-
-    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
-    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-    from magic_pdf.data.read_api import read_local_office
-
-    # prepare env
-    local_image_dir, local_md_dir = "output/images", "output"
-    image_dir = str(os.path.basename(local_image_dir))
-
-    os.makedirs(local_image_dir, exist_ok=True)
-
-    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
-        local_md_dir
-    )
-
-    # proc
-    ## Create Dataset Instance
-    input_file = "some_docx.docx"     # replace with real ms-office file
-
-    input_file_name = input_file.split(".")[0]
-    ds = read_local_office(input_file)[0]
-
-    # ocr mode
-    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
-        md_writer, f"{input_file_name}.md", image_dir
-    )
-
-    # txt mode
-    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
-        md_writer, f"{input_file_name}.md", image_dir
-    )

+ 0 - 5
next_docs/en/user_guide/quick_start/convert_image.rst

@@ -45,8 +45,3 @@ API
     ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
         md_writer, f"{input_file_name}.md", image_dir
     )
-
-    # txt mode
-    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
-        md_writer, f"{input_file_name}.md", image_dir
-    )

+ 11 - 10
next_docs/en/user_guide/quick_start/convert_doc.rst → next_docs/en/user_guide/quick_start/convert_ms_office.rst

@@ -17,7 +17,7 @@ Command Line
 
 .. code:: python
 
-    # make sure the file have correct suffix
+    # replace with real ms-office file, we support MS-DOC, MS-DOCX, MS-PPT, MS-PPTX now
     magic-pdf -p a.doc -o output -m auto
 
 
@@ -30,6 +30,8 @@ API
     from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
     from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
     from magic_pdf.data.read_api import read_local_office
+    from magic_pdf.config.enums import SupportedPdfParseMethod
+
 
     # prepare env
     local_image_dir, local_md_dir = "output/images", "output"
@@ -43,17 +45,16 @@ API
 
     # proc
     ## Create Dataset Instance
-    input_file = "some_doc.doc"     # replace with real ms-office file
+    input_file = "some_doc.doc"     # replace with real ms-office file, we support MS-DOC, MS-DOCX, MS-PPT, MS-PPTX now
 
     input_file_name = input_file.split(".")[0]
     ds = read_local_office(input_file)[0]
 
-    # ocr mode
-    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
-        md_writer, f"{input_file_name}.md", image_dir
-    )
 
-    # txt mode
-    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
-        md_writer, f"{input_file_name}.md", image_dir
-    )
+    ## inference
+    if ds.classify() == SupportedPdfParseMethod.OCR:
+        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir)
+    else:
+        ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir)

+ 5 - 4
next_docs/en/user_guide/quick_start/convert_pdf.rst

@@ -44,12 +44,13 @@ API
     ## Create Dataset Instance
     ds = PymuDocDataset(pdf_bytes)
 
-    # ocr mode
-    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+    ## inference
+    if ds.classify() == SupportedPdfParseMethod.OCR:
+        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
         md_writer, f"{name_without_suff}.md", image_dir
     )
 
-    # txt mode
-    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
+    else:
+        ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer).dump_md(
         md_writer, f"{name_without_suff}.md", image_dir
     )

+ 0 - 58
next_docs/en/user_guide/quick_start/convert_ppt.rst

@@ -1,58 +0,0 @@
-
-
-Convert PPT
-============
-
-.. admonition:: Warning
-    :class: tip
-
-    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
-
-    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
-
-Command Line
-^^^^^^^^^^^^^
-
-.. code:: python
-
-    # make sure the file have correct suffix
-    magic-pdf -p a.ppt -o output -m auto
-
-
-API
-^^^^^
-
-.. code:: python
-
-    import os
-
-    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
-    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-    from magic_pdf.data.read_api import read_local_office
-
-    # prepare env
-    local_image_dir, local_md_dir = "output/images", "output"
-    image_dir = str(os.path.basename(local_image_dir))
-
-    os.makedirs(local_image_dir, exist_ok=True)
-
-    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
-        local_md_dir
-    )
-
-    # proc
-    ## Create Dataset Instance
-    input_file = "some_ppt.ppt"     # replace with real ms-office file
-
-    input_file_name = input_file.split(".")[0]
-    ds = read_local_office(input_file)[0]
-
-    # ocr mode
-    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
-        md_writer, f"{input_file_name}.md", image_dir
-    )
-
-    # txt mode
-    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
-        md_writer, f"{input_file_name}.md", image_dir
-    )

+ 0 - 61
next_docs/en/user_guide/quick_start/convert_pptx.rst

@@ -1,61 +0,0 @@
-
-
-Convert PPTX
-=================
-
-.. admonition:: Warning
-    :class: tip
-
-    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
-
-    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
-
-
-Command Line
-^^^^^^^^^^^^^
-
-.. code:: python
-
-    # make sure the file have correct suffix
-    magic-pdf -p a.pptx -o output -m auto
-
-
-
-
-API
-^^^^^^
-
-.. code:: python
-
-    import os
-
-    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
-    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-    from magic_pdf.data.read_api import read_local_office
-
-    # prepare env
-    local_image_dir, local_md_dir = "output/images", "output"
-    image_dir = str(os.path.basename(local_image_dir))
-
-    os.makedirs(local_image_dir, exist_ok=True)
-
-    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
-        local_md_dir
-    )
-
-    # proc
-    ## Create Dataset Instance
-    input_file = "some_pptx.pptx"     # replace with real ms-office file
-
-    input_file_name = input_file.split(".")[0]
-    ds = read_local_office(input_file)[0]
-
-    # ocr mode
-    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
-        md_writer, f"{input_file_name}.md", image_dir
-    )
-
-    # txt mode
-    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
-        md_writer, f"{input_file_name}.md", image_dir
-    )