Bladeren bron

feat: support convert ppt/pptx/doc/docx

xu rui 11 maanden geleden
bovenliggende
commit
f6af67eb11

+ 33 - 1
magic_pdf/data/read_api.py

@@ -1,12 +1,14 @@
 import json
 import os
+import tempfile
+import shutil
 from pathlib import Path
 
 from magic_pdf.config.exceptions import EmptyData, InvalidParams
 from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
                                                MultiBucketS3DataReader)
 from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
-
+from magic_pdf.utils.office_to_pdf import convert_file_to_pdf, ConvertToPdfError
 
 def read_jsonl(
     s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
@@ -71,6 +73,36 @@ def read_local_pdfs(path: str) -> list[PymuDocDataset]:
         bits = reader.read(path)
         return [PymuDocDataset(bits)]
 
+def read_local_office(path: str) -> list[PymuDocDataset]:
+    """Read ms-office file (ppt, pptx, doc, docx) from path or directory.
+
+    Args:
+        path (str): ms-office file or directory that contains ms-office files
+
+    Returns:
+        list[PymuDocDataset]: each ms-office file will converted to a PymuDocDataset
+    """
+    suffixes = ['ppt', 'pptx', 'doc', 'docx']
+    fns = []
+    ret = []
+    if os.path.isdir(path):
+        for root, _, files in os.walk(path):
+            for file in files:
+                suffix = file.split('.')
+                if suffix[-1] in suffixes:
+                    fns.append((os.path.join(root, file)))
+    else:
+        fns.append(path)
+        
+    reader = FileBasedDataReader()
+    temp_dir = tempfile.mkdtemp()
+    for fn in fns:
+        convert_file_to_pdf(fn, temp_dir)
+        fn_path = Path(fn)
+        pdf_fn = f"{temp_dir}/{fn_path.stem}.pdf"
+        ret.append(PymuDocDataset(reader.read(pdf_fn)))
+    shutil.rmtree(temp_dir)
+    return ret
 
 def read_local_images(path: str, suffixes: list[str]=[]) -> list[ImageDataset]:
     """Read images from path or directory.

+ 29 - 0
magic_pdf/utils/office_to_pdf.py

@@ -0,0 +1,29 @@
+import os
+import subprocess
+from pathlib import Path
+
+
+class ConvertToPdfError(Exception):
+    """Raised by convert_file_to_pdf when the conversion to PDF fails."""
+    def __init__(self, msg):
+        # Keep the message on the instance as well as passing it to Exception.
+        self.msg = msg
+        super().__init__(self.msg)
+
+
+def convert_file_to_pdf(input_path, output_dir):
+    if not os.path.isfile(input_path):
+        raise FileNotFoundError(f"The input file {input_path} does not exist.")
+
+    os.makedirs(output_dir, exist_ok=True)
+    
+    cmd = [
+        'soffice',
+        '--headless',
+        '--convert-to', 'pdf',
+        '--outdir', str(output_dir),
+        str(input_path)
+    ]
+    
+    process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    
+    if process.returncode != 0:
+        raise ConvertToPdfError(process.stderr.decode())

+ 4 - 1
next_docs/en/user_guide/install/config.rst

@@ -153,5 +153,8 @@ config_version
 The version of config schema.
 
 
+.. admonition:: Tip
+    :class: tip
+    
+    Check `Config Schema <https://github.com/opendatalab/MinerU/blob/master/magic-pdf.template.json>`_ for the latest config schema.
 
-Check `Config Schema <https://github.com/opendatalab/MinerU/blob/master/magic-pdf.template.json>`_ for the latest config schema.

+ 28 - 2
next_docs/en/user_guide/install/install.rst

@@ -89,7 +89,7 @@ Also you can try `online demo <https://www.modelscope.cn/studios/OpenDataLab/Min
 
 
 Create an environment
-~~~~~~~~~~~~~~~~~~~~~
+---------------------------
 
 .. code-block:: shell
 
@@ -99,7 +99,7 @@ Create an environment
 
 
 Download model weight files
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+------------------------------
 
 .. code-block:: shell
 
@@ -108,6 +108,32 @@ Download model weight files
     python download_models_hf.py    
 
 
+
+Install LibreOffice[Optional]
+----------------------------------
+
+This section is required to handle **doc**, **docx**, **ppt**, and **pptx** files. You can skip this section if you do not need to process those file types.
+
+
+Linux/macOS Platform
+""""""""""""""""""""""
+
+.. code::
+
+    apt-get/yum/brew install libreoffice
+
+
+Windows Platform 
+""""""""""""""""""""
+
+.. code::
+
+    install libreoffice 
+    append "install_dir\LibreOffice\program" to ENVIRONMENT PATH
+
+
 .. tip::
 
     MinerU is now installed. Check out :doc:`../quick_start/command_line` to convert your first PDF, **or** read the following sections for more details about installation.
+
+