Sfoglia il codice sorgente

Merge pull request #2086 from icecraft/fix/support_non_pdf_in_batch

fix: support non-pdf file in batch mode
Xiaomeng Zhao 7 mesi fa
parent
commit
14097d4ec9
1 file modificato con 4 aggiunte e 0 eliminazioni
  1. 4 0
      magic_pdf/tools/cli.py

+ 4 - 0
magic_pdf/tools/cli.py

@@ -137,6 +137,10 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
         doc_paths = []
         for doc_path in Path(path).glob('*'):
             if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
+                if doc_path.suffix not in ms_office_suffixes:
+                    basename = Path(doc_path).stem
+                    convert_file_to_pdf(str(doc_path), temp_dir)
+                    doc_path = Path(os.path.join(temp_dir, f'{basename}.pdf'))
                 doc_paths.append(doc_path)
         datasets = batch_build_dataset(doc_paths, 4, lang)
         batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method, debug_able, lang=lang)