cli.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
  1. import os
  2. import shutil
  3. import tempfile
  4. from pathlib import Path
  5. import click
  6. import fitz
  7. from loguru import logger
  8. import magic_pdf.model as model_config
  9. from magic_pdf.data.batch_build_dataset import batch_build_dataset
  10. from magic_pdf.data.data_reader_writer import FileBasedDataReader
  11. from magic_pdf.data.dataset import Dataset
  12. from magic_pdf.libs.version import __version__
  13. from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods
  14. from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
  15. pdf_suffixes = ['.pdf']
  16. ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
  17. image_suffixes = ['.png', '.jpeg', '.jpg']
  18. @click.command()
  19. @click.version_option(__version__,
  20. '--version',
  21. '-v',
  22. help='display the version and exit')
  23. @click.option(
  24. '-p',
  25. '--path',
  26. 'path',
  27. type=click.Path(exists=True),
  28. required=True,
  29. help='local filepath or directory. support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files',
  30. )
  31. @click.option(
  32. '-o',
  33. '--output-dir',
  34. 'output_dir',
  35. type=click.Path(),
  36. required=True,
  37. help='output local directory',
  38. )
  39. @click.option(
  40. '-m',
  41. '--method',
  42. 'method',
  43. type=parse_pdf_methods,
  44. help="""the method for parsing pdf.
  45. ocr: using ocr technique to extract information from pdf.
  46. txt: suitable for the text-based pdf only and outperform ocr.
  47. auto: automatically choose the best method for parsing pdf from ocr and txt.
  48. without method specified, auto will be used by default.""",
  49. default='auto',
  50. )
  51. @click.option(
  52. '-l',
  53. '--lang',
  54. 'lang',
  55. type=str,
  56. help="""
  57. Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
  58. You should input "Abbreviation" with language form url:
  59. https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
  60. """,
  61. default=None,
  62. )
  63. @click.option(
  64. '-d',
  65. '--debug',
  66. 'debug_able',
  67. type=bool,
  68. help='Enables detailed debugging information during the execution of the CLI commands.',
  69. default=False,
  70. )
  71. @click.option(
  72. '-s',
  73. '--start',
  74. 'start_page_id',
  75. type=int,
  76. help='The starting page for PDF parsing, beginning from 0.',
  77. default=0,
  78. )
  79. @click.option(
  80. '-e',
  81. '--end',
  82. 'end_page_id',
  83. type=int,
  84. help='The ending page for PDF parsing, beginning from 0.',
  85. default=None,
  86. )
  87. def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
  88. os.makedirs(output_dir, exist_ok=True)
  89. temp_dir = tempfile.mkdtemp()
  90. def read_fn(path: Path):
  91. if path.suffix in ms_office_suffixes:
  92. convert_file_to_pdf(str(path), temp_dir)
  93. fn = os.path.join(temp_dir, f'{path.stem}.pdf')
  94. elif path.suffix in image_suffixes:
  95. with open(str(path), 'rb') as f:
  96. bits = f.read()
  97. pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
  98. fn = os.path.join(temp_dir, f'{path.stem}.pdf')
  99. with open(fn, 'wb') as f:
  100. f.write(pdf_bytes)
  101. elif path.suffix in pdf_suffixes:
  102. fn = str(path)
  103. else:
  104. raise Exception(f'Unknown file suffix: {path.suffix}')
  105. disk_rw = FileBasedDataReader(os.path.dirname(fn))
  106. return disk_rw.read(os.path.basename(fn))
  107. def parse_doc(doc_path: Path, dataset: Dataset | None = None):
  108. try:
  109. file_name = str(Path(doc_path).stem)
  110. if dataset is None:
  111. pdf_data_or_dataset = read_fn(doc_path)
  112. else:
  113. pdf_data_or_dataset = dataset
  114. do_parse(
  115. output_dir,
  116. file_name,
  117. pdf_data_or_dataset,
  118. [],
  119. method,
  120. debug_able,
  121. start_page_id=start_page_id,
  122. end_page_id=end_page_id,
  123. lang=lang
  124. )
  125. except Exception as e:
  126. logger.exception(e)
  127. if os.path.isdir(path):
  128. doc_paths = []
  129. for doc_path in Path(path).glob('*'):
  130. if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
  131. if doc_path.suffix in ms_office_suffixes:
  132. convert_file_to_pdf(str(doc_path), temp_dir)
  133. doc_path = Path(os.path.join(temp_dir, f'{doc_path.stem}.pdf'))
  134. elif doc_path.suffix in image_suffixes:
  135. with open(str(doc_path), 'rb') as f:
  136. bits = f.read()
  137. pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
  138. fn = os.path.join(temp_dir, f'{doc_path.stem}.pdf')
  139. with open(fn, 'wb') as f:
  140. f.write(pdf_bytes)
  141. doc_path = Path(fn)
  142. doc_paths.append(doc_path)
  143. datasets = batch_build_dataset(doc_paths, 4, lang)
  144. batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method, debug_able, lang=lang)
  145. else:
  146. parse_doc(Path(path))
  147. shutil.rmtree(temp_dir)
  148. if __name__ == '__main__':
  149. cli()