# app.py
  1. import copy
  2. import json
  3. import os
  4. from tempfile import NamedTemporaryFile
  5. import uvicorn
  6. from fastapi import FastAPI, File, UploadFile
  7. from fastapi.responses import JSONResponse
  8. from loguru import logger
  9. import magic_pdf.model as model_config
  10. from magic_pdf.config.enums import SupportedPdfParseMethod
  11. from magic_pdf.data.data_reader_writer import FileBasedDataWriter
  12. from magic_pdf.data.dataset import PymuDocDataset
  13. from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  14. from magic_pdf.model.operators import InferenceResult
  15. model_config.__use_inside_model__ = True
  16. app = FastAPI()
  17. def json_md_dump(
  18. model_json,
  19. middle_json,
  20. md_writer,
  21. pdf_name,
  22. content_list,
  23. md_content,
  24. ):
  25. # Write model results to model.json
  26. orig_model_list = copy.deepcopy(model_json)
  27. md_writer.write_string(
  28. f'{pdf_name}_model.json',
  29. json.dumps(orig_model_list, ensure_ascii=False, indent=4),
  30. )
  31. # Write intermediate results to middle.json
  32. md_writer.write_string(
  33. f'{pdf_name}_middle.json',
  34. json.dumps(middle_json, ensure_ascii=False, indent=4),
  35. )
  36. # Write text content results to content_list.json
  37. md_writer.write_string(
  38. f'{pdf_name}_content_list.json',
  39. json.dumps(content_list, ensure_ascii=False, indent=4),
  40. )
  41. # Write results to .md file
  42. md_writer.write_string(
  43. f'{pdf_name}.md',
  44. md_content,
  45. )
  46. @app.post('/pdf_parse', tags=['projects'], summary='Parse PDF file')
  47. async def pdf_parse_main(
  48. pdf_file: UploadFile = File(...),
  49. parse_method: str = 'auto',
  50. model_json_path: str = None,
  51. is_json_md_dump: bool = True,
  52. output_dir: str = 'output',
  53. ):
  54. """Execute the process of converting PDF to JSON and MD, outputting MD and
  55. JSON files to the specified directory.
  56. :param pdf_file: The PDF file to be parsed
  57. :param parse_method: Parsing method, can be auto, ocr, or txt. Default is auto. If results are not satisfactory, try ocr
  58. :param model_json_path: Path to existing model data file. If empty, use built-in model. PDF and model_json must correspond
  59. :param is_json_md_dump: Whether to write parsed data to .json and .md files. Default is True. Different stages of data will be written to different .json files (3 in total), md content will be saved to .md file # noqa E501
  60. :param output_dir: Output directory for results. A folder named after the PDF file will be created to store all results
  61. """
  62. try:
  63. # Create a temporary file to store the uploaded PDF
  64. with NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
  65. temp_pdf.write(await pdf_file.read())
  66. temp_pdf_path = temp_pdf.name
  67. pdf_name = os.path.basename(pdf_file.filename).split('.')[0]
  68. if output_dir:
  69. output_path = os.path.join(output_dir, pdf_name)
  70. else:
  71. output_path = os.path.join(os.path.dirname(temp_pdf_path), pdf_name)
  72. output_image_path = os.path.join(output_path, 'images')
  73. # Get parent path of images for relative path in .md and content_list.json
  74. image_path_parent = os.path.basename(output_image_path)
  75. pdf_bytes = open(temp_pdf_path, 'rb').read() # Read binary data of PDF file
  76. if model_json_path:
  77. # Read original JSON data of PDF file parsed by model, list type
  78. model_json = json.loads(open(model_json_path, 'r', encoding='utf-8').read())
  79. else:
  80. model_json = []
  81. # Execute parsing steps
  82. image_writer, md_writer = FileBasedDataWriter(
  83. output_image_path
  84. ), FileBasedDataWriter(output_path)
  85. ds = PymuDocDataset(pdf_bytes)
  86. # Choose parsing method
  87. if parse_method == 'auto':
  88. if ds.classify() == SupportedPdfParseMethod.OCR:
  89. parse_method = 'ocr'
  90. else:
  91. parse_method = 'txt'
  92. if parse_method not in ['txt', 'ocr']:
  93. logger.error('Unknown parse method, only auto, ocr, txt allowed')
  94. return JSONResponse(
  95. content={'error': 'Invalid parse method'}, status_code=400
  96. )
  97. if len(model_json) == 0:
  98. if parse_method == 'ocr':
  99. infer_result = ds.apply(doc_analyze, ocr=True)
  100. else:
  101. infer_result = ds.apply(doc_analyze, ocr=False)
  102. else:
  103. infer_result = InferenceResult(model_json, ds)
  104. if len(model_json) == 0 and not model_config.__use_inside_model__:
  105. logger.error('Need model list input')
  106. return JSONResponse(
  107. content={'error': 'Model list input required'}, status_code=400
  108. )
  109. if parse_method == 'ocr':
  110. pipe_res = infer_result.pipe_ocr_mode(image_writer)
  111. else:
  112. pipe_res = infer_result.pipe_txt_mode(image_writer)
  113. # Save results in text and md format
  114. content_list = pipe_res.get_content_list(image_path_parent, drop_mode='none')
  115. md_content = pipe_res.get_markdown(image_path_parent, drop_mode='none')
  116. if is_json_md_dump:
  117. json_md_dump(infer_result._infer_res, pipe_res._pipe_res, md_writer, pdf_name, content_list, md_content)
  118. data = {
  119. 'layout': copy.deepcopy(infer_result._infer_res),
  120. 'info': pipe_res._pipe_res,
  121. 'content_list': content_list,
  122. 'md_content': md_content,
  123. }
  124. return JSONResponse(data, status_code=200)
  125. except Exception as e:
  126. logger.exception(e)
  127. return JSONResponse(content={'error': str(e)}, status_code=500)
  128. finally:
  129. # Clean up the temporary file
  130. if 'temp_pdf_path' in locals():
  131. os.unlink(temp_pdf_path)
  132. if __name__ == '__main__':
  133. uvicorn.run(app, host='0.0.0.0', port=8888)