app.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
import copy
import json
import os
from tempfile import NamedTemporaryFile, gettempdir

import magic_pdf.model as model_config
import uvicorn
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import JSONResponse
from loguru import logger
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
# Flag read back by pdf_parse_main: when it is truthy and the request supplies
# no model JSON, the handler calls pipe.pipe_analyze() instead of rejecting
# the request.
model_config.__use_inside_model__ = True
# Application object; the /pdf_parse route is registered on it via @app.post.
app = FastAPI()
  16. def json_md_dump(
  17. pipe,
  18. md_writer,
  19. pdf_name,
  20. content_list,
  21. md_content,
  22. ):
  23. # Write model results to model.json
  24. orig_model_list = copy.deepcopy(pipe.model_list)
  25. md_writer.write(
  26. content=json.dumps(orig_model_list, ensure_ascii=False, indent=4),
  27. path=f"{pdf_name}_model.json"
  28. )
  29. # Write intermediate results to middle.json
  30. md_writer.write(
  31. content=json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
  32. path=f"{pdf_name}_middle.json"
  33. )
  34. # Write text content results to content_list.json
  35. md_writer.write(
  36. content=json.dumps(content_list, ensure_ascii=False, indent=4),
  37. path=f"{pdf_name}_content_list.json"
  38. )
  39. # Write results to .md file
  40. md_writer.write(
  41. content=md_content,
  42. path=f"{pdf_name}.md"
  43. )
  44. @app.post("/pdf_parse", tags=["projects"], summary="Parse PDF file")
  45. async def pdf_parse_main(
  46. pdf_file: UploadFile = File(...),
  47. parse_method: str = 'auto',
  48. model_json_path: str = None,
  49. is_json_md_dump: bool = True,
  50. output_dir: str = "output"
  51. ):
  52. """
  53. Execute the process of converting PDF to JSON and MD, outputting MD and JSON files to the specified directory
  54. :param pdf_file: The PDF file to be parsed
  55. :param parse_method: Parsing method, can be auto, ocr, or txt. Default is auto. If results are not satisfactory, try ocr
  56. :param model_json_path: Path to existing model data file. If empty, use built-in model. PDF and model_json must correspond
  57. :param is_json_md_dump: Whether to write parsed data to .json and .md files. Default is True. Different stages of data will be written to different .json files (3 in total), md content will be saved to .md file
  58. :param output_dir: Output directory for results. A folder named after the PDF file will be created to store all results
  59. """
  60. try:
  61. # Create a temporary file to store the uploaded PDF
  62. with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
  63. temp_pdf.write(await pdf_file.read())
  64. temp_pdf_path = temp_pdf.name
  65. pdf_name = os.path.basename(pdf_file.filename).split(".")[0]
  66. if output_dir:
  67. output_path = os.path.join(output_dir, pdf_name)
  68. else:
  69. output_path = os.path.join(os.path.dirname(temp_pdf_path), pdf_name)
  70. output_image_path = os.path.join(output_path, 'images')
  71. # Get parent path of images for relative path in .md and content_list.json
  72. image_path_parent = os.path.basename(output_image_path)
  73. pdf_bytes = open(temp_pdf_path, "rb").read() # Read binary data of PDF file
  74. if model_json_path:
  75. # Read original JSON data of PDF file parsed by model, list type
  76. model_json = json.loads(open(model_json_path, "r", encoding="utf-8").read())
  77. else:
  78. model_json = []
  79. # Execute parsing steps
  80. image_writer, md_writer = DiskReaderWriter(output_image_path), DiskReaderWriter(output_path)
  81. # Choose parsing method
  82. if parse_method == "auto":
  83. jso_useful_key = {"_pdf_type": "", "model_list": model_json}
  84. pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
  85. elif parse_method == "txt":
  86. pipe = TXTPipe(pdf_bytes, model_json, image_writer)
  87. elif parse_method == "ocr":
  88. pipe = OCRPipe(pdf_bytes, model_json, image_writer)
  89. else:
  90. logger.error("Unknown parse method, only auto, ocr, txt allowed")
  91. return JSONResponse(content={"error": "Invalid parse method"}, status_code=400)
  92. # Execute classification
  93. pipe.pipe_classify()
  94. # If no model data is provided, use built-in model for parsing
  95. if not model_json:
  96. if model_config.__use_inside_model__:
  97. pipe.pipe_analyze() # Parse
  98. else:
  99. logger.error("Need model list input")
  100. return JSONResponse(content={"error": "Model list input required"}, status_code=400)
  101. # Execute parsing
  102. pipe.pipe_parse()
  103. # Save results in text and md format
  104. content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode="none")
  105. md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode="none")
  106. if is_json_md_dump:
  107. json_md_dump(pipe, md_writer, pdf_name, content_list, md_content)
  108. data = {"layout": copy.deepcopy(pipe.model_list), "info": pipe.pdf_mid_data, "content_list": content_list,'md_content':md_content}
  109. return JSONResponse(data, status_code=200)
  110. except Exception as e:
  111. logger.exception(e)
  112. return JSONResponse(content={"error": str(e)}, status_code=500)
  113. finally:
  114. # Clean up the temporary file
  115. if 'temp_pdf_path' in locals():
  116. os.unlink(temp_pdf_path)
  117. # if __name__ == '__main__':
  118. # uvicorn.run(app, host="0.0.0.0", port=8888)