pdf_ext.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. import json
  2. import os
  3. import shutil
  4. import traceback
  5. from pathlib import Path
  6. from common.error_types import ApiException
  7. from common.mk_markdown.mk_markdown import \
  8. ocr_mk_mm_markdown_with_para_and_pagination
  9. from flask import current_app, url_for
  10. from loguru import logger
  11. import magic_pdf.model as model_config
  12. from magic_pdf.data.data_reader_writer import FileBasedDataWriter
  13. from magic_pdf.libs.json_compressor import JsonCompressor
  14. from magic_pdf.pipe.UNIPipe import UNIPipe
  15. from ..extensions import app, db
  16. from .ext import find_file
  17. from .models import AnalysisPdf, AnalysisTask
  18. model_config.__use_inside_model__ = True
  19. def analysis_pdf(image_url_prefix, image_dir, pdf_bytes, is_ocr=False):
  20. try:
  21. model_json = [] # model_json传空list使用内置模型解析
  22. logger.info(f'is_ocr: {is_ocr}')
  23. if not is_ocr:
  24. jso_useful_key = {'_pdf_type': '', 'model_list': model_json}
  25. image_writer = FileBasedDataWriter(image_dir)
  26. pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
  27. pipe.pipe_classify()
  28. else:
  29. jso_useful_key = {'_pdf_type': 'ocr', 'model_list': model_json}
  30. image_writer = FileBasedDataWriter(image_dir)
  31. pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
  32. """如果没有传入有效的模型数据,则使用内置model解析"""
  33. if len(model_json) == 0:
  34. if model_config.__use_inside_model__:
  35. pipe.pipe_analyze()
  36. else:
  37. logger.error('need model list input')
  38. exit(1)
  39. pipe.pipe_parse()
  40. pdf_mid_data = JsonCompressor.decompress_json(pipe.get_compress_pdf_mid_data())
  41. pdf_info_list = pdf_mid_data['pdf_info']
  42. md_content = json.dumps(ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_list, image_url_prefix),
  43. ensure_ascii=False)
  44. bbox_info = get_bbox_info(pdf_info_list)
  45. return md_content, bbox_info
  46. except Exception as e: # noqa: F841
  47. logger.error(traceback.format_exc())
  48. def get_bbox_info(data):
  49. bbox_info = []
  50. for page in data:
  51. preproc_blocks = page.get('preproc_blocks', [])
  52. discarded_blocks = page.get('discarded_blocks', [])
  53. bbox_info.append({
  54. 'preproc_blocks': preproc_blocks,
  55. 'page_idx': page.get('page_idx'),
  56. 'page_size': page.get('page_size'),
  57. 'discarded_blocks': discarded_blocks,
  58. })
  59. return bbox_info
  60. def analysis_pdf_task(pdf_dir, image_dir, pdf_path, is_ocr, analysis_pdf_id):
  61. """解析pdf.
  62. :param pdf_dir: pdf解析目录
  63. :param image_dir: 图片目录
  64. :param pdf_path: pdf路径
  65. :param is_ocr: 是否启用ocr
  66. :param analysis_pdf_id: pdf解析表id
  67. :return:
  68. """
  69. try:
  70. logger.info(f'start task: {pdf_path}')
  71. logger.info(f'image_dir: {image_dir}')
  72. if not Path(image_dir).exists():
  73. Path(image_dir).mkdir(parents=True, exist_ok=True)
  74. else:
  75. # 清空image_dir,避免同文件多次解析图片积累
  76. shutil.rmtree(image_dir, ignore_errors=True)
  77. os.makedirs(image_dir, exist_ok=True)
  78. # 获取文件内容
  79. with open(pdf_path, 'rb') as file:
  80. pdf_bytes = file.read()
  81. # 生成图片链接
  82. with app.app_context():
  83. image_url_prefix = f"http://{current_app.config['SERVER_NAME']}{current_app.config['FILE_API']}&pdf={Path(pdf_path).name}&filename="
  84. # 解析文件
  85. md_content, bbox_info = analysis_pdf(image_url_prefix, image_dir, pdf_bytes, is_ocr)
  86. # ############ markdown #############
  87. pdf_name = Path(pdf_path).name
  88. full_md_content = ''
  89. for item in json.loads(md_content):
  90. full_md_content += item['md_content'] + '\n'
  91. full_md_name = 'full.md'
  92. with open(f'{pdf_dir}/{full_md_name}', 'w', encoding='utf-8') as file:
  93. file.write(full_md_content)
  94. with app.app_context():
  95. full_md_link = url_for('analysis.mdview', filename=full_md_name, as_attachment=False)
  96. full_md_link = f'{full_md_link}&pdf={pdf_name}'
  97. md_link_list = []
  98. with app.app_context():
  99. for n, md in enumerate(json.loads(md_content)):
  100. md_content = md['md_content']
  101. md_name = f"{md.get('page_no', n)}.md"
  102. with open(f'{pdf_dir}/{md_name}', 'w', encoding='utf-8') as file:
  103. file.write(md_content)
  104. md_url = url_for('analysis.mdview', filename=md_name, as_attachment=False)
  105. md_link_list.append(f'{md_url}&pdf={pdf_name}')
  106. with app.app_context():
  107. with db.auto_commit():
  108. analysis_pdf_object = AnalysisPdf.query.filter_by(id=analysis_pdf_id).first()
  109. analysis_pdf_object.status = 1
  110. analysis_pdf_object.bbox_info = json.dumps(bbox_info, ensure_ascii=False)
  111. analysis_pdf_object.md_link_list = json.dumps(md_link_list, ensure_ascii=False)
  112. analysis_pdf_object.full_md_link = full_md_link
  113. db.session.add(analysis_pdf_object)
  114. with db.auto_commit():
  115. analysis_task_object = AnalysisTask.query.filter_by(analysis_pdf_id=analysis_pdf_id).first()
  116. analysis_task_object.status = 1
  117. db.session.add(analysis_task_object)
  118. logger.info('finished!')
  119. except Exception as e: # noqa: F841
  120. logger.error(traceback.format_exc())
  121. with app.app_context():
  122. with db.auto_commit():
  123. analysis_pdf_object = AnalysisPdf.query.filter_by(id=analysis_pdf_id).first()
  124. analysis_pdf_object.status = 2
  125. db.session.add(analysis_pdf_object)
  126. with db.auto_commit():
  127. analysis_task_object = AnalysisTask.query.filter_by(analysis_pdf_id=analysis_pdf_id).first()
  128. analysis_task_object.status = 1
  129. db.session.add(analysis_task_object)
  130. raise ApiException(code=500, msg='PDF parsing failed', msgZH='pdf解析失败')
  131. finally:
  132. # 执行pending
  133. with app.app_context():
  134. analysis_task_object = AnalysisTask.query.filter_by(status=2).order_by(
  135. AnalysisTask.update_date.asc()).first()
  136. if analysis_task_object:
  137. pdf_upload_folder = current_app.config['PDF_UPLOAD_FOLDER']
  138. upload_dir = f'{current_app.static_folder}/{pdf_upload_folder}'
  139. file_path = find_file(analysis_task_object.file_key, upload_dir)
  140. file_stem = Path(file_path).stem
  141. pdf_analysis_folder = current_app.config['PDF_ANALYSIS_FOLDER']
  142. pdf_dir = f'{current_app.static_folder}/{pdf_analysis_folder}/{file_stem}'
  143. image_dir = f'{pdf_dir}/images'
  144. with db.auto_commit():
  145. analysis_pdf_object = AnalysisPdf.query.filter_by(id=analysis_task_object.analysis_pdf_id).first()
  146. analysis_pdf_object.status = 0
  147. db.session.add(analysis_pdf_object)
  148. with db.auto_commit():
  149. analysis_task_object.status = 0
  150. db.session.add(analysis_task_object)
  151. analysis_pdf_task(pdf_dir, image_dir, file_path, analysis_task_object.is_ocr, analysis_task_object.analysis_pdf_id)
  152. else:
  153. logger.info('all task finished!')