# analysis_view.py (12 KB)

  1. import json
  2. import threading
  3. from multiprocessing import Process
  4. from pathlib import Path
  5. from flask import request, current_app, url_for
  6. from flask_restful import Resource
  7. from .ext import find_file, task_state_map
  8. # from .formula_ext import formula_detection, formula_recognition
  9. from .serialization import AnalysisViewSchema
  10. from marshmallow import ValidationError
  11. from ..extentions import db
  12. from .models import AnalysisTask, AnalysisPdf
  13. from .pdf_ext import analysis_pdf_task
  14. from common.custom_response import generate_response
  15. class AnalysisTaskProgressView(Resource):
  16. def get(self):
  17. """
  18. 获取任务进度
  19. :return:
  20. """
  21. params = request.args
  22. id = params.get('id')
  23. analysis_task = AnalysisTask.query.filter(AnalysisTask.id == id).first()
  24. if not analysis_task:
  25. return generate_response(code=400, msg="Invalid ID", msgZH="无效id")
  26. match analysis_task.task_type:
  27. case 'pdf':
  28. analysis_pdf = AnalysisPdf.query.filter(AnalysisPdf.id == analysis_task.analysis_pdf_id).first()
  29. file_url = url_for('analysis.uploadpdfview', filename=analysis_task.file_name, as_attachment=False)
  30. file_name_split = analysis_task.file_name.split("_")
  31. file_name = file_name_split[-1] if file_name_split else analysis_task.file_name
  32. if analysis_task.status == 0:
  33. data = {
  34. "state": task_state_map.get(analysis_task.status),
  35. "status": analysis_pdf.status,
  36. "url": file_url,
  37. "fileName": file_name,
  38. "file_key": analysis_task.file_key,
  39. "content": [],
  40. "markdownUrl": [],
  41. "fullMdLink": "",
  42. "type": analysis_task.task_type,
  43. }
  44. return generate_response(data=data)
  45. elif analysis_task.status == 1:
  46. if analysis_pdf.status == 1: # 任务正常完成
  47. bbox_info = json.loads(analysis_pdf.bbox_info)
  48. md_link_list = json.loads(analysis_pdf.md_link_list)
  49. full_md_link = analysis_pdf.full_md_link
  50. data = {
  51. "state": task_state_map.get(analysis_task.status),
  52. "status": analysis_pdf.status,
  53. "url": file_url,
  54. "fileName": file_name,
  55. "file_key": analysis_task.file_key,
  56. "content": bbox_info,
  57. "markdownUrl": md_link_list,
  58. "fullMdLink": full_md_link,
  59. "type": analysis_task.task_type,
  60. }
  61. return generate_response(data=data)
  62. else: # 任务异常结束
  63. data = {
  64. "state": "failed",
  65. "status": analysis_pdf.status,
  66. "url": file_url,
  67. "fileName": file_name,
  68. "file_key": analysis_task.file_key,
  69. "content": [],
  70. "markdownUrl": [],
  71. "fullMdLink": "",
  72. "type": analysis_task.task_type,
  73. }
  74. return generate_response(code=-60004, data=data, msg="Failed to retrieve PDF parsing progress",
  75. msgZh="无法获取PDF解析进度")
  76. else:
  77. data = {
  78. "state": task_state_map.get(analysis_task.status),
  79. "status": analysis_pdf.status,
  80. "url": file_url,
  81. "fileName": file_name,
  82. "file_key": analysis_task.file_key,
  83. "content": [],
  84. "markdownUrl": [],
  85. "fullMdLink": "",
  86. "type": analysis_task.task_type,
  87. }
  88. return generate_response(data=data)
  89. case 'formula-detect':
  90. return generate_response(code=400, msg="Not yet supported", msgZH="功能待开发")
  91. case 'formula-extract':
  92. return generate_response(code=400, msg="Not yet supported", msgZH="功能待开发")
  93. case 'table-recogn':
  94. return generate_response(code=400, msg="Not yet supported", msgZH="功能待开发")
  95. case _:
  96. return generate_response(code=400, msg="Not yet supported", msgZH="参数不支持")
  97. class AnalysisTaskView(Resource):
  98. def post(self):
  99. """
  100. 提交任务
  101. :return:
  102. """
  103. analysis_view_schema = AnalysisViewSchema()
  104. try:
  105. params = analysis_view_schema.load(request.get_json())
  106. except ValidationError as err:
  107. return generate_response(code=400, msg=err.messages)
  108. file_key = params.get("fileKey")
  109. file_name = params.get("fileName")
  110. task_type = params.get("taskType")
  111. is_ocr = params.get("isOcr", False)
  112. pdf_upload_folder = current_app.config['PDF_UPLOAD_FOLDER']
  113. upload_dir = f"{current_app.static_folder}/{pdf_upload_folder}"
  114. file_path = find_file(file_key, upload_dir)
  115. match task_type:
  116. case 'pdf':
  117. if not file_path:
  118. return generate_response(code=400, msg="FileKey is invalid, no PDF file found",
  119. msgZH="fileKey无效,未找到pdf文件")
  120. analysis_task = AnalysisTask.query.filter(AnalysisTask.status.in_([0, 2])).first()
  121. file_name = Path(file_path).name
  122. with db.auto_commit():
  123. analysis_pdf_object = AnalysisPdf(
  124. file_name=file_name,
  125. file_path=file_path,
  126. status=3 if analysis_task else 0,
  127. )
  128. db.session.add(analysis_pdf_object)
  129. db.session.flush()
  130. analysis_pdf_id = analysis_pdf_object.id
  131. with db.auto_commit():
  132. analysis_task_object = AnalysisTask(
  133. file_key=file_key,
  134. file_name=file_name,
  135. task_type=task_type,
  136. is_ocr=is_ocr,
  137. status=2 if analysis_task else 0,
  138. analysis_pdf_id=analysis_pdf_id
  139. )
  140. db.session.add(analysis_task_object)
  141. db.session.flush()
  142. analysis_task_id = analysis_task_object.id
  143. if not analysis_task: # 已有同类型任务在执行,请等待执行完成
  144. file_stem = Path(file_path).stem
  145. pdf_analysis_folder = current_app.config['PDF_ANALYSIS_FOLDER']
  146. pdf_dir = f"{current_app.static_folder}/{pdf_analysis_folder}/{file_stem}"
  147. image_dir = f"{pdf_dir}/images"
  148. t = threading.Thread(target=analysis_pdf_task,
  149. args=(pdf_dir, image_dir, file_path, is_ocr, analysis_pdf_id))
  150. t.start()
  151. # 生成文件的URL路径
  152. file_url = url_for('analysis.uploadpdfview', filename=file_name, as_attachment=False)
  153. data = {
  154. "url": file_url,
  155. "fileName": file_name,
  156. "id": analysis_task_id
  157. }
  158. return generate_response(data=data)
  159. case 'formula-detect':
  160. # if not file_path:
  161. # return generate_response(code=400, msg="FileKey is invalid, no image file found",
  162. # msgZH="fileKey无效,未找到图片")
  163. # return formula_detection(file_path, upload_dir)
  164. return generate_response(code=400, msg="Not yet supported", msgZH="功能待开发")
  165. case 'formula-extract':
  166. # if not file_path:
  167. # return generate_response(code=400, msg="FileKey is invalid, no image file found",
  168. # msgZH="fileKey无效,未找到图片")
  169. # return formula_recognition(file_path, upload_dir)
  170. return generate_response(code=400, msg="Not yet supported", msgZH="功能待开发")
  171. case 'table-recogn':
  172. return generate_response(code=400, msg="Not yet supported", msgZH="功能待开发")
  173. case _:
  174. return generate_response(code=400, msg="Not yet supported", msgZH="参数不支持")
  175. def put(self):
  176. """
  177. 重新发起任务
  178. :return:
  179. """
  180. params = json.loads(request.data)
  181. id = params.get('id')
  182. analysis_task = AnalysisTask.query.filter(AnalysisTask.id == id).first()
  183. if not analysis_task:
  184. return generate_response(code=400, msg="Invalid ID", msgZH="无效id")
  185. match analysis_task.task_type:
  186. case 'pdf':
  187. task_r_p = AnalysisTask.query.filter(AnalysisTask.status.in_([0, 2])).first()
  188. if task_r_p:
  189. with db.auto_commit():
  190. analysis_pdf_object = AnalysisPdf.query.filter_by(id=analysis_task.analysis_pdf_id).first()
  191. analysis_pdf_object.status = 3
  192. db.session.add(analysis_pdf_object)
  193. with db.auto_commit():
  194. analysis_task.status = 2
  195. db.session.add(analysis_task)
  196. else:
  197. with db.auto_commit():
  198. analysis_pdf_object = AnalysisPdf.query.filter_by(id=analysis_task.analysis_pdf_id).first()
  199. analysis_pdf_object.status = 0
  200. db.session.add(analysis_pdf_object)
  201. with db.auto_commit():
  202. analysis_task.status = 0
  203. db.session.add(analysis_task)
  204. pdf_upload_folder = current_app.config['PDF_UPLOAD_FOLDER']
  205. upload_dir = f"{current_app.static_folder}/{pdf_upload_folder}"
  206. file_path = find_file(analysis_task.file_key, upload_dir)
  207. file_stem = Path(file_path).stem
  208. pdf_analysis_folder = current_app.config['PDF_ANALYSIS_FOLDER']
  209. pdf_dir = f"{current_app.static_folder}/{pdf_analysis_folder}/{file_stem}"
  210. image_dir = f"{pdf_dir}/images"
  211. process = Process(target=analysis_pdf_task,
  212. args=(pdf_dir, image_dir, file_path, analysis_task.is_ocr,
  213. analysis_task.analysis_pdf_id))
  214. process.start()
  215. # 生成文件的URL路径
  216. file_url = url_for('analysis.uploadpdfview', filename=analysis_task.file_name, as_attachment=False)
  217. file_name_split = analysis_task.file_name.split("_")
  218. new_file_name = file_name_split[-1] if file_name_split else analysis_task.file_name
  219. data = {
  220. "url": file_url,
  221. "fileName": new_file_name,
  222. "id": analysis_task.id
  223. }
  224. return generate_response(data=data)
  225. case 'formula-detect':
  226. return generate_response(code=400, msg="Not yet supported", msgZH="功能待开发")
  227. case 'formula-extract':
  228. return generate_response(code=400, msg="Not yet supported", msgZH="功能待开发")
  229. case 'table-recogn':
  230. return generate_response(code=400, msg="Not yet supported", msgZH="功能待开发")
  231. case _:
  232. return generate_response(code=400, msg="Not yet supported", msgZH="参数不支持")