ocr_pipeline.py 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251
  1. # 专门用来跑被drop的pdf,跑完之后需要把need_drop字段置为false
  2. import sys
  3. import time
  4. from loguru import logger
  5. from app.common.s3 import get_s3_config
  6. from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_mm_markdown, ocr_mk_nlp_markdown_with_para, \
  7. ocr_mk_mm_markdown_with_para_and_pagination, ocr_mk_mm_markdown_with_para, ocr_mk_mm_standard_format, \
  8. make_standard_format_with_para
  9. from magic_pdf.libs.commons import s3_image_save_path, formatted_time, join_path
  10. from magic_pdf.libs.json_compressor import JsonCompressor
  11. from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
  12. from magic_pdf.pipeline import get_data_source, exception_handler
  13. def ocr_dropped_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
  14. if not jso.get("need_drop", False):
  15. return jso
  16. else:
  17. jso = ocr_parse_pdf_core(
  18. jso, start_page_id=start_page_id, debug_mode=debug_mode
  19. )
  20. jso["need_drop"] = False
  21. return jso
  22. def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
  23. # 检测debug开关
  24. if debug_mode:
  25. pass
  26. else: # 如果debug没开,则检测是否有needdrop字段
  27. if jso.get("need_drop", False):
  28. return jso
  29. jso = ocr_parse_pdf_core(jso, start_page_id=start_page_id, debug_mode=debug_mode)
  30. return jso
  31. def ocr_parse_pdf_core(jso: dict, start_page_id=0, debug_mode=False) -> dict:
  32. s3_pdf_path = jso.get("file_location")
  33. s3_config = get_s3_config(s3_pdf_path)
  34. model_output_json_list = jso.get("doc_layout_result")
  35. data_source = get_data_source(jso)
  36. file_id = jso.get("file_id")
  37. book_name = f"{data_source}/{file_id}"
  38. try:
  39. save_path = s3_image_save_path
  40. image_s3_config = get_s3_config(save_path)
  41. start_time = time.time() # 记录开始时间
  42. # 先打印一下book_name和解析开始的时间
  43. logger.info(
  44. f"book_name is:{book_name},start_time is:{formatted_time(start_time)}",
  45. file=sys.stderr,
  46. )
  47. pdf_info_dict = parse_pdf_by_ocr(
  48. s3_pdf_path,
  49. s3_config,
  50. model_output_json_list,
  51. save_path,
  52. book_name,
  53. pdf_model_profile=None,
  54. image_s3_config=image_s3_config,
  55. start_page_id=start_page_id,
  56. debug_mode=debug_mode,
  57. )
  58. pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
  59. jso["pdf_intermediate_dict"] = pdf_info_dict
  60. end_time = time.time() # 记录完成时间
  61. parse_time = int(end_time - start_time) # 计算执行时间
  62. # 解析完成后打印一下book_name和耗时
  63. logger.info(
  64. f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}",
  65. file=sys.stderr,
  66. )
  67. jso["parse_time"] = parse_time
  68. except Exception as e:
  69. jso = exception_handler(jso, e)
  70. return jso
  71. def ocr_pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
  72. if debug_mode:
  73. pass
  74. else: # 如果debug没开,则检测是否有needdrop字段
  75. if jso.get("need_drop", False):
  76. book_name = join_path(get_data_source(jso), jso["file_id"])
  77. logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
  78. jso["dropped"] = True
  79. return jso
  80. try:
  81. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  82. # 将 pdf_intermediate_dict 解压
  83. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  84. markdown_content = ocr_mk_mm_markdown(pdf_intermediate_dict)
  85. jso["content"] = markdown_content
  86. logger.info(
  87. f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",
  88. file=sys.stderr,
  89. )
  90. # 把无用的信息清空
  91. jso["doc_layout_result"] = ""
  92. jso["pdf_intermediate_dict"] = ""
  93. jso["pdf_meta"] = ""
  94. except Exception as e:
  95. jso = exception_handler(jso, e)
  96. return jso
  97. def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False) -> dict:
  98. if debug_mode:
  99. pass
  100. else: # 如果debug没开,则检测是否有needdrop字段
  101. if jso.get("need_drop", False):
  102. book_name = join_path(get_data_source(jso), jso["file_id"])
  103. logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
  104. jso["dropped"] = True
  105. return jso
  106. try:
  107. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  108. # 将 pdf_intermediate_dict 解压
  109. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  110. # markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
  111. markdown_content = ocr_mk_nlp_markdown_with_para(pdf_intermediate_dict)
  112. jso["content"] = markdown_content
  113. logger.info(
  114. f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",
  115. file=sys.stderr,
  116. )
  117. # 把无用的信息清空
  118. jso["doc_layout_result"] = ""
  119. jso["pdf_intermediate_dict"] = ""
  120. jso["pdf_meta"] = ""
  121. except Exception as e:
  122. jso = exception_handler(jso, e)
  123. return jso
  124. def ocr_pdf_intermediate_dict_to_markdown_with_para_and_pagination(jso: dict, debug_mode=False) -> dict:
  125. if debug_mode:
  126. pass
  127. else: # 如果debug没开,则检测是否有needdrop字段
  128. if jso.get("need_drop", False):
  129. book_name = join_path(get_data_source(jso), jso["file_id"])
  130. logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
  131. jso["dropped"] = True
  132. return jso
  133. try:
  134. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  135. # 将 pdf_intermediate_dict 解压
  136. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  137. markdown_content = ocr_mk_mm_markdown_with_para_and_pagination(pdf_intermediate_dict)
  138. jso["content"] = markdown_content
  139. logger.info(
  140. f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",
  141. file=sys.stderr,
  142. )
  143. # 把无用的信息清空
  144. # jso["doc_layout_result"] = ""
  145. jso["pdf_intermediate_dict"] = ""
  146. # jso["pdf_meta"] = ""
  147. except Exception as e:
  148. jso = exception_handler(jso, e)
  149. return jso
  150. def ocr_pdf_intermediate_dict_to_markdown_with_para_for_qa(
  151. jso: dict, debug_mode=False
  152. ) -> dict:
  153. if debug_mode:
  154. pass
  155. else: # 如果debug没开,则检测是否有needdrop字段
  156. if jso.get("need_drop", False):
  157. book_name = join_path(get_data_source(jso), jso["file_id"])
  158. logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
  159. jso["dropped"] = True
  160. return jso
  161. try:
  162. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  163. # 将 pdf_intermediate_dict 解压
  164. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  165. markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
  166. jso["content_ocr"] = markdown_content
  167. logger.info(
  168. f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",
  169. file=sys.stderr,
  170. )
  171. # 把无用的信息清空
  172. jso["doc_layout_result"] = ""
  173. jso["pdf_intermediate_dict"] = ""
  174. jso["mid_json_ocr"] = pdf_intermediate_dict
  175. jso["pdf_meta"] = ""
  176. except Exception as e:
  177. jso = exception_handler(jso, e)
  178. return jso
  179. def ocr_pdf_intermediate_dict_to_standard_format(jso: dict, debug_mode=False) -> dict:
  180. if debug_mode:
  181. pass
  182. else: # 如果debug没开,则检测是否有needdrop字段
  183. if jso.get("need_drop", False):
  184. book_name = join_path(get_data_source(jso), jso["file_id"])
  185. logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
  186. jso["dropped"] = True
  187. return jso
  188. try:
  189. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  190. # 将 pdf_intermediate_dict 解压
  191. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  192. standard_format = ocr_mk_mm_standard_format(pdf_intermediate_dict)
  193. jso["content_list"] = standard_format
  194. logger.info(
  195. f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}",
  196. file=sys.stderr,
  197. )
  198. # 把无用的信息清空
  199. jso["doc_layout_result"] = ""
  200. jso["pdf_intermediate_dict"] = ""
  201. jso["pdf_meta"] = ""
  202. except Exception as e:
  203. jso = exception_handler(jso, e)
  204. return jso
  205. def ocr_pdf_intermediate_dict_to_standard_format_with_para(jso: dict, debug_mode=False) -> dict:
  206. if debug_mode:
  207. pass
  208. else: # 如果debug没开,则检测是否有needdrop字段
  209. if jso.get("need_drop", False):
  210. book_name = join_path(get_data_source(jso), jso["file_id"])
  211. logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
  212. jso["dropped"] = True
  213. return jso
  214. try:
  215. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  216. # 将 pdf_intermediate_dict 解压
  217. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  218. standard_format = make_standard_format_with_para(pdf_intermediate_dict)
  219. jso["content_list"] = standard_format
  220. logger.info(
  221. f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}",
  222. file=sys.stderr,
  223. )
  224. # 把无用的信息清空
  225. jso["doc_layout_result"] = ""
  226. jso["pdf_intermediate_dict"] = ""
  227. jso["pdf_meta"] = ""
  228. except Exception as e:
  229. jso = exception_handler(jso, e)
  230. return jso