pipeline_ocr.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182
  1. import sys
  2. from loguru import logger
  3. from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_mm_markdown, ocr_mk_nlp_markdown_with_para, \
  4. ocr_mk_mm_markdown_with_para_and_pagination, ocr_mk_mm_markdown_with_para, ocr_mk_mm_standard_format, \
  5. make_standard_format_with_para
  6. from magic_pdf.libs.commons import join_path
  7. from magic_pdf.libs.json_compressor import JsonCompressor
  8. from magic_pdf.spark.base import get_data_source, exception_handler
  9. def ocr_pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
  10. if debug_mode:
  11. pass
  12. else: # 如果debug没开,则检测是否有needdrop字段
  13. if jso.get("need_drop", False):
  14. book_name = join_path(get_data_source(jso), jso["file_id"])
  15. logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
  16. jso["dropped"] = True
  17. return jso
  18. try:
  19. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  20. # 将 pdf_intermediate_dict 解压
  21. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  22. markdown_content = ocr_mk_mm_markdown(pdf_intermediate_dict)
  23. jso["content"] = markdown_content
  24. logger.info(
  25. f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",
  26. file=sys.stderr,
  27. )
  28. # 把无用的信息清空
  29. jso["doc_layout_result"] = ""
  30. jso["pdf_intermediate_dict"] = ""
  31. jso["pdf_meta"] = ""
  32. except Exception as e:
  33. jso = exception_handler(jso, e)
  34. return jso
  35. def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False) -> dict:
  36. if debug_mode:
  37. pass
  38. else: # 如果debug没开,则检测是否有needdrop字段
  39. if jso.get("need_drop", False):
  40. book_name = join_path(get_data_source(jso), jso["file_id"])
  41. logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
  42. jso["dropped"] = True
  43. return jso
  44. try:
  45. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  46. # 将 pdf_intermediate_dict 解压
  47. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  48. # markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
  49. markdown_content = ocr_mk_nlp_markdown_with_para(pdf_intermediate_dict)
  50. jso["content"] = markdown_content
  51. logger.info(
  52. f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",
  53. file=sys.stderr,
  54. )
  55. # 把无用的信息清空
  56. jso["doc_layout_result"] = ""
  57. jso["pdf_intermediate_dict"] = ""
  58. jso["pdf_meta"] = ""
  59. except Exception as e:
  60. jso = exception_handler(jso, e)
  61. return jso
  62. def ocr_pdf_intermediate_dict_to_markdown_with_para_and_pagination(jso: dict, debug_mode=False) -> dict:
  63. if debug_mode:
  64. pass
  65. else: # 如果debug没开,则检测是否有needdrop字段
  66. if jso.get("need_drop", False):
  67. book_name = join_path(get_data_source(jso), jso["file_id"])
  68. logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
  69. jso["dropped"] = True
  70. return jso
  71. try:
  72. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  73. # 将 pdf_intermediate_dict 解压
  74. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  75. markdown_content = ocr_mk_mm_markdown_with_para_and_pagination(pdf_intermediate_dict)
  76. jso["content"] = markdown_content
  77. logger.info(
  78. f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",
  79. file=sys.stderr,
  80. )
  81. # 把无用的信息清空
  82. # jso["doc_layout_result"] = ""
  83. jso["pdf_intermediate_dict"] = ""
  84. # jso["pdf_meta"] = ""
  85. except Exception as e:
  86. jso = exception_handler(jso, e)
  87. return jso
  88. def ocr_pdf_intermediate_dict_to_markdown_with_para_for_qa(
  89. jso: dict, debug_mode=False
  90. ) -> dict:
  91. if debug_mode:
  92. pass
  93. else: # 如果debug没开,则检测是否有needdrop字段
  94. if jso.get("need_drop", False):
  95. book_name = join_path(get_data_source(jso), jso["file_id"])
  96. logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
  97. jso["dropped"] = True
  98. return jso
  99. try:
  100. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  101. # 将 pdf_intermediate_dict 解压
  102. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  103. markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
  104. jso["content_ocr"] = markdown_content
  105. logger.info(
  106. f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",
  107. file=sys.stderr,
  108. )
  109. # 把无用的信息清空
  110. jso["doc_layout_result"] = ""
  111. jso["pdf_intermediate_dict"] = ""
  112. jso["mid_json_ocr"] = pdf_intermediate_dict
  113. jso["pdf_meta"] = ""
  114. except Exception as e:
  115. jso = exception_handler(jso, e)
  116. return jso
  117. def ocr_pdf_intermediate_dict_to_standard_format(jso: dict, debug_mode=False) -> dict:
  118. if debug_mode:
  119. pass
  120. else: # 如果debug没开,则检测是否有needdrop字段
  121. if jso.get("need_drop", False):
  122. book_name = join_path(get_data_source(jso), jso["file_id"])
  123. logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
  124. jso["dropped"] = True
  125. return jso
  126. try:
  127. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  128. # 将 pdf_intermediate_dict 解压
  129. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  130. standard_format = ocr_mk_mm_standard_format(pdf_intermediate_dict)
  131. jso["content_list"] = standard_format
  132. logger.info(
  133. f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}",
  134. file=sys.stderr,
  135. )
  136. # 把无用的信息清空
  137. jso["doc_layout_result"] = ""
  138. jso["pdf_intermediate_dict"] = ""
  139. jso["pdf_meta"] = ""
  140. except Exception as e:
  141. jso = exception_handler(jso, e)
  142. return jso
  143. def ocr_pdf_intermediate_dict_to_standard_format_with_para(jso: dict, debug_mode=False) -> dict:
  144. if debug_mode:
  145. pass
  146. else: # 如果debug没开,则检测是否有needdrop字段
  147. if jso.get("need_drop", False):
  148. book_name = join_path(get_data_source(jso), jso["file_id"])
  149. logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
  150. jso["dropped"] = True
  151. return jso
  152. try:
  153. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  154. # 将 pdf_intermediate_dict 解压
  155. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  156. standard_format = make_standard_format_with_para(pdf_intermediate_dict)
  157. jso["content_list"] = standard_format
  158. logger.info(
  159. f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}",
  160. file=sys.stderr,
  161. )
  162. # 把无用的信息清空
  163. jso["doc_layout_result"] = ""
  164. jso["pdf_intermediate_dict"] = ""
  165. jso["pdf_meta"] = ""
  166. except Exception as e:
  167. jso = exception_handler(jso, e)
  168. return jso