pipeline_txt.py 2.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
"""
Convert text-based PDFs into the unified cleaned format.
"""
# TODO: move this module into the spark/ directory
  5. from loguru import logger
  6. from magic_pdf.dict2md.mkcontent import mk_mm_markdown, mk_universal_format
  7. from magic_pdf.libs.commons import join_path
  8. from magic_pdf.libs.json_compressor import JsonCompressor
  9. from magic_pdf.spark import exception_handler, get_data_source
  10. def txt_pdf_to_standard_format(jso: dict, debug_mode=False) -> dict:
  11. """
  12. 变成统一的标准格式
  13. """
  14. if debug_mode:
  15. pass
  16. else: # 如果debug没开,则检测是否有needdrop字段
  17. if jso.get("need_drop", False):
  18. book_name = join_path(get_data_source(jso), jso["file_id"])
  19. logger.info(f"book_name is:{book_name} need drop")
  20. jso["dropped"] = True
  21. return jso
  22. try:
  23. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  24. # 将 pdf_intermediate_dict 解压
  25. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  26. standard_format = mk_universal_format(pdf_intermediate_dict)
  27. jso["content_list"] = standard_format
  28. logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}",)
  29. # 把无用的信息清空
  30. jso["doc_layout_result"] = ""
  31. jso["pdf_intermediate_dict"] = ""
  32. jso["pdf_meta"] = ""
  33. except Exception as e:
  34. jso = exception_handler(jso, e)
  35. return jso
  36. def txt_pdf_to_mm_markdown_format(jso: dict, debug_mode=False) -> dict:
  37. """
  38. 变成多模态的markdown格式
  39. """
  40. if debug_mode:
  41. pass
  42. else: # 如果debug没开,则检测是否有needdrop字段
  43. if jso.get("need_drop", False):
  44. book_name = join_path(get_data_source(jso), jso["file_id"])
  45. logger.info(f"book_name is:{book_name} need drop")
  46. jso["dropped"] = True
  47. return jso
  48. try:
  49. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  50. # 将 pdf_intermediate_dict 解压
  51. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  52. standard_format = mk_universal_format(pdf_intermediate_dict)
  53. mm_content = mk_mm_markdown(standard_format)
  54. jso["content_list"] = mm_content
  55. logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}",)
  56. # 把无用的信息清空
  57. jso["doc_layout_result"] = ""
  58. jso["pdf_intermediate_dict"] = ""
  59. jso["pdf_meta"] = ""
  60. except Exception as e:
  61. jso = exception_handler(jso, e)
  62. return jso