pipeline_txt.py 1.3 KB

12345678910111213141516171819202122232425262728293031323334353637
  1. """
  2. 文本型pdf转化为统一清洗格式
  3. """
  4. from loguru import logger
  5. from magic_pdf.dict2md.mkcontent import mk_universal_format
  6. from magic_pdf.libs.commons import join_path
  7. from magic_pdf.libs.json_compressor import JsonCompressor
  8. from magic_pdf.spark.base import exception_handler, get_data_source
  9. def txt_pdf_to_standard_format(jso: dict, debug_mode=False) -> dict:
  10. if debug_mode:
  11. pass
  12. else: # 如果debug没开,则检测是否有needdrop字段
  13. if jso.get("need_drop", False):
  14. book_name = join_path(get_data_source(jso), jso["file_id"])
  15. logger.info(f"book_name is:{book_name} need drop")
  16. jso["dropped"] = True
  17. return jso
  18. try:
  19. pdf_intermediate_dict = jso["pdf_intermediate_dict"]
  20. # 将 pdf_intermediate_dict 解压
  21. pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
  22. standard_format = mk_universal_format(pdf_intermediate_dict)
  23. jso["content_list"] = standard_format
  24. logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}",)
  25. # 把无用的信息清空
  26. jso["doc_layout_result"] = ""
  27. jso["pdf_intermediate_dict"] = ""
  28. jso["pdf_meta"] = ""
  29. except Exception as e:
  30. jso = exception_handler(jso, e)
  31. return jso