spark_api.py 1.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051
  1. from loguru import logger
  2. from magic_pdf.libs.drop_reason import DropReason
  3. def get_data_source(jso: dict):
  4. data_source = jso.get("data_source")
  5. if data_source is None:
  6. data_source = jso.get("file_source")
  7. return data_source
  8. def get_data_type(jso: dict):
  9. data_type = jso.get("data_type")
  10. if data_type is None:
  11. data_type = jso.get("file_type")
  12. return data_type
  13. def get_bookid(jso: dict):
  14. book_id = jso.get("bookid")
  15. if book_id is None:
  16. book_id = jso.get("original_file_id")
  17. return book_id
  18. def exception_handler(jso: dict, e):
  19. logger.exception(e)
  20. jso["_need_drop"] = True
  21. jso["_drop_reason"] = DropReason.Exception
  22. jso["_exception"] = f"ERROR: {e}"
  23. return jso
  24. def get_bookname(jso: dict):
  25. data_source = get_data_source(jso)
  26. file_id = jso.get("file_id")
  27. book_name = f"{data_source}/{file_id}"
  28. return book_name
  29. def spark_json_extractor(jso: dict) -> dict:
  30. """
  31. 从json中提取数据,返回一个dict
  32. """
  33. return {
  34. "_pdf_type": jso["_pdf_type"],
  35. "model_list": jso["doc_layout_result"],
  36. }