| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051 |
- from loguru import logger
- from magic_pdf.libs.drop_reason import DropReason
def get_data_source(jso: dict):
    """Return the data source recorded on *jso*.

    The ``"data_source"`` key is read first. When it is absent or ``None``,
    the value of ``"file_source"`` is returned instead, which is ``None``
    if that key is missing as well.
    """
    primary = jso.get("data_source")
    return primary if primary is not None else jso.get("file_source")
def get_data_type(jso: dict):
    """Return the data type recorded on *jso*.

    Prefers the ``"data_type"`` key. Only when that key is absent or
    ``None`` is ``"file_type"`` consulted; ``None`` is returned if neither
    key yields a value.
    """
    declared = jso.get("data_type")
    if declared is not None:
        return declared
    return jso.get("file_type")
def get_bookid(jso: dict):
    """Return the book identifier recorded on *jso*.

    Uses the ``"bookid"`` key when it holds a non-``None`` value; otherwise
    falls back to ``"original_file_id"``. Returns ``None`` if both are
    missing.
    """
    identifier = jso.get("bookid")
    return jso.get("original_file_id") if identifier is None else identifier
def exception_handler(jso: dict, e):
    """Log *e* and flag *jso* as a record to be dropped.

    The exception is logged via ``logger.exception``. The record is then
    mutated in place: ``"_need_drop"`` is set to ``True``, ``"_drop_reason"``
    to ``DropReason.Exception``, and ``"_exception"`` to an ``"ERROR: ..."``
    message built from *e*. The same dict object is returned.
    """
    logger.exception(e)
    jso.update(
        {
            "_need_drop": True,
            "_drop_reason": DropReason.Exception,
            "_exception": f"ERROR: {e}",
        }
    )
    return jso
def get_bookname(jso: dict):
    """Return the book name for *jso*, formatted as ``<data source>/<file_id>``.

    The data source comes from ``get_data_source`` and the file id from the
    ``"file_id"`` key. A missing part is rendered as the text ``None``
    because the values are interpolated without further checks.
    """
    source = get_data_source(jso)
    identifier = jso.get("file_id")
    return f"{source}/{identifier}"
def spark_json_extractor(jso: dict) -> dict:
    """
    Extract data from the JSON record and return it as a dict.

    The result has two entries: ``"_pdf_type"``, copied from the same key of
    *jso*, and ``"model_list"``, taken from ``jso["doc_layout_result"]``.
    Both source keys are required; a missing one raises ``KeyError``.
    """
    pdf_type = jso["_pdf_type"]
    layout_result = jso["doc_layout_result"]
    return {"_pdf_type": pdf_type, "model_list": layout_result}
|