|
|
@@ -23,17 +23,11 @@ from loguru import logger
|
|
|
|
|
|
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
|
|
|
from magic_pdf.pdf_parse_for_train import parse_pdf_for_train
|
|
|
+from magic_pdf.spark.base import exception_handler, get_data_source
|
|
|
from magic_pdf.train_utils.convert_to_train_format import convert_to_train_format
|
|
|
from app.common.s3 import get_s3_config, get_s3_client
|
|
|
|
|
|
|
|
|
-def exception_handler(jso: dict, e):
|
|
|
- logger.exception(e)
|
|
|
- jso["need_drop"] = True
|
|
|
- jso["drop_reason"] = DropReason.Exception
|
|
|
- jso["exception"] = f"ERROR: {e}"
|
|
|
- return jso
|
|
|
-
|
|
|
|
|
|
def get_data_type(jso: dict):
|
|
|
data_type = jso.get("data_type")
|
|
|
@@ -49,13 +43,6 @@ def get_bookid(jso: dict):
|
|
|
return book_id
|
|
|
|
|
|
|
|
|
-def get_data_source(jso: dict):
|
|
|
- data_source = jso.get("data_source")
|
|
|
- if data_source is None:
|
|
|
- data_source = jso.get("file_source")
|
|
|
- return data_source
|
|
|
-
|
|
|
-
|
|
|
def meta_scan(jso: dict, doc_layout_check=True) -> dict:
|
|
|
s3_pdf_path = jso.get("file_location")
|
|
|
s3_config = get_s3_config(s3_pdf_path)
|