|
|
@@ -304,6 +304,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
|
|
|
# 开始正式逻辑
|
|
|
s3_pdf_path = jso.get("file_location")
|
|
|
s3_config = get_s3_config(s3_pdf_path)
|
|
|
+ pdf_bytes = read_file(s3_pdf_path, s3_config)
|
|
|
model_output_json_list = jso.get("doc_layout_result")
|
|
|
data_source = get_data_source(jso)
|
|
|
file_id = jso.get("file_id")
|
|
|
@@ -341,8 +342,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
|
|
|
file=sys.stderr,
|
|
|
)
|
|
|
pdf_info_dict = parse_pdf_by_model(
|
|
|
- s3_pdf_path,
|
|
|
- s3_config,
|
|
|
+ pdf_bytes,
|
|
|
model_output_json_list,
|
|
|
save_path,
|
|
|
book_name,
|
|
|
@@ -373,18 +373,6 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
|
|
|
return jso
|
|
|
|
|
|
|
|
|
-"""
|
|
|
-统一处理逻辑
|
|
|
-1.先调用parse_pdf对文本类pdf进行处理
|
|
|
-2.再调用ocr_dropped_parse_pdf,对之前drop的pdf进行处理
|
|
|
-"""
|
|
|
-
|
|
|
-
|
|
|
-def uni_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
|
|
|
- jso = parse_pdf(jso, start_page_id=start_page_id, debug_mode=debug_mode)
|
|
|
- jso = ocr_dropped_parse_pdf(jso, start_page_id=start_page_id, debug_mode=debug_mode)
|
|
|
- return jso
|
|
|
-
|
|
|
def parse_pdf_for_model_train(jso: dict, start_page_id=0, debug_mode=False) -> dict:
|
|
|
# 检测debug开关
|
|
|
if debug_mode:
|
|
|
@@ -465,6 +453,19 @@ def parse_pdf_for_model_train(jso: dict, start_page_id=0, debug_mode=False) -> d
|
|
|
return jso
|
|
|
|
|
|
|
|
|
+"""
|
|
|
+统一处理逻辑
|
|
|
+1.先调用parse_pdf对文本类pdf进行处理
|
|
|
+2.再调用ocr_dropped_parse_pdf,对之前drop的pdf进行处理
|
|
|
+"""
|
|
|
+
|
|
|
+
|
|
|
+def uni_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
|
|
|
+ jso = parse_pdf(jso, start_page_id=start_page_id, debug_mode=debug_mode)
|
|
|
+ jso = ocr_dropped_parse_pdf(jso, start_page_id=start_page_id, debug_mode=debug_mode)
|
|
|
+ return jso
|
|
|
+
|
|
|
+
|
|
|
# 专门用来跑被drop的pdf,跑完之后需要把need_drop字段置为false
|
|
|
def ocr_dropped_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
|
|
|
if not jso.get("need_drop", False):
|