
After the OCR pass there is no need to check need_drop again, and ocr_dropped_parse_pdf must set need_drop back to False once it has run.
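
A minimal sketch, not taken from magic_pdf, of the need_drop contract this change enforces: later pipeline stages key off the flag, so the OCR fallback must clear it once it has filled pdf_intermediate_dict, otherwise the record stays skipped even though a usable result now exists. convert_stage and ocr_fallback below are hypothetical stand-ins for the real stages.

# Hypothetical stand-ins illustrating the need_drop contract (not repository code).
def convert_stage(jso: dict) -> dict:
    # A downstream stage skips anything still flagged as dropped.
    if jso.get('need_drop', False):
        return jso
    jso['converted'] = True  # stand-in for the real conversion work
    return jso

def ocr_fallback(jso: dict) -> dict:
    # Mirrors the new ocr_dropped_parse_pdf: reparse via OCR, then clear the flag.
    if not jso.get('need_drop', False):
        return jso
    jso['pdf_intermediate_dict'] = '<compressed ocr result>'  # placeholder for ocr_parse_pdf_core output
    jso['need_drop'] = False
    return jso

record = {'need_drop': True, 'drop_reason': 'text layer unusable'}
record = convert_stage(ocr_fallback(record))
print(record['need_drop'], record.get('converted'))  # False True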

赵小蒙, 1 year ago
Commit f5b9cff4c4
1 file changed, 27 insertions(+), 68 deletions(-)

magic_pdf/pipeline.py (+27 -68)

@@ -313,6 +313,25 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
             jso = exception_handler(jso, e)
     return jso
 
+'''
+Unified processing logic:
+1. First call parse_pdf to handle text-based PDFs.
+2. Then call ocr_dropped_parse_pdf to handle the PDFs that were previously dropped.
+'''
+def uni_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
+    jso = parse_pdf(jso, start_page_id=start_page_id, debug_mode=debug_mode)
+    jso = ocr_dropped_parse_pdf(jso, start_page_id=start_page_id, debug_mode=debug_mode)
+    return jso
+
+# Runs specifically on the PDFs that were dropped; once finished, the need_drop field must be reset to False
+def ocr_dropped_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
+    if not jso.get('need_drop', False):
+        return jso
+    else:
+        jso = ocr_parse_pdf_core(jso, start_page_id=start_page_id, debug_mode=debug_mode)
+        jso['need_drop'] = False
+        return jso
+
 
 def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
     # Check the debug switch
@@ -322,6 +341,11 @@ def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
         if jso.get('need_drop', False):
             return jso
 
+    jso = ocr_parse_pdf_core(jso, start_page_id=start_page_id, debug_mode=debug_mode)
+    return jso
+
+
+def ocr_parse_pdf_core(jso: dict, start_page_id=0, debug_mode=False) -> dict:
     s3_pdf_path = jso.get('file_location')
     s3_config = get_s3_config(s3_pdf_path)
     model_output_json_list = jso.get('doc_layout_result')
@@ -345,17 +369,12 @@ def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
             start_page_id=start_page_id,
             debug_mode=debug_mode
         )
-        if pdf_info_dict.get('need_drop', False):  # If the returned dict has need_drop, extract drop_reason and skip this parse
-            jso['need_drop'] = True
-            jso['drop_reason'] = pdf_info_dict["drop_reason"]
-        else:  # Normal return: compress pdf_info_dict and store it
-            pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
-            jso['pdf_intermediate_dict'] = pdf_info_dict
+        pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
+        jso['pdf_intermediate_dict'] = pdf_info_dict
         end_time = time.time()  # record the finish time
         parse_time = int(end_time - start_time)  # compute elapsed time
         # after parsing, log book_name and the elapsed time
-        logger.info(f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}",
-                    file=sys.stderr)
+        logger.info(f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}", file=sys.stderr)
         jso['parse_time'] = parse_time
     except Exception as e:
         jso = exception_handler(jso, e)
@@ -414,65 +433,5 @@ def ocr_pdf_intermediate_dict_to_standard_format(jso: dict, debug_mode=False) ->
     return jso
 
 
-'''
-Unified processing logic:
-1. First call parse_pdf to handle text-based PDFs.
-2. Then call ocr_dropped_parse_pdf to handle the PDFs that were previously dropped.
-'''
-def uni_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
-    jso = parse_pdf(jso, start_page_id=start_page_id, debug_mode=debug_mode)
-    jso = ocr_dropped_parse_pdf(jso, start_page_id=start_page_id, debug_mode=debug_mode)
-    return jso
-
-
-def ocr_dropped_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
-    # Check the debug switch
-    if debug_mode:
-        pass
-    else:  # If debug is off, check whether the need_drop field is set
-        if not jso.get('need_drop', False):
-            return jso
-        else:
-            s3_pdf_path = jso.get('file_location')
-            s3_config = get_s3_config(s3_pdf_path)
-            model_output_json_list = jso.get('doc_layout_result')
-            data_source = get_data_source(jso)
-            file_id = jso.get('file_id')
-            book_name = f"{data_source}/{file_id}"
-            try:
-                save_path = s3_image_save_path
-                image_s3_config = get_s3_config(save_path)
-                start_time = time.time()  # record the start time
-                # log book_name and the parse start time first
-                logger.info(f"book_name is:{book_name},start_time is:{formatted_time(start_time)}", file=sys.stderr)
-                pdf_info_dict = parse_pdf_by_ocr(
-                    s3_pdf_path,
-                    s3_config,
-                    model_output_json_list,
-                    save_path,
-                    book_name,
-                    pdf_model_profile=None,
-                    image_s3_config=image_s3_config,
-                    start_page_id=start_page_id,
-                    debug_mode=debug_mode
-                )
-                if pdf_info_dict.get('need_drop', False):  # If the returned dict has need_drop, extract drop_reason and skip this parse
-                    jso['need_drop'] = True
-                    jso['drop_reason'] = pdf_info_dict["drop_reason"]
-                else:  # Normal return: compress pdf_info_dict and store it
-                    pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
-                    jso['pdf_intermediate_dict'] = pdf_info_dict
-                end_time = time.time()  # record the finish time
-                parse_time = int(end_time - start_time)  # compute elapsed time
-                # after parsing, log book_name and the elapsed time
-                logger.info(
-                    f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}",
-                    file=sys.stderr)
-                jso['parse_time'] = parse_time
-            except Exception as e:
-                jso = exception_handler(jso, e)
-                return jso
-
-
 if __name__ == "__main__":
     pass
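
For orientation, a hedged sketch of how the new unified entry point might be driven. The import path follows magic_pdf/pipeline.py from this diff; every field value is a placeholder, and the real record carries more job metadata (S3 credentials, layout-model output per page) than shown here.

# Hypothetical driver for the uni_parse_pdf entry point added in this commit.
# All field values below are placeholders, not real job data.
from magic_pdf.pipeline import uni_parse_pdf

jso = {
    'file_location': 's3://some-bucket/path/to/example.pdf',  # placeholder S3 path
    'doc_layout_result': [],                                   # per-page layout model output
    'file_id': 'example-file-id',                              # placeholder id
}

jso = uni_parse_pdf(jso, start_page_id=0, debug_mode=True)

# parse_pdf runs first; if it flagged the record with need_drop, the OCR
# fallback reparses it and clears the flag before returning.
print(jso.get('need_drop', False), 'pdf_intermediate_dict' in jso)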