Ver código fonte

classify后在jso根层级添加_pdf_type标识,同时取消对非文本类pdf的drop

赵小蒙 1 ano atrás
pai
commit
34bde6d8ec
1 arquivo alterado com 4 adições e 2 exclusões
  1. +4 -2
      magic_pdf/pipeline.py

+ 4 - 2
magic_pdf/pipeline.py

@@ -130,6 +130,7 @@ def classify_by_type(jso: dict, debug_mode=False) -> dict:
             classify_time = int(time.time() - start_time)  # 计算执行时间
             if is_text_pdf:
                 pdf_meta["is_text_pdf"] = is_text_pdf
+                jso["_pdf_type"] = "TXT"
                 jso["pdf_meta"] = pdf_meta
                 jso["classify_time"] = classify_time
                 # print(json.dumps(pdf_meta, ensure_ascii=False))
@@ -144,10 +145,11 @@ def classify_by_type(jso: dict, debug_mode=False) -> dict:
             else:
                 # 先不drop
                 pdf_meta["is_text_pdf"] = is_text_pdf
+                jso["_pdf_type"] = "OCR"
                 jso["pdf_meta"] = pdf_meta
                 jso["classify_time"] = classify_time
-                jso["need_drop"] = True
-                jso["drop_reason"] = DropReason.NOT_IS_TEXT_PDF
+                # jso["need_drop"] = True
+                # jso["drop_reason"] = DropReason.NOT_IS_TEXT_PDF
                 extra_info = {"classify_rules": []}
                 for condition, result in results.items():
                     if not result: