瀏覽代碼

Add fallback handling for when garbled_rate is too large

赵小蒙 1 年之前
父節點
當前提交
7d04ed6e78
共有 1 個文件被更改,包括 17 次插入 和 3 次删除
  1. 17 3
      magic_pdf/user_api.py

+ 17 - 3
magic_pdf/user_api.py

@@ -78,9 +78,23 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
             return None
 
     pdf_info_dict = parse_pdf(parse_pdf_by_txt)
-
-    if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
-        logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
+    text_all = ""
+    for page_dict in pdf_info_dict['pdf_info']:
+        for para_block in page_dict['para_blocks']:
+            if para_block['type'] in ['title', 'text']:
+                for line in para_block['lines']:
+                    for span in line['spans']:
+                        text_all += span['content']
+
+    def calculate_garbled_rate(text):  # returns the fraction of characters in `text` for which str.isprintable() is False
+        printable = sum(1 for c in text if c.isprintable())  # count of characters that report as printable
+        total = len(text)
+        if total == 0:
+            return 0  # avoid a division-by-zero error on empty text
+        return (total - printable) / total  # non-printable characters divided by total characters
+
+    if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False) or calculate_garbled_rate(text_all) < 0.5:
+        logger.warning(f"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr")
         pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
         if pdf_info_dict is None:
             raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")