Explorar el Código

feat: 添加 detect_page_type 函数以检测 PDF 页面的类型(文字页或图片页)

zhch158_admin hace 3 días
padre
commit
f2079d9e4f
Se ha modificado 1 fichero con 19 adiciones y 0 borrados
  1. 19 0
      ocr_utils/pdf_text_extraction.py

+ 19 - 0
ocr_utils/pdf_text_extraction.py

@@ -411,3 +411,22 @@ def extract_all_text_blocks_fitz(
         import traceback
         logger.debug(traceback.format_exc())
         return [], 0
+
+
+def detect_page_type(
+    pdf_doc: Any,
+    page_idx: int,
+    char_threshold: int = 50
+) -> str:
+    """Classify a PDF page as text ('txt') or image ('ocr') by character count.
+
+    Returns 'txt' when the text blocks extracted at scale=1.0 hold at least
+    ``char_threshold`` characters, else 'ocr'; extraction errors yield 'ocr'.
+    """
+    try:
+        text_blocks, _ = extract_all_text_blocks(pdf_doc, page_idx, scale=1.0)
+        total_chars = sum(len(block.get('text', '')) for block in text_blocks)
+        return 'txt' if total_chars >= char_threshold else 'ocr'
+    except Exception as exc:  # not a bare except: let Ctrl-C/SystemExit propagate
+        logger.debug("detect_page_type failed for page %s: %s", page_idx, exc)
+        return 'ocr'