Просмотр исходного кода

feat: 添加 detect_page_type 方法以检测 PDF 页面类型(文本或扫描OCR)

zhch158_admin 1 день назад
Родитель
Коммит
c11f2ea045
1 изменённый файл: 15 добавлений и 2 удаления
  1. 15 2
      ocr_utils/pdf_utils.py

+ 15 - 2
ocr_utils/pdf_utils.py

@@ -41,6 +41,7 @@ from .pdf_text_extraction import (
     extract_all_text_blocks,
     extract_all_text_blocks_pypdfium2,
     extract_all_text_blocks_fitz,
+    detect_page_type,
 )
 
 from .pdf_image_rendering import (
@@ -371,8 +372,20 @@ class PDFUtils:
             pdf_bytes, dpi, start_page_id, end_page_id, image_type
         )
     
-    # ========================================================================
-    # 其他功能
+    @staticmethod
+    def detect_page_type(
+        pdf_doc: Any, 
+        page_idx: int,
+        char_threshold: int = 50
+    ) -> str:
+        """
+        Detect the type of a PDF page (text PDF or scanned page requiring OCR).
+
+        Thin wrapper: forwards all three arguments positionally, unchanged, to
+        the ``detect_page_type`` function imported from ``.pdf_text_extraction``
+        and returns its result.
+
+        Args:
+            pdf_doc: PDF document object, passed through as-is to the helper.
+            page_idx: Index of the page to inspect (presumably 0-based;
+                confirm against the helper).
+            char_threshold: Threshold passed through to the helper
+                (default 50). Presumably the character count that separates
+                a text page from a scanned one; confirm against the helper.
+
+        Returns:
+            Page type: 'txt' or 'ocr'.
+        """
+        return detect_page_type(pdf_doc, page_idx, char_threshold)
+
     # ========================================================================
     # 其他功能
     # ========================================================================