|
|
@@ -41,6 +41,7 @@ from .pdf_text_extraction import (
|
|
|
extract_all_text_blocks,
|
|
|
extract_all_text_blocks_pypdfium2,
|
|
|
extract_all_text_blocks_fitz,
|
|
|
+ detect_page_type,
|
|
|
)
|
|
|
|
|
|
from .pdf_image_rendering import (
|
|
|
@@ -371,8 +372,20 @@ class PDFUtils:
|
|
|
pdf_bytes, dpi, start_page_id, end_page_id, image_type
|
|
|
)
|
|
|
|
|
|
- # ========================================================================
|
|
|
- # 其他功能
|
|
|
+ @staticmethod
|
|
|
+ def detect_page_type(
|
|
|
+ pdf_doc: Any,
|
|
|
+ page_idx: int,
|
|
|
+ char_threshold: int = 50
|
|
|
+ ) -> str:
|
|
|
+ """
|
|
|
+ Detect the page type (text PDF or scanned/OCR).
|
|
|
+
|
|
|
+ Thin wrapper: forwards all three arguments, unchanged and positionally,
|
|
|
+ to the detect_page_type function imported from .pdf_text_extraction.
|
|
|
+
|
|
|
+ Args:
|
|
|
+ pdf_doc: Opened PDF document object, passed through as-is
|
|
|
+ (NOTE(review): concrete type depends on the backend - confirm).
|
|
|
+ page_idx: Index of the page to inspect
|
|
|
+ (NOTE(review): presumably 0-based - confirm against the delegate).
|
|
|
+ char_threshold: Defaults to 50; presumably a character-count cutoff
|
|
|
+ used by the delegate to decide between the two types - TODO confirm.
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ Page type: 'txt' or 'ocr'
|
|
|
+ """
|
|
|
+ # The bare name below resolves to the module-level import, not to this
|
|
|
+ # method: class scope is not searched from inside a method body.
|
|
|
+ return detect_page_type(pdf_doc, page_idx, char_threshold)
|
|
|
+
|
|
|
# ========================================================================
|
|
|
# 其他功能
|
|
|
# ========================================================================
|