|
|
@@ -411,3 +411,22 @@ def extract_all_text_blocks_fitz(
|
|
|
import traceback
|
|
|
logger.debug(traceback.format_exc())
|
|
|
return [], 0
|
|
|
+
|
|
|
+
|
|
|
def detect_page_type(
    pdf_doc: Any,
    page_idx: int,
    char_threshold: int = 50
) -> str:
    """Classify one page of a PDF as a text page or an image page.

    A simple character-density heuristic: the text blocks of the page are
    extracted with ``extract_all_text_blocks`` (at ``scale=1.0``) and the
    lengths of their ``'text'`` entries are summed.

    Args:
        pdf_doc: PDF document object, passed through unchanged to
            ``extract_all_text_blocks``.
        page_idx: Index of the page to inspect, passed through unchanged to
            ``extract_all_text_blocks``.
        char_threshold: Minimum total number of extracted characters for the
            page to count as a text page.

    Returns:
        ``'txt'`` when the total character count is at least
        ``char_threshold``; ``'ocr'`` otherwise, including when extraction
        or counting fails for any reason.
    """
    try:
        text_blocks, _ = extract_all_text_blocks(pdf_doc, page_idx, scale=1.0)
        # ``or ''`` keeps a block whose 'text' is present but None from
        # raising and discarding the characters counted in other blocks.
        total_chars = sum(len(block.get('text') or '') for block in text_blocks)

        return 'txt' if total_chars >= char_threshold else 'ocr'
    except Exception:
        # Best-effort fallback: an unreadable page is treated as an image
        # page. Only ``Exception`` is caught so that KeyboardInterrupt and
        # SystemExit still propagate; the traceback is kept at debug level
        # instead of being dropped silently.
        import logging

        logging.getLogger(__name__).debug(
            "detect_page_type: failed to inspect page %s; falling back to 'ocr'",
            page_idx,
            exc_info=True,
        )
        return 'ocr'
|