|
|
@@ -50,6 +50,8 @@ except ImportError:
|
|
|
from layout_utils import LayoutUtils, SpanMatcher
|
|
|
from element_processors import ElementProcessors
|
|
|
|
|
|
+from ocr_utils.watermark_utils import scan_pdf_watermark_xobjs, remove_txt_pdf_watermark
|
|
|
+
|
|
|
# 从 ocr_tools.ocr_merger 导入 merger 组件
|
|
|
try:
|
|
|
from ocr_tools.ocr_merger import TableCellMatcher, TextMatcher
|
|
|
@@ -286,8 +288,23 @@ class EnhancedDocPipeline:
|
|
|
try:
|
|
|
# 1. 加载文档并分类
|
|
|
dpi = self.config.get('input', {}).get('dpi', 200)
|
|
|
+
|
|
|
+ # 1a. 文字型 PDF 水印预处理(在渲染前去除,保留文字可搜索性)
|
|
|
+ _pdf_bytes_override: Optional[bytes] = None
|
|
|
+ if is_pdf:
|
|
|
+ wm_cfg = self.config.get('input', {}).get('txt_pdf_watermark_removal', {})
|
|
|
+ if wm_cfg.get('enabled', False):
|
|
|
+ _raw = doc_path.read_bytes()
|
|
|
+ if scan_pdf_watermark_xobjs(_raw, sample_pages=wm_cfg.get('sample_pages', 3)):
|
|
|
+ _cleaned = remove_txt_pdf_watermark(_raw)
|
|
|
+ if _cleaned is not None:
|
|
|
+ _pdf_bytes_override = _cleaned
|
|
|
+ logger.info(f"🧹 文字型 PDF 原生去水印完成({doc_path.name})")
|
|
|
+ else:
|
|
|
+ logger.debug(f"ℹ️ txt PDF 水印扫描命中但去除返回 None,跳过({doc_path.name})")
|
|
|
+
|
|
|
images, pdf_type, pdf_doc, renderer_used = PDFUtils.load_and_classify_document(
|
|
|
- doc_path, dpi=dpi, page_range=page_range
|
|
|
+ doc_path, dpi=dpi, page_range=page_range, pdf_bytes=_pdf_bytes_override
|
|
|
)
|
|
|
results['metadata']['pdf_type'] = pdf_type
|
|
|
results['metadata']['page_count'] = len(images)
|