Przeglądaj źródła

feat(新增文字型PDF水印预处理): 在文档加载过程中添加文字型PDF水印去除功能,提升文档可搜索性

zhch158_admin 1 tydzień temu
rodzic
commit
9a0943bdd3

+ 18 - 1
ocr_tools/universal_doc_parser/core/pipeline_manager_v2.py

@@ -50,6 +50,8 @@ except ImportError:
     from layout_utils import LayoutUtils, SpanMatcher
     from element_processors import ElementProcessors
 
+from ocr_utils.watermark_utils import scan_pdf_watermark_xobjs, remove_txt_pdf_watermark
+
 # 从 ocr_tools.ocr_merger 导入 merger 组件
 try:
     from ocr_tools.ocr_merger import TableCellMatcher, TextMatcher
@@ -286,8 +288,23 @@ class EnhancedDocPipeline:
         try:
             # 1. 加载文档并分类
             dpi = self.config.get('input', {}).get('dpi', 200)
+
+            # 1a. 文字型 PDF 水印预处理(在渲染前去除,保留文字可搜索性)
+            _pdf_bytes_override: Optional[bytes] = None
+            if is_pdf:
+                wm_cfg = self.config.get('input', {}).get('txt_pdf_watermark_removal', {})
+                if wm_cfg.get('enabled', False):
+                    _raw = doc_path.read_bytes()
+                    if scan_pdf_watermark_xobjs(_raw, sample_pages=wm_cfg.get('sample_pages', 3)):
+                        _cleaned = remove_txt_pdf_watermark(_raw)
+                        if _cleaned is not None:
+                            _pdf_bytes_override = _cleaned
+                            logger.info(f"🧹 文字型 PDF 原生去水印完成({doc_path.name})")
+                        else:
+                            logger.debug(f"ℹ️ txt PDF 水印扫描命中但去除返回 None,跳过({doc_path.name})")
+
             images, pdf_type, pdf_doc, renderer_used = PDFUtils.load_and_classify_document(
-                doc_path, dpi=dpi, page_range=page_range
+                doc_path, dpi=dpi, page_range=page_range, pdf_bytes=_pdf_bytes_override
             )
             results['metadata']['pdf_type'] = pdf_type
             results['metadata']['page_count'] = len(images)