Kaynağa Gözat

feat(新增文字型PDF水印去除预处理): 在文档处理流程中添加文字型PDF水印去除功能,提升文档可搜索性

zhch158_admin 1 hafta önce
ebeveyn
işleme
6514d7f3fd

+ 17 - 1
ocr_tools/universal_doc_parser/core/pipeline_manager_v2_streaming.py

@@ -29,6 +29,7 @@ if str(module_root) not in sys.path:
 # 导入基础类(复用现有实现)
 from .pipeline_manager_v2 import EnhancedDocPipeline
 from ocr_utils import PDFUtils
+from ocr_utils.watermark_utils import scan_pdf_watermark_xobjs, remove_txt_pdf_watermark
 
 # 从 ocr_utils 导入输出格式化器
 try:
@@ -125,8 +126,23 @@ class StreamingDocPipeline(EnhancedDocPipeline):
         try:
             # 1. 加载文档并分类
             dpi = self.config.get('input', {}).get('dpi', 200)
+
+            # 1a. 文字型 PDF 水印预处理(在渲染前去除,保留文字可搜索性)
+            _pdf_bytes_override: Optional[bytes] = None
+            if is_pdf:
+                wm_cfg = self.config.get('input', {}).get('txt_pdf_watermark_removal', {})
+                if wm_cfg.get('enabled', False):
+                    _raw = doc_path.read_bytes()
+                    if scan_pdf_watermark_xobjs(_raw, sample_pages=wm_cfg.get('sample_pages', 3)):
+                        _cleaned = remove_txt_pdf_watermark(_raw)
+                        if _cleaned is not None:
+                            _pdf_bytes_override = _cleaned
+                            logger.info(f"🧹 文字型 PDF 原生去水印完成({doc_path.name})")
+                        else:
+                            logger.debug(f"ℹ️ txt PDF 水印扫描命中但去除返回 None,跳过({doc_path.name})")
+
             images, pdf_type, pdf_doc, renderer_used = PDFUtils.load_and_classify_document(
-                doc_path, dpi=dpi, page_range=page_range
+                doc_path, dpi=dpi, page_range=page_range, pdf_bytes=_pdf_bytes_override
             )
             
             results_summary['metadata']['pdf_type'] = pdf_type