|
@@ -0,0 +1,198 @@
|
|
|
|
|
+"""
|
|
|
|
|
+PDF 文档类型分类工具
|
|
|
|
|
+
|
|
|
|
|
+封装自 MinerU 项目 mineru/utils/pdf_classify.py,作为 ocr_platform 的自有实现。
|
|
|
|
|
+功能:判断 PDF 是否可直接提取文本(txt)或需要 OCR(ocr)。
|
|
|
|
|
+
|
|
|
|
|
+对外接口:
|
|
|
|
|
+ classify(pdf_bytes: bytes) -> str # 'txt' 或 'ocr'
|
|
|
|
|
+
|
|
|
|
|
+说明:
|
|
|
|
|
+ classify() 始终使用本模块的自有实现,以保留对 MinerU 原版的定制修改
|
|
|
|
|
+ (例如 avg_chars >= chars_threshold*4 时跳过图像覆盖率检测,避免含全页水印
|
|
|
|
|
+ 图的文字型 PDF 被误判为 'ocr')。
|
|
|
|
|
+
|
|
|
|
|
+ 内部 helper 函数(get_avg_cleaned_chars_per_page / get_high_image_coverage_ratio
|
|
|
|
|
+ / extract_pages / detect_invalid_chars)优先复用 MinerU 原版,供需要直接调用
|
|
|
|
|
+ helper 的场景使用;_USING_MINERU_HELPERS 标识当前是否使用 MinerU helpers。
|
|
|
|
|
+"""
|
|
|
|
|
+
|
|
|
|
|
+import re
|
|
|
|
|
+from io import BytesIO
|
|
|
|
|
+
|
|
|
|
|
+import numpy as np
|
|
|
|
|
+from loguru import logger
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+# ──────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
+# Helper 函数:优先复用 MinerU 原版(逻辑未修改,保持一致即可)
|
|
|
|
|
+# ──────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
+
|
|
|
|
|
try:
    # Prefer the upstream MinerU helpers when the package is installed; the
    # fallbacks below are intentionally kept logic-identical to upstream so
    # the two code paths behave the same and remain easy to diff.
    from mineru.utils.pdf_classify import (
        get_avg_cleaned_chars_per_page,
        get_high_image_coverage_ratio,
        extract_pages,
        detect_invalid_chars,
    )
    _USING_MINERU_HELPERS = True

except ImportError:
    _USING_MINERU_HELPERS = False

    def get_avg_cleaned_chars_per_page(pdf_doc, pages_to_check: int) -> float:
        """Return the average per-page character count over the first
        ``pages_to_check`` pages, after stripping all whitespace.

        Args:
            pdf_doc: An open ``pypdfium2.PdfDocument`` (indexable by page).
            pages_to_check: Number of leading pages to sample; the caller must
                ensure this is >= 1 and <= len(pdf_doc), otherwise this raises
                ZeroDivisionError / IndexError.

        Returns:
            Mean count of non-whitespace characters per sampled page.
        """
        cleaned_total = 0
        for i in range(pages_to_check):
            page = pdf_doc[i]
            text = page.get_textpage().get_text_bounded()
            # Strip every whitespace run so layout-only pages count as empty.
            cleaned_total += len(re.sub(r'\s+', '', text))
        return cleaned_total / pages_to_check

    def get_high_image_coverage_ratio(sample_pdf_bytes: bytes, pages_to_check: int) -> float:
        """Return the fraction of sampled pages whose image coverage is >= 80%.

        Walks each page's layout elements with pdfminer and sums the areas of
        ``LTImage``/``LTFigure`` elements against the page area.

        Args:
            sample_pdf_bytes: Raw bytes of the (sampled) PDF.
            pages_to_check: Maximum number of pages to inspect.

        Returns:
            Ratio in [0.0, 1.0]; 1.0 when the document forbids extraction
            (treated as fully image-covered so the caller falls back to OCR).
        """
        from pdfminer.pdfparser import PDFParser
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
        from pdfminer.layout import LAParams, LTImage, LTFigure
        from pdfminer.converter import PDFPageAggregator

        pdf_stream = BytesIO(sample_pdf_bytes)
        parser = PDFParser(pdf_stream)
        document = PDFDocument(parser)

        # Non-extractable PDFs are reported as 100% image coverage.
        # NOTE(review): this early return skips pdf_stream.close(); harmless
        # for an in-memory BytesIO, but kept as-is to match upstream MinerU.
        if not document.is_extractable:
            return 1.0

        rsrcmgr = PDFResourceManager()
        laparams = LAParams(
            line_overlap=0.5, char_margin=2.0, line_margin=0.5,
            word_margin=0.1, boxes_flow=None, detect_vertical=False, all_texts=False,
        )
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        high_coverage_pages = 0
        page_count = 0

        for page in PDFPage.create_pages(document):
            if page_count >= pages_to_check:
                break
            interpreter.process_page(page)
            layout = device.get_result()

            page_area = layout.width * layout.height
            # Overlapping images can push the raw sum past the page area,
            # hence the min(..., 1.0) clamp below.
            image_area = sum(
                el.width * el.height
                for el in layout
                if isinstance(el, (LTImage, LTFigure))
            )
            coverage = min(image_area / page_area, 1.0) if page_area > 0 else 0
            if coverage >= 0.8:
                high_coverage_pages += 1
            page_count += 1

        pdf_stream.close()
        return 0.0 if page_count == 0 else high_coverage_pages / page_count

    def extract_pages(src_pdf_bytes: bytes) -> bytes:
        """Randomly sample up to 10 pages from a PDF and return them as a new
        PDF (bytes).

        Args:
            src_pdf_bytes: Raw bytes of the source PDF.

        Returns:
            Bytes of a new PDF containing the sampled pages, or ``b''`` when
            the source is empty or the page import fails.
        """
        import pypdfium2 as pdfium

        pdf = pdfium.PdfDocument(src_pdf_bytes)
        total_page = len(pdf)
        if total_page == 0:
            logger.warning("PDF 为空,返回空文档")
            return b''

        # Sample without replacement so no page is counted twice.
        select_count = min(10, total_page)
        page_indices = np.random.choice(total_page, select_count, replace=False).tolist()

        sample_doc = pdfium.PdfDocument.new()
        # NOTE(review): sample_doc is never explicitly closed on either path;
        # kept as-is to stay byte-compatible with the upstream MinerU helper.
        try:
            sample_doc.import_pages(pdf, page_indices)
            pdf.close()
            buf = BytesIO()
            sample_doc.save(buf)
            return buf.getvalue()
        except Exception as e:
            pdf.close()
            logger.exception(e)
            return b''

    def detect_invalid_chars(sample_pdf_bytes: bytes) -> bool:
        """Detect garbled text: True when ``(cid:NNN)`` markers make up more
        than 5% of the extracted character stream.

        pdfminer emits ``(cid:NNN)`` placeholders when a glyph has no Unicode
        mapping; a high ratio means the text layer is unusable and OCR is
        needed.

        Args:
            sample_pdf_bytes: Raw bytes of the (sampled) PDF.

        Returns:
            True when the cid-marker ratio exceeds 0.05.
        """
        from pdfminer.high_level import extract_text
        from pdfminer.layout import LAParams

        laparams = LAParams(
            line_overlap=0.5, char_margin=2.0, line_margin=0.5,
            word_margin=0.1, boxes_flow=None, detect_vertical=False, all_texts=False,
        )
        text = extract_text(pdf_file=BytesIO(sample_pdf_bytes), laparams=laparams)
        text = text.replace('\n', '')

        cid_pattern = re.compile(r'\(cid:\d+\)')
        matches = cid_pattern.findall(text)
        cid_count = len(matches)
        cid_len = sum(len(m) for m in matches)
        text_len = len(text)

        if text_len == 0:
            return False
        # Denominator = visible chars with each cid marker collapsed to one
        # logical character ("radio" is upstream's spelling of "ratio").
        cid_radio = cid_count / (cid_count + text_len - cid_len)
        return cid_radio > 0.05
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+# ──────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
+# classify:始终使用自有实现(包含对 MinerU 原版的定制修改)
|
|
|
|
|
+# ──────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
+
|
|
|
|
|
def classify(pdf_bytes: bytes) -> str:
    """Decide whether a PDF's text can be extracted directly or needs OCR.

    Difference from the upstream MinerU implementation (upstream code is not
    modified): before checking image coverage, if the average characters per
    page is already >= ``chars_threshold * 4`` the document is treated as a
    definite text PDF and the coverage check is skipped. Typical case: a
    text-based bank-statement PDF with a full-page translucent watermark
    image — coverage is near 100% but every page has plenty of extractable
    text, so it must classify as 'txt', not 'ocr'.

    Args:
        pdf_bytes: Raw bytes of the PDF to classify.

    Returns:
        'txt' — text can be extracted directly.
        'ocr' — OCR is required (also returned for empty/unreadable input
        or on any internal error).
    """
    import pypdfium2 as pdfium

    sample_pdf_bytes = extract_pages(pdf_bytes)
    # extract_pages returns b'' for empty or broken PDFs; opening b'' with
    # pdfium would raise *outside* the try below, so guard it here instead
    # of letting the exception escape the function.
    if not sample_pdf_bytes:
        return 'ocr'

    pdf = None
    try:
        # Opened inside the try so an unreadable sample also falls through
        # to the 'ocr' error path instead of propagating.
        pdf = pdfium.PdfDocument(sample_pdf_bytes)
        page_count = len(pdf)
        if page_count == 0:
            return 'ocr'

        pages_to_check = min(page_count, 10)
        chars_threshold = 50

        # Too little text, or a garbled (cid:NNN-heavy) text layer → OCR.
        avg_chars = get_avg_cleaned_chars_per_page(pdf, pages_to_check)
        if avg_chars < chars_threshold or detect_invalid_chars(sample_pdf_bytes):
            return 'ocr'

        # Use image coverage only when the text volume is "borderline".
        # If text is already far above the threshold (>= 4x), treat it as a
        # definite text PDF and ignore background images such as watermarks.
        if avg_chars < chars_threshold * 4 and get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check) >= 0.8:
            return 'ocr'

        return 'txt'

    except Exception as e:
        logger.error(f"判断 PDF 类型时出错: {e}")
        return 'ocr'
    finally:
        # pdf stays None when the open itself failed.
        if pdf is not None:
            pdf.close()
|
|
|
|
|
+
|