|
@@ -0,0 +1,198 @@
|
|
|
|
|
+"""
|
|
|
|
|
+PDF 文档类型分类工具
|
|
|
|
|
+
|
|
|
|
|
+封装自 MinerU 项目 mineru/utils/pdf_classify.py,作为 ocr_platform 的自有实现。
|
|
|
|
|
+功能:判断 PDF 是否可直接提取文本(txt)或需要 OCR(ocr)。
|
|
|
|
|
+
|
|
|
|
|
+对外接口:
|
|
|
|
|
+ classify(pdf_bytes: bytes) -> str # 'txt' 或 'ocr'
|
|
|
|
|
+
|
|
|
|
|
+说明:
|
|
|
|
|
+ classify() 始终使用本模块的自有实现,以保留对 MinerU 原版的定制修改
|
|
|
|
|
+ (例如 avg_chars >= chars_threshold*4 时跳过图像覆盖率检测,避免含全页水印
|
|
|
|
|
+ 图的文字型 PDF 被误判为 'ocr')。
|
|
|
|
|
+
|
|
|
|
|
+ 内部 helper 函数(get_avg_cleaned_chars_per_page / get_high_image_coverage_ratio
|
|
|
|
|
+ / extract_pages / detect_invalid_chars)优先复用 MinerU 原版,供需要直接调用
|
|
|
|
|
+ helper 的场景使用;_USING_MINERU_HELPERS 标识当前是否使用 MinerU helpers。
|
|
|
|
|
+"""
|
|
|
|
|
+
|
|
|
|
|
+import re
|
|
|
|
|
+from io import BytesIO
|
|
|
|
|
+
|
|
|
|
|
+import numpy as np
|
|
|
|
|
+from loguru import logger
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+# ──────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
+# Helper 函数:优先复用 MinerU 原版(逻辑未修改,保持一致即可)
|
|
|
|
|
+# ──────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
+
|
|
|
|
|
try:
    # Prefer the upstream MinerU helpers when the package is installed; the
    # fallbacks below are intentionally kept logic-identical to upstream so
    # the two code paths behave the same and remain easy to diff.
    from mineru.utils.pdf_classify import (
        get_avg_cleaned_chars_per_page,
        get_high_image_coverage_ratio,
        extract_pages,
        detect_invalid_chars,
    )
    _USING_MINERU_HELPERS = True

except ImportError:
    _USING_MINERU_HELPERS = False

    def get_avg_cleaned_chars_per_page(pdf_doc, pages_to_check: int) -> float:
        """Return the average per-page character count over the first
        ``pages_to_check`` pages, after stripping all whitespace.

        Args:
            pdf_doc: An open ``pypdfium2.PdfDocument`` (indexable by page).
            pages_to_check: Number of leading pages to sample; the caller must
                ensure this is >= 1 and <= len(pdf_doc), otherwise this raises
                ZeroDivisionError / IndexError.

        Returns:
            Mean count of non-whitespace characters per sampled page.
        """
        cleaned_total = 0
        for i in range(pages_to_check):
            page = pdf_doc[i]
            text = page.get_textpage().get_text_bounded()
            # Strip every whitespace run so layout-only pages count as empty.
            cleaned_total += len(re.sub(r'\s+', '', text))
        return cleaned_total / pages_to_check

    def get_high_image_coverage_ratio(sample_pdf_bytes: bytes, pages_to_check: int) -> float:
        """Return the fraction of sampled pages whose image coverage is >= 80%.

        Walks each page's layout elements with pdfminer and sums the areas of
        ``LTImage``/``LTFigure`` elements against the page area.

        Args:
            sample_pdf_bytes: Raw bytes of the (sampled) PDF.
            pages_to_check: Maximum number of pages to inspect.

        Returns:
            Ratio in [0.0, 1.0]; 1.0 when the document forbids extraction
            (treated as fully image-covered so the caller falls back to OCR).
        """
        from pdfminer.pdfparser import PDFParser
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
        from pdfminer.layout import LAParams, LTImage, LTFigure
        from pdfminer.converter import PDFPageAggregator

        pdf_stream = BytesIO(sample_pdf_bytes)
        parser = PDFParser(pdf_stream)
        document = PDFDocument(parser)

        # Non-extractable PDFs are reported as 100% image coverage.
        # NOTE(review): this early return skips pdf_stream.close(); harmless
        # for an in-memory BytesIO, but kept as-is to match upstream MinerU.
        if not document.is_extractable:
            return 1.0

        rsrcmgr = PDFResourceManager()
        laparams = LAParams(
            line_overlap=0.5, char_margin=2.0, line_margin=0.5,
            word_margin=0.1, boxes_flow=None, detect_vertical=False, all_texts=False,
        )
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        high_coverage_pages = 0
        page_count = 0

        for page in PDFPage.create_pages(document):
            if page_count >= pages_to_check:
                break
            interpreter.process_page(page)
            layout = device.get_result()

            page_area = layout.width * layout.height
            # Overlapping images can push the raw sum past the page area,
            # hence the min(..., 1.0) clamp below.
            image_area = sum(
                el.width * el.height
                for el in layout
                if isinstance(el, (LTImage, LTFigure))
            )
            coverage = min(image_area / page_area, 1.0) if page_area > 0 else 0
            if coverage >= 0.8:
                high_coverage_pages += 1
            page_count += 1

        pdf_stream.close()
        return 0.0 if page_count == 0 else high_coverage_pages / page_count

    def extract_pages(src_pdf_bytes: bytes) -> bytes:
        """Randomly sample up to 10 pages from a PDF and return them as a new
        PDF (bytes).

        Args:
            src_pdf_bytes: Raw bytes of the source PDF.

        Returns:
            Bytes of a new PDF containing the sampled pages, or ``b''`` when
            the source is empty or the page import fails.
        """
        import pypdfium2 as pdfium

        pdf = pdfium.PdfDocument(src_pdf_bytes)
        total_page = len(pdf)
        if total_page == 0:
            logger.warning("PDF 为空,返回空文档")
            return b''

        # Sample without replacement so no page is counted twice.
        select_count = min(10, total_page)
        page_indices = np.random.choice(total_page, select_count, replace=False).tolist()

        sample_doc = pdfium.PdfDocument.new()
        # NOTE(review): sample_doc is never explicitly closed on either path;
        # kept as-is to stay byte-compatible with the upstream MinerU helper.
        try:
            sample_doc.import_pages(pdf, page_indices)
            pdf.close()
            buf = BytesIO()
            sample_doc.save(buf)
            return buf.getvalue()
        except Exception as e:
            pdf.close()
            logger.exception(e)
            return b''

    def detect_invalid_chars(sample_pdf_bytes: bytes) -> bool:
        """Detect garbled text: True when ``(cid:NNN)`` markers make up more
        than 5% of the extracted character stream.

        pdfminer emits ``(cid:NNN)`` placeholders when a glyph has no Unicode
        mapping; a high ratio means the text layer is unusable and OCR is
        needed.

        Args:
            sample_pdf_bytes: Raw bytes of the (sampled) PDF.

        Returns:
            True when the cid-marker ratio exceeds 0.05.
        """
        from pdfminer.high_level import extract_text
        from pdfminer.layout import LAParams

        laparams = LAParams(
            line_overlap=0.5, char_margin=2.0, line_margin=0.5,
            word_margin=0.1, boxes_flow=None, detect_vertical=False, all_texts=False,
        )
        text = extract_text(pdf_file=BytesIO(sample_pdf_bytes), laparams=laparams)
        text = text.replace('\n', '')

        cid_pattern = re.compile(r'\(cid:\d+\)')
        matches = cid_pattern.findall(text)
        cid_count = len(matches)
        cid_len = sum(len(m) for m in matches)
        text_len = len(text)

        if text_len == 0:
            return False
        # Denominator = visible chars with each cid marker collapsed to one
        # logical character ("radio" is upstream's spelling of "ratio").
        cid_radio = cid_count / (cid_count + text_len - cid_len)
        return cid_radio > 0.05
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+# ──────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
+# classify:始终使用自有实现(包含对 MinerU 原版的定制修改)
|
|
|
|
|
+# ──────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
+
|
|
|
|
|
def classify(pdf_bytes: bytes) -> str:
    """Decide whether a PDF's text can be extracted directly or needs OCR.

    Difference from the upstream MinerU implementation (upstream code is not
    modified): before checking image coverage, if the average characters per
    page is already >= ``chars_threshold * 4`` the document is treated as a
    definite text PDF and the coverage check is skipped. Typical case: a
    text-based bank-statement PDF with a full-page translucent watermark
    image — coverage is near 100% but every page has plenty of extractable
    text, so it must classify as 'txt', not 'ocr'.

    Args:
        pdf_bytes: Raw bytes of the PDF to classify.

    Returns:
        'txt' — text can be extracted directly.
        'ocr' — OCR is required (also returned for empty/unreadable input
        or on any internal error).
    """
    import pypdfium2 as pdfium

    sample_pdf_bytes = extract_pages(pdf_bytes)
    # extract_pages returns b'' for empty or broken PDFs; opening b'' with
    # pdfium would raise *outside* the try below, so guard it here instead
    # of letting the exception escape the function.
    if not sample_pdf_bytes:
        return 'ocr'

    pdf = None
    try:
        # Opened inside the try so an unreadable sample also falls through
        # to the 'ocr' error path instead of propagating.
        pdf = pdfium.PdfDocument(sample_pdf_bytes)
        page_count = len(pdf)
        if page_count == 0:
            return 'ocr'

        pages_to_check = min(page_count, 10)
        chars_threshold = 50

        # Too little text, or a garbled (cid:NNN-heavy) text layer → OCR.
        avg_chars = get_avg_cleaned_chars_per_page(pdf, pages_to_check)
        if avg_chars < chars_threshold or detect_invalid_chars(sample_pdf_bytes):
            return 'ocr'

        # Use image coverage only when the text volume is "borderline".
        # If text is already far above the threshold (>= 4x), treat it as a
        # definite text PDF and ignore background images such as watermarks.
        if avg_chars < chars_threshold * 4 and get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check) >= 0.8:
            return 'ocr'

        return 'txt'

    except Exception as e:
        logger.error(f"判断 PDF 类型时出错: {e}")
        return 'ocr'
    finally:
        # pdf stays None when the open itself failed.
        if pdf is not None:
            pdf.close()
|
|
|
|
|
+
|