|
@@ -1,9 +1,9 @@
|
|
|
-from io import BytesIO
|
|
|
|
|
-import re
|
|
|
|
|
import fitz
|
|
import fitz
|
|
|
import numpy as np
|
|
import numpy as np
|
|
|
from loguru import logger
|
|
from loguru import logger
|
|
|
-from pdfminer.high_level import extract_text
|
|
|
|
|
|
|
+# import re
|
|
|
|
|
+# from io import BytesIO
|
|
|
|
|
+# from pdfminer.high_level import extract_text
|
|
|
|
|
|
|
|
|
|
|
|
|
def calculate_sample_count(total_page: int):
|
|
def calculate_sample_count(total_page: int):
|
|
@@ -14,7 +14,7 @@ def calculate_sample_count(total_page: int):
|
|
|
return select_page_cnt
|
|
return select_page_cnt
|
|
|
|
|
|
|
|
|
|
|
|
|
-def extract_pages(src_pdf_bytes: bytes):
|
|
|
|
|
|
|
+def extract_pages(src_pdf_bytes: bytes) -> fitz.Document:
|
|
|
pdf_docs = fitz.open("pdf", src_pdf_bytes)
|
|
pdf_docs = fitz.open("pdf", src_pdf_bytes)
|
|
|
total_page = len(pdf_docs)
|
|
total_page = len(pdf_docs)
|
|
|
if total_page == 0:
|
|
if total_page == 0:
|
|
@@ -33,30 +33,57 @@ def extract_pages(src_pdf_bytes: bytes):
|
|
|
return sample_docs
|
|
return sample_docs
|
|
|
|
|
|
|
|
|
|
|
|
|
-def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
|
|
|
|
|
- """"
|
|
|
|
|
- 检测PDF中是否包含非法字符
|
|
|
|
|
|
|
+# def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
|
|
|
|
|
+# """"
|
|
|
|
|
+# 检测PDF中是否包含非法字符
|
|
|
|
|
+# """
|
|
|
|
|
+# '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
|
|
|
|
|
+# sample_docs = extract_pages(src_pdf_bytes)
|
|
|
|
|
+# sample_pdf_bytes = sample_docs.tobytes()
|
|
|
|
|
+# sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
|
|
|
|
|
+# text = extract_text(sample_pdf_file_like_object)
|
|
|
|
|
+# text = text.replace("\n", "")
|
|
|
|
|
+# # logger.info(text)
|
|
|
|
|
+# '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
|
|
|
|
|
+# cid_pattern = re.compile(r'\(cid:\d+\)')
|
|
|
|
|
+# matches = cid_pattern.findall(text)
|
|
|
|
|
+# cid_count = len(matches)
|
|
|
|
|
+# cid_len = sum(len(match) for match in matches)
|
|
|
|
|
+# text_len = len(text)
|
|
|
|
|
+# if text_len == 0:
|
|
|
|
|
+# cid_chars_radio = 0
|
|
|
|
|
+# else:
|
|
|
|
|
+# cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
|
|
|
|
|
+# logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
|
|
|
|
|
+# '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
|
|
|
|
|
+# if cid_chars_radio > 0.05:
|
|
|
|
|
+# return False # 乱码文档
|
|
|
|
|
+# else:
|
|
|
|
|
+# return True # 正常文档
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def count_replacement_characters(text: str) -> int:
|
|
|
|
|
+ """
|
|
|
|
|
+ 统计字符串中 0xfffd 字符的数量。
|
|
|
"""
|
|
"""
|
|
|
- '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
|
|
|
|
|
|
|
+ return text.count('\ufffd')
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def detect_invalid_chars_by_pymupdf(src_pdf_bytes: bytes) -> bool:
|
|
|
sample_docs = extract_pages(src_pdf_bytes)
|
|
sample_docs = extract_pages(src_pdf_bytes)
|
|
|
- sample_pdf_bytes = sample_docs.tobytes()
|
|
|
|
|
- sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
|
|
|
|
|
- text = extract_text(sample_pdf_file_like_object)
|
|
|
|
|
- text = text.replace("\n", "")
|
|
|
|
|
- # logger.info(text)
|
|
|
|
|
- '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
|
|
|
|
|
- cid_pattern = re.compile(r'\(cid:\d+\)')
|
|
|
|
|
- matches = cid_pattern.findall(text)
|
|
|
|
|
- cid_count = len(matches)
|
|
|
|
|
- cid_len = sum(len(match) for match in matches)
|
|
|
|
|
- text_len = len(text)
|
|
|
|
|
|
|
+ doc_text = ""
|
|
|
|
|
+ for page in sample_docs:
|
|
|
|
|
+ page_text = page.get_text('text', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)
|
|
|
|
|
+ doc_text += page_text
|
|
|
|
|
+ text_len = len(doc_text)
|
|
|
|
|
+ uffd_count = count_replacement_characters(doc_text)
|
|
|
if text_len == 0:
|
|
if text_len == 0:
|
|
|
- cid_chars_radio = 0
|
|
|
|
|
|
|
+ uffd_chars_radio = 0
|
|
|
else:
|
|
else:
|
|
|
- cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
|
|
|
|
|
- logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
|
|
|
|
|
- '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
|
|
|
|
|
- if cid_chars_radio > 0.05:
|
|
|
|
|
|
|
+ uffd_chars_radio = uffd_count / text_len
|
|
|
|
|
+ logger.info(f"uffd_count: {uffd_count}, text_len: {text_len}, uffd_chars_radio: {uffd_chars_radio}")
|
|
|
|
|
+ '''当一篇文章存在1%以上的文本是乱码时,认为该文档为乱码文档'''
|
|
|
|
|
+ if uffd_chars_radio > 0.01:
|
|
|
return False # 乱码文档
|
|
return False # 乱码文档
|
|
|
else:
|
|
else:
|
|
|
- return True # 正常文档
|
|
|
|
|
|
|
+ return True # 正常文档
|