# pdf_check.py (3.1 KB)
  1. import fitz
  2. import numpy as np
  3. from loguru import logger
  4. # import re
  5. # from io import BytesIO
  6. # from pdfminer.high_level import extract_text
  7. def calculate_sample_count(total_page: int):
  8. """
  9. 根据总页数和采样率计算采样页面的数量。
  10. """
  11. select_page_cnt = min(10, total_page)
  12. return select_page_cnt
  13. def extract_pages(src_pdf_bytes: bytes) -> fitz.Document:
  14. pdf_docs = fitz.open("pdf", src_pdf_bytes)
  15. total_page = len(pdf_docs)
  16. if total_page == 0:
  17. # 如果PDF没有页面,直接返回空文档
  18. logger.warning("PDF is empty, return empty document")
  19. return fitz.Document()
  20. select_page_cnt = calculate_sample_count(total_page)
  21. page_num = np.random.choice(total_page, select_page_cnt, replace=False)
  22. sample_docs = fitz.Document()
  23. try:
  24. for index in page_num:
  25. sample_docs.insert_pdf(pdf_docs, from_page=int(index), to_page=int(index))
  26. except Exception as e:
  27. logger.exception(e)
  28. return sample_docs
  29. # def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
  30. # """"
  31. # 检测PDF中是否包含非法字符
  32. # """
  33. # '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
  34. # sample_docs = extract_pages(src_pdf_bytes)
  35. # sample_pdf_bytes = sample_docs.tobytes()
  36. # sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
  37. # text = extract_text(sample_pdf_file_like_object)
  38. # text = text.replace("\n", "")
  39. # # logger.info(text)
  40. # '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
  41. # cid_pattern = re.compile(r'\(cid:\d+\)')
  42. # matches = cid_pattern.findall(text)
  43. # cid_count = len(matches)
  44. # cid_len = sum(len(match) for match in matches)
  45. # text_len = len(text)
  46. # if text_len == 0:
  47. # cid_chars_radio = 0
  48. # else:
  49. # cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
  50. # logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
  51. # '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
  52. # if cid_chars_radio > 0.05:
  53. # return False # 乱码文档
  54. # else:
  55. # return True # 正常文档
  56. def count_replacement_characters(text: str) -> int:
  57. """
  58. 统计字符串中 0xfffd 字符的数量。
  59. """
  60. return text.count('\ufffd')
  61. def detect_invalid_chars_by_pymupdf(src_pdf_bytes: bytes) -> bool:
  62. sample_docs = extract_pages(src_pdf_bytes)
  63. doc_text = ""
  64. for page in sample_docs:
  65. page_text = page.get_text('text', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)
  66. doc_text += page_text
  67. text_len = len(doc_text)
  68. uffd_count = count_replacement_characters(doc_text)
  69. if text_len == 0:
  70. uffd_chars_radio = 0
  71. else:
  72. uffd_chars_radio = uffd_count / text_len
  73. logger.info(f"uffd_count: {uffd_count}, text_len: {text_len}, uffd_chars_radio: {uffd_chars_radio}")
  74. '''当一篇文章存在1%以上的文本是乱码时,认为该文档为乱码文档'''
  75. if uffd_chars_radio > 0.01:
  76. return False # 乱码文档
  77. else:
  78. return True # 正常文档