pdf_check.py 1.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859
  1. from io import BytesIO
  2. import re
  3. import fitz
  4. import numpy as np
  5. from loguru import logger
  6. from pdfminer.high_level import extract_text
  7. def calculate_sample_count(total_page: int, sample_ratio=0.1):
  8. """
  9. 根据总页数和采样率计算采样页面的数量。
  10. """
  11. select_page_cnt = int(total_page * sample_ratio)
  12. if select_page_cnt < 5:
  13. select_page_cnt = min(10, total_page)
  14. elif select_page_cnt > 10:
  15. select_page_cnt = 10
  16. return select_page_cnt
  17. def extract_pages(src_pdf_bytes: bytes):
  18. pdf_docs = fitz.open("pdf", src_pdf_bytes)
  19. total_page = len(pdf_docs)
  20. if total_page == 0:
  21. # 如果PDF没有页面,直接返回空文档
  22. logger.warning("PDF is empty, return empty document")
  23. return fitz.Document()
  24. select_page_cnt = calculate_sample_count(total_page)
  25. page_num = np.random.choice(total_page, select_page_cnt, replace=False)
  26. sample_docs = fitz.Document()
  27. try:
  28. for index in page_num:
  29. sample_docs.insert_pdf(pdf_docs, from_page=int(index), to_page=int(index))
  30. except Exception as e:
  31. logger.exception(e)
  32. return sample_docs
  33. def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
  34. """"
  35. 检测PDF中是否包含非法字符
  36. """
  37. '''需要使用'''
  38. sample_docs = extract_pages(src_pdf_bytes)
  39. sample_pdf_bytes = sample_docs.tobytes()
  40. sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
  41. text = extract_text(sample_pdf_file_like_object)
  42. # logger.info(text)
  43. '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
  44. cid_pattern = re.compile(r'\(cid:\d+\)')
  45. matches = cid_pattern.findall(text)
  46. cid_count = len(matches)
  47. text_len = len(text)
  48. logger.info(f"cid_count: {cid_count}, text_len: {text_len}")
  49. if cid_count > 10:
  50. return False # 乱码文档
  51. else:
  52. return True # 正常文档