pdf_text_tool.py 1.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940
  1. from typing import List
  2. import math
  3. import pypdfium2 as pdfium
  4. from pdftext.pdf.chars import get_chars, deduplicate_chars
  5. from pdftext.pdf.pages import get_spans, get_lines, assign_scripts, get_blocks
  6. def get_page(
  7. page: pdfium.PdfPage,
  8. quote_loosebox: bool =True,
  9. superscript_height_threshold: float = 0.7,
  10. line_distance_threshold: float = 0.1,
  11. ) -> dict:
  12. textpage = page.get_textpage()
  13. page_bbox: List[float] = page.get_bbox()
  14. page_width = math.ceil(abs(page_bbox[2] - page_bbox[0]))
  15. page_height = math.ceil(abs(page_bbox[1] - page_bbox[3]))
  16. page_rotation = 0
  17. try:
  18. page_rotation = page.get_rotation()
  19. except:
  20. pass
  21. chars = deduplicate_chars(get_chars(textpage, page_bbox, page_rotation, quote_loosebox))
  22. spans = get_spans(chars, superscript_height_threshold=superscript_height_threshold, line_distance_threshold=line_distance_threshold)
  23. lines = get_lines(spans)
  24. assign_scripts(lines, height_threshold=superscript_height_threshold, line_distance_threshold=line_distance_threshold)
  25. blocks = get_blocks(lines)
  26. page = {
  27. "bbox": page_bbox,
  28. "width": page_width,
  29. "height": page_height,
  30. "rotation": page_rotation,
  31. "blocks": blocks
  32. }
  33. return page