| 12345678910111213141516171819202122232425262728293031323334353637383940 |
- from typing import List
- import math
- import pypdfium2 as pdfium
- from pdftext.pdf.chars import get_chars, deduplicate_chars
- from pdftext.pdf.pages import get_spans, get_lines, assign_scripts, get_blocks
def get_page(
    page: pdfium.PdfPage,
    quote_loosebox: bool = True,
    superscript_height_threshold: float = 0.7,
    line_distance_threshold: float = 0.1,
) -> dict:
    """Extract the text layout of a single PDF page as a plain dict.

    The page's characters are read from its text page, deduplicated, and
    then grouped step by step into spans, lines and finally blocks.

    Args:
        page: The pypdfium2 page to extract from.
        quote_loosebox: Forwarded unchanged to ``get_chars``.
        superscript_height_threshold: Forwarded to ``get_spans`` and to
            ``assign_scripts`` (as ``height_threshold``).
        line_distance_threshold: Forwarded to ``get_spans`` and
            ``assign_scripts``.

    Returns:
        A dict with the keys ``"bbox"`` (the value of ``page.get_bbox()``),
        ``"width"`` and ``"height"`` (bbox extents rounded up to whole
        units), ``"rotation"`` and ``"blocks"`` (the result of
        ``get_blocks``).
    """
    textpage = page.get_textpage()
    page_bbox: List[float] = page.get_bbox()

    # abs() keeps the extents positive regardless of the order in which the
    # bbox coordinates are reported.
    page_width = math.ceil(abs(page_bbox[2] - page_bbox[0]))
    page_height = math.ceil(abs(page_bbox[1] - page_bbox[3]))

    # Best effort: if the rotation cannot be read, treat the page as
    # unrotated. Only ``Exception`` is caught so that KeyboardInterrupt and
    # SystemExit still propagate.
    try:
        page_rotation = page.get_rotation()
    except Exception:
        page_rotation = 0

    chars = deduplicate_chars(
        get_chars(textpage, page_bbox, page_rotation, quote_loosebox)
    )
    spans = get_spans(
        chars,
        superscript_height_threshold=superscript_height_threshold,
        line_distance_threshold=line_distance_threshold,
    )
    lines = get_lines(spans)
    # The return value is not used; presumably this annotates ``lines`` in
    # place -- confirm against pdftext.pdf.pages.assign_scripts.
    assign_scripts(
        lines,
        height_threshold=superscript_height_threshold,
        line_distance_threshold=line_distance_threshold,
    )
    blocks = get_blocks(lines)

    # Use a dedicated name instead of rebinding the ``page`` parameter.
    page_data = {
        "bbox": page_bbox,
        "width": page_width,
        "height": page_height,
        "rotation": page_rotation,
        "blocks": blocks
    }
    return page_data
|