ocr_cut_image.py 732 B

123456789101112131415161718
  1. from magic_pdf.libs.commons import join_path
  2. from magic_pdf.libs.ocr_content_type import ContentType
  3. from magic_pdf.libs.pdf_image_tools import cut_image
  4. def cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
  5. def return_path(type):
  6. return join_path(pdf_bytes_md5, type)
  7. for span in spans:
  8. span_type = span['type']
  9. if span_type == ContentType.Image:
  10. span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'), imageWriter=imageWriter)
  11. elif span_type == ContentType.Table:
  12. span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'), imageWriter=imageWriter)
  13. return spans