# ocr_cut_image.py
  1. from magic_pdf.libs.commons import join_path
  2. from magic_pdf.libs.ocr_content_type import ContentType
  3. from magic_pdf.libs.pdf_image_tools import cut_image
  4. def cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client):
  5. def s3_return_path(type):
  6. return join_path(book_name, type)
  7. def img_save_path(type):
  8. return join_path(save_path, s3_return_path(type))
  9. for span in spans:
  10. span_type = span['type']
  11. if span_type == ContentType.Image:
  12. span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'), s3_return_path=s3_return_path('images'), img_s3_client=img_s3_client)
  13. elif span_type == ContentType.Table:
  14. span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'), s3_return_path=s3_return_path('tables'), img_s3_client=img_s3_client)
  15. return spans