# ocr_cut_image.py
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.pdf_image_tools import cut_image
  3. def cut_image_and_table(spans, page, page_id, book_name, save_path):
  4. def s3_return_path(type):
  5. return join_path(book_name, type)
  6. def img_save_path(type):
  7. return join_path(save_path, s3_return_path(type))
  8. for span in spans:
  9. span_type = span['type']
  10. if span_type == 'image':
  11. span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'))
  12. elif span_type == 'table':
  13. span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'))
  14. return spans