magic_model.py 3.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
  1. import json
  2. from magic_pdf.libs.commons import fitz
  3. from loguru import logger
  4. from magic_pdf.libs.commons import join_path
  5. from magic_pdf.libs.coordinate_transform import get_scale_ratio
  6. from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
  7. from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
  8. class MagicModel():
  9. """
  10. 每个函数没有得到元素的时候返回空list
  11. """
  12. def __fix_axis(self):
  13. for model_page_info in self.__model_list:
  14. page_no = model_page_info['page_info']['page_no']
  15. horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(model_page_info, self.__docs[page_no])
  16. layout_dets = model_page_info["layout_dets"]
  17. for layout_det in layout_dets:
  18. x0, y0, _, _, x1, y1, _, _ = layout_det["poly"]
  19. bbox = [
  20. int(x0 / horizontal_scale_ratio),
  21. int(y0 / vertical_scale_ratio),
  22. int(x1 / horizontal_scale_ratio),
  23. int(y1 / vertical_scale_ratio),
  24. ]
  25. layout_det["bbox"] = bbox
  26. def __init__(self, model_list: list, docs: fitz.Document):
  27. self.__model_list = model_list
  28. self.__docs = docs
  29. self.__fix_axis()
  30. def get_imgs(self, page_no: int): # @许瑞
  31. image_block = {
  32. }
  33. image_block['bbox'] = [x0, y0, x1, y1] # 计算出来
  34. image_block['img_body_bbox'] = [x0, y0, x1, y1]
  35. image_blcok['img_caption_bbox'] = [x0, y0, x1, y1] # 如果没有就是None,但是保证key存在
  36. image_blcok['img_caption_text'] = [x0, y0, x1, y1] # 如果没有就是空字符串,但是保证key存在
  37. return [image_block, ]
  38. def get_tables(self, page_no: int) -> list: # 3个坐标, caption, table主体,table-note
  39. pass # 许瑞, 结构和image一样
  40. def get_equations(self, page_no: int) -> list: # 有坐标,也有字
  41. return inline_equations, interline_equations # @凯文
  42. def get_discarded(self, page_no: int) -> list: # 自研模型,只有坐标
  43. pass # @凯文
  44. def get_text_blocks(self, page_no: int) -> list: # 自研模型搞的,只有坐标,没有字
  45. pass # @凯文
  46. def get_title_blocks(self, page_no: int) -> list: # 自研模型,只有坐标,没字
  47. pass # @凯文
  48. def get_ocr_text(self, page_no: int) -> list: # paddle 搞的,有字也有坐标
  49. pass # @小蒙
  50. def get_ocr_spans(self, page_no: int) -> list:
  51. pass # @小蒙
  52. if __name__ == '__main__':
  53. drw = DiskReaderWriter(r"D:/project/20231108code-clean")
  54. pdf_file_path = r"linshixuqiu\19983-00.pdf"
  55. model_file_path = r"linshixuqiu\19983-00_new.json"
  56. pdf_bytes = drw.read(pdf_file_path, AbsReaderWriter.MODE_BIN)
  57. model_json_txt = drw.read(model_file_path, AbsReaderWriter.MODE_TXT)
  58. model_list = json.loads(model_json_txt)
  59. write_path = r"D:\project\20231108code-clean\linshixuqiu\19983-00"
  60. img_bucket_path = "imgs"
  61. img_writer = DiskReaderWriter(join_path(write_path, img_bucket_path))
  62. pdf_docs = fitz.open("pdf", pdf_bytes)
  63. magic_model = MagicModel(model_list, pdf_docs)