convert_to_train_format.py 2.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. def convert_to_train_format(jso: dict) -> []:
  2. pages = []
  3. for k, v in jso.items():
  4. if not k.startswith("page_"):
  5. continue
  6. page_idx = v["page_idx"]
  7. width, height = v["page_size"]
  8. info = {"page_info": {"page_no": page_idx, "height": height, "width": width}}
  9. bboxes: list[dict] = []
  10. for img_bbox in v["image_bboxes_with_caption"]:
  11. bbox = {"category_id": 1, "bbox": img_bbox["bbox"]}
  12. if "caption" in img_bbox:
  13. bbox["caption_bbox"] = img_bbox["caption"]
  14. bboxes.append(bbox)
  15. for tbl_bbox in v["table_bboxes_with_caption"]:
  16. bbox = {"category_id": 7, "bbox": tbl_bbox["bbox"]}
  17. if "caption" in tbl_bbox:
  18. bbox["caption_bbox"] = tbl_bbox["caption"]
  19. bboxes.append(bbox)
  20. for bbox in v["bak_page_no_bboxes"]:
  21. n_bbox = {"category_id": 4, "bbox": bbox}
  22. bboxes.append(n_bbox)
  23. for bbox in v["bak_header_bboxes"]:
  24. n_bbox = {"category_id": 3, "bbox": bbox}
  25. bboxes.append(n_bbox)
  26. for bbox in v["bak_footer_bboxes"]:
  27. n_bbox = {"category_id": 6, "bbox": bbox}
  28. bboxes.append(n_bbox)
  29. # 脚注, 目前没有看到例子
  30. for para in v["para_blocks"]:
  31. if "paras" in para:
  32. paras = para["paras"]
  33. for para_key, para_content in paras.items():
  34. para_bbox = para_content["para_bbox"]
  35. is_para_title = para_content["is_para_title"]
  36. if is_para_title:
  37. n_bbox = {"category_id": 0, "bbox": para_bbox}
  38. else:
  39. n_bbox = {"category_id": 2, "bbox": para_bbox}
  40. bboxes.append(n_bbox)
  41. for inline_equation in v["inline_equations"]:
  42. n_bbox = {"category_id": 13, "bbox": inline_equation["bbox"]}
  43. bboxes.append(n_bbox)
  44. for inter_equation in v["interline_equations"]:
  45. n_bbox = {"category_id": 10, "bbox": inter_equation["bbox"]}
  46. bboxes.append(n_bbox)
  47. for footnote_bbox in v["bak_footer_note_bboxes"]:
  48. n_bbox = {"category_id": 5, "bbox": list(footnote_bbox)}
  49. bboxes.append(n_bbox)
  50. info["bboxes"] = bboxes
  51. info["layout_tree"] = v["layout_bboxes"]
  52. pages.append(info)
  53. return pages