draw.py 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. from magic_pdf.libs.commons import fitz
  2. from magic_pdf.para.commons import *
  3. if sys.version_info[0] >= 3:
  4. sys.stdout.reconfigure(encoding="utf-8") # type: ignore
  5. class DrawAnnos:
  6. """
  7. This class draws annotations on the pdf file
  8. ----------------------------------------
  9. Color Code
  10. ----------------------------------------
  11. Red: (1, 0, 0)
  12. Green: (0, 1, 0)
  13. Blue: (0, 0, 1)
  14. Yellow: (1, 1, 0) - mix of red and green
  15. Cyan: (0, 1, 1) - mix of green and blue
  16. Magenta: (1, 0, 1) - mix of red and blue
  17. White: (1, 1, 1) - red, green and blue full intensity
  18. Black: (0, 0, 0) - no color component whatsoever
  19. Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components
  20. Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component
  21. """
  22. def __init__(self) -> None:
  23. pass
  24. def __is_nested_list(self, lst):
  25. """
  26. This function returns True if the given list is a nested list of any degree.
  27. """
  28. if isinstance(lst, list):
  29. return any(self.__is_nested_list(i) for i in lst) or any(isinstance(i, list) for i in lst)
  30. return False
  31. def __valid_rect(self, bbox):
  32. # Ensure that the rectangle is not empty or invalid
  33. if isinstance(bbox[0], list):
  34. return False # It's a nested list, hence it can't be valid rect
  35. else:
  36. return bbox[0] < bbox[2] and bbox[1] < bbox[3]
  37. def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)):
  38. """
  39. This function draws the nested boxes
  40. Parameters
  41. ----------
  42. page : fitz.Page
  43. page
  44. nested_bbox : list
  45. nested bbox
  46. color : tuple
  47. color, by default (0, 1, 1) # draw with cyan color for combined paragraph
  48. """
  49. if self.__is_nested_list(nested_bbox): # If it's a nested list
  50. for bbox in nested_bbox:
  51. self.__draw_nested_boxes(page, bbox, color) # Recursively call the function
  52. elif self.__valid_rect(nested_bbox): # If valid rectangle
  53. para_rect = fitz.Rect(nested_bbox)
  54. para_anno = page.add_rect_annot(para_rect)
  55. para_anno.set_colors(stroke=color) # draw with cyan color for combined paragraph
  56. para_anno.set_border(width=1)
  57. para_anno.update()
  58. def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path):
  59. pdf_doc = open_pdf(input_pdf_path)
  60. if pdf_dic is None:
  61. pdf_dic = {}
  62. if output_pdf_path is None:
  63. output_pdf_path = input_pdf_path.replace(".pdf", "_anno.pdf")
  64. for page_id, page in enumerate(pdf_doc): # type: ignore
  65. page_key = f"page_{page_id}"
  66. for ele_key, ele_data in pdf_dic[page_key].items():
  67. if ele_key == "para_blocks":
  68. para_blocks = ele_data
  69. for para_block in para_blocks:
  70. if "paras" in para_block.keys():
  71. paras = para_block["paras"]
  72. for para_key, para_content in paras.items():
  73. para_bbox = para_content["para_bbox"]
  74. # print(f"para_bbox: {para_bbox}")
  75. # print(f"is a nested list: {self.__is_nested_list(para_bbox)}")
  76. if self.__is_nested_list(para_bbox) and len(para_bbox) > 1:
  77. color = (0, 1, 1)
  78. self.__draw_nested_boxes(
  79. page, para_bbox, color
  80. ) # draw with cyan color for combined paragraph
  81. else:
  82. if self.__valid_rect(para_bbox):
  83. para_rect = fitz.Rect(para_bbox)
  84. para_anno = page.add_rect_annot(para_rect)
  85. para_anno.set_colors(stroke=(0, 1, 0)) # draw with green color for normal paragraph
  86. para_anno.set_border(width=0.5)
  87. para_anno.update()
  88. is_para_title = para_content["is_para_title"]
  89. if is_para_title:
  90. if self.__is_nested_list(para_content["para_bbox"]) and len(para_content["para_bbox"]) > 1:
  91. color = (0, 0, 1)
  92. self.__draw_nested_boxes(
  93. page, para_content["para_bbox"], color
  94. ) # draw with cyan color for combined title
  95. else:
  96. if self.__valid_rect(para_content["para_bbox"]):
  97. para_rect = fitz.Rect(para_content["para_bbox"])
  98. if self.__valid_rect(para_content["para_bbox"]):
  99. para_anno = page.add_rect_annot(para_rect)
  100. para_anno.set_colors(stroke=(0, 0, 1)) # draw with blue color for normal title
  101. para_anno.set_border(width=0.5)
  102. para_anno.update()
  103. pdf_doc.save(output_pdf_path)
  104. pdf_doc.close()