| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121 |
- from magic_pdf.libs.commons import fitz
- from magic_pdf.para.commons import *
# Switch stdout to UTF-8 at import time.
# ``reconfigure`` is only provided by ``io.TextIOWrapper`` (Python 3.7+), and
# ``sys.stdout`` may be ``None`` or replaced by an object without that method,
# so probe for it instead of relying on the major version alone.
if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
class DrawAnnos:
    """
    This class draws paragraph annotations (rectangles) on the pages of a pdf file.

    ----------------------------------------
                Color Code
    ----------------------------------------
        Red: (1, 0, 0)
        Green: (0, 1, 0)
        Blue: (0, 0, 1)
        Yellow: (1, 1, 0) - mix of red and green
        Cyan: (0, 1, 1) - mix of green and blue
        Magenta: (1, 0, 1) - mix of red and blue
        White: (1, 1, 1) - red, green and blue full intensity
        Black: (0, 0, 0) - no color component whatsoever
        Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components
        Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component

    Colors used by this class
    -------------------------
        Green  - a normal (single-box) paragraph
        Cyan   - every box of a combined (multi-box) paragraph
        Blue   - a paragraph flagged as title (single or combined)
    """

    def __init__(self) -> None:
        pass

    def __is_nested_list(self, lst):
        """
        This function returns True if the given list is a nested list of any degree,
        i.e. it is a list and at least one of its direct elements is a list.

        Parameters
        ----------
        lst : object
            value to inspect; anything that is not a ``list`` yields False

        Returns
        -------
        bool
            True when `lst` is a list containing at least one list
        """
        # A deeper level of nesting always implies a list element at this level,
        # so checking the direct children is sufficient.
        return isinstance(lst, list) and any(isinstance(item, list) for item in lst)

    def __valid_rect(self, bbox):
        """
        This function tells whether `bbox` describes a non-empty rectangle.

        Parameters
        ----------
        bbox : sequence
            expected as (x0, y0, x1, y1)

        Returns
        -------
        bool
            False for values without a length, with fewer than four items, or
            whose first item is a list (a nested list is not a rectangle);
            otherwise whether x0 < x1 and y0 < y1
        """
        try:
            if len(bbox) < 4:
                return False  # Empty or truncated bbox can't be a rectangle
        except TypeError:
            return False  # None, numbers, ... have no length

        if isinstance(bbox[0], list):
            return False  # It's a nested list, hence it can't be valid rect

        # Ensure that the rectangle is not empty or inverted
        return bbox[0] < bbox[2] and bbox[1] < bbox[3]

    def __draw_rect(self, page, bbox, color, width):
        """
        This function adds one rectangle annotation for `bbox` to `page`.

        Parameters
        ----------
        page : fitz.Page
            page
        bbox : sequence
            rectangle coordinates passed to ``fitz.Rect``
        color : tuple
            stroke color
        width : float
            border width
        """
        anno = page.add_rect_annot(fitz.Rect(bbox))
        anno.set_colors(stroke=color)
        anno.set_border(width=width)
        anno.update()

    def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)):
        """
        This function draws the nested boxes

        Parameters
        ----------
        page : fitz.Page
            page
        nested_bbox : list
            nested bbox
        color : tuple
            color, by default (0, 1, 1)    # draw with cyan color for combined paragraph
        """
        if self.__is_nested_list(nested_bbox):  # If it's a nested list
            for bbox in nested_bbox:
                self.__draw_nested_boxes(page, bbox, color)  # Recursively call the function
        elif self.__valid_rect(nested_bbox):  # If valid rectangle
            self.__draw_rect(page, nested_bbox, color, 1)

    def __draw_para_bbox(self, page, para_bbox, combined_color, single_color):
        """
        This function draws one paragraph bbox, which is either a single
        rectangle or a nested list of rectangles (combined paragraph).

        Parameters
        ----------
        page : fitz.Page
            page
        para_bbox : list
            a flat bbox or a nested list of bboxes
        combined_color : tuple
            stroke color used for every box of a nested bbox (border width 1)
        single_color : tuple
            stroke color used for a flat bbox (border width 0.5)
        """
        if self.__is_nested_list(para_bbox) and len(para_bbox) > 1:
            self.__draw_nested_boxes(page, para_bbox, combined_color)
        elif self.__valid_rect(para_bbox):
            self.__draw_rect(page, para_bbox, single_color, 0.5)
        # NOTE(review): a nested list holding exactly one box matches neither
        # branch and is therefore not drawn - behaviour kept as it was; confirm
        # whether that is intended.

    def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path):
        """
        This function draws the paragraph boxes described by `pdf_dic` on the
        input pdf and saves the result as a new pdf.

        Parameters
        ----------
        input_pdf_path : str
            path of the pdf to annotate, opened with ``open_pdf``
        pdf_dic : dict or None
            maps "page_<index>" to a dict whose "para_blocks" entry is a list of
            blocks; a block may hold a "paras" dict whose values provide
            "para_bbox" and "is_para_title". None, or pages without an entry,
            result in no annotation for those pages.
        output_pdf_path : str or None
            where to save the annotated pdf; when None, "_anno" is inserted
            before the extension of `input_pdf_path`
        """
        import os  # local import: the enclosing file's import block is left untouched

        if pdf_dic is None:
            pdf_dic = {}

        if output_pdf_path is None:
            # splitext only touches the final extension, so ".pdf" inside a
            # directory name is left alone and ".PDF" / missing extensions still
            # produce a path different from the input.
            root, ext = os.path.splitext(input_pdf_path)
            output_pdf_path = f"{root}_anno{ext or '.pdf'}"

        pdf_doc = open_pdf(input_pdf_path)
        try:
            for page_id, page in enumerate(pdf_doc):  # type: ignore
                page_data = pdf_dic.get(f"page_{page_id}") or {}

                for para_block in page_data.get("para_blocks") or []:
                    if "paras" not in para_block:
                        continue

                    for para_content in para_block["paras"].values():
                        para_bbox = para_content["para_bbox"]

                        # cyan for combined paragraph, green for normal paragraph
                        self.__draw_para_bbox(page, para_bbox, (0, 1, 1), (0, 1, 0))

                        if para_content["is_para_title"]:
                            # blue for both combined and normal title, drawn on
                            # top of the paragraph annotation added above
                            self.__draw_para_bbox(page, para_bbox, (0, 0, 1), (0, 0, 1))

            pdf_doc.save(output_pdf_path)
        finally:
            # Release the document even when drawing or saving fails
            pdf_doc.close()
|