|
|
@@ -1,21 +1,66 @@
|
|
|
from magic_pdf.libs.commons import fitz # PyMuPDF
|
|
|
|
|
|
-# PDF文件路径
|
|
|
-pdf_path = r"D:\project\20231108code-clean\magic_pdf\tmp\unittest\download-pdfs\ocr_1.json.pdf"
|
|
|
-
|
|
|
-doc = fitz.open(pdf_path) # Open the PDF
|
|
|
-# 你的数据
|
|
|
-data = [[(294.7569528415961, 776.8430953398889, 300.8827085852479, 786.922616502779), (460.1523579201934, 776.8430953398889, 509.51874244256345, 787.2825994014537)], [(294.03627569528413, 779.7229585292861, 301.24304715840384, 788.3625480974777), (85.76058041112454, 781.882855921334, 156.74727932285367, 789.8024796921762)], [(293.6759371221282, 779.7229585292861, 301.60338573155985, 788.7225309961523), (459.43168077388145, 779.7229585292861, 508.7980652962515, 789.8024796921762)], [(295.8379685610641, 780.0829414279607, 301.24304715840384, 788.0025651988029), (85.76058041112454, 781.5228730226593, 156.74727932285367, 790.1624625908509)], [(294.03627569528413, 779.7229585292861, 301.60338573155985, 789.0825138948269), (459.79201934703747, 779.7229585292861, 508.4377267230955, 789.4424967935015)], [(86.4812575574365, 781.882855921334, 156.0266021765417, 789.8024796921762)], [(294.39661426844015, 779.7229585292861, 301.24304715840384, 788.3625480974777), (459.43168077388145, 779.7229585292861, 508.7980652962515, 789.4424967935015)], [(294.03627569528413, 779.7229585292861, 301.24304715840384, 788.3625480974777), (85.76058041112454, 781.5228730226593, 156.74727932285367, 789.8024796921762)], [(294.39661426844015, 779.7229585292861, 300.8827085852479, 788.3625480974777)]]
|
|
|
-
|
|
|
-# 对每个页面进行处理
|
|
|
-for i, page in enumerate(doc):
|
|
|
- # 获取当前页面的数据
|
|
|
- page_data = data[i]
|
|
|
- for img in page_data:
|
|
|
- # x0, y0, x1, y1, _ = img
|
|
|
- x0, y0, x1, y1 = img
|
|
|
def draw_bbox(i, bbox_list, page, rgb_config):
    """Outline every bounding box stored for page index ``i`` on ``page``.

    Args:
        i: Index into ``bbox_list`` selecting the boxes of the current page.
        bbox_list: Sequence with one entry per page; each entry is an
            iterable of ``(x0, y0, x1, y1)`` boxes.
        page: Page object to draw on; it must provide ``draw_rect``.
        rgb_config: Colour channels on a 0-255 scale, e.g. ``[255, 0, 0]``.
    """
    # Each 0-255 channel is divided by 255 before being used as stroke colour.
    stroke_color = [float(channel) / 255 for channel in rgb_config]
    for x0, y0, x1, y1 in bbox_list[i]:
        page.draw_rect(
            fitz.Rect(x0, y0, x1, y1),
            color=stroke_color,
            fill=None,
            width=0.5,
            overlay=True,
        )
|
|
|
|
|
|
-# Save the PDF
|
|
|
-doc.save(r"D:\project\20231108code-clean\magic_pdf\tmp\unittest\download-pdfs\ocr_1.json_new.pdf")
|
|
|
+
|
|
|
def draw_layout_bbox(pdf_info_dict, input_path, out_path):
    """Save a copy of the PDF with each page's layout boxes outlined and numbered.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts. Each value
            must contain a ``'layout_bboxes'`` list of dicts that carry a
            ``'layout_bbox'`` entry unpackable as ``(x0, y0, x1, y1)``.
            Values are taken in iteration order and matched to document
            pages by position.
        input_path: Path of the PDF to open.
        out_path: Directory that receives ``layout.pdf``.

    Raises:
        IndexError: If the document has more pages than ``pdf_info_dict``
            has values.
    """
    layout_bbox_list = [
        [layout['layout_bbox'] for layout in page_info['layout_bboxes']]
        for page_info in pdf_info_dict.values()
    ]

    red = (1, 0, 0)
    # The context manager closes the document even when drawing or saving
    # raises, so the file handle is not leaked.
    with fitz.open(input_path) as doc:
        for i, page in enumerate(doc):
            # Boxes recorded for the current page.
            for j, bbox in enumerate(layout_bbox_list[i]):
                x0, y0, x1, y1 = bbox
                page.draw_rect(
                    fitz.Rect(x0, y0, x1, y1),
                    color=red,
                    fill=None,
                    width=0.5,
                    overlay=True,
                )
                # Label the box with its 1-based index at its (x0, y0) corner.
                page.insert_text((x0, y0), str(j + 1), fontsize=10, color=red)
        # Save once, after every page has been annotated.
        doc.save(f"{out_path}/layout.pdf")
|
|
|
+
|
|
|
def draw_text_bbox(pdf_info_dict, input_path, out_path):
    """Save a copy of the PDF with span boxes outlined, colour-coded by span type.

    Spans of type ``'text'`` are drawn in red, ``'inline_equation'`` in green
    and ``'displayed_equation'`` in blue; spans of any other type are ignored.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts. Each value
            must contain ``'preproc_blocks'``: a list of blocks with
            ``'lines'``, each line with ``'spans'``, each span with
            ``'type'`` and (for the three drawn types) ``'bbox'``. Values are
            taken in iteration order and matched to document pages by
            position.
        input_path: Path of the PDF to open.
        out_path: Directory that receives ``text.pdf``.

    Raises:
        IndexError: If the document has more pages than ``pdf_info_dict``
            has values.
    """
    text_list = []
    inline_equation_list = []
    displayed_equation_list = []
    for page_info in pdf_info_dict.values():
        # One bucket per drawn span type for this page.
        boxes_by_type = {
            'text': [],
            'inline_equation': [],
            'displayed_equation': [],
        }
        for block in page_info['preproc_blocks']:
            for line in block['lines']:
                for span in line['spans']:
                    bucket = boxes_by_type.get(span['type'])
                    if bucket is not None:
                        bucket.append(span['bbox'])
        text_list.append(boxes_by_type['text'])
        inline_equation_list.append(boxes_by_type['inline_equation'])
        displayed_equation_list.append(boxes_by_type['displayed_equation'])

    # The context manager closes the document even when drawing or saving
    # raises, so the file handle is not leaked.
    with fitz.open(input_path) as doc:
        for i, page in enumerate(doc):
            # Draw the current page's boxes, one colour per span type.
            draw_bbox(i, text_list, page, [255, 0, 0])
            draw_bbox(i, inline_equation_list, page, [0, 255, 0])
            draw_bbox(i, displayed_equation_list, page, [0, 0, 255])
        # Save once, after every page has been annotated.
        doc.save(f"{out_path}/text.pdf")
|