|
|
@@ -1,59 +1,66 @@
|
|
|
-from pathlib import Path
|
|
|
-
|
|
|
-from magic_pdf.libs.commons import fitz, join_path # PyMuPDF
|
|
|
-from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
|
|
|
-import json
|
|
|
-import os
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-def read_json_file(file_path):
|
|
|
- with open(file_path, 'r') as f:
|
|
|
- data = json.load(f)
|
|
|
- return data
|
|
|
-
|
|
|
-
|
|
|
-# PDF文件路径
|
|
|
-pdf_path = "D:\\projects\\Magic-PDF\\ocr_demo\\ocr_2_org.pdf"
|
|
|
-
|
|
|
-doc = fitz.open(pdf_path) # Open the PDF
|
|
|
-# 你的数据
|
|
|
-data = [[[-2, 0, 603, 80, 24]], [[-3, 0, 602, 80, 24]]]
|
|
|
-ocr_json_file_path = r"D:\projects\Magic-PDF\ocr_demo\ocr_2.json"
|
|
|
-ocr_pdf_info = read_json_file(ocr_json_file_path)
|
|
|
-
|
|
|
-pth = Path(ocr_json_file_path)
|
|
|
-book_name = pth.name
|
|
|
-save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
|
|
|
-save_path = join_path(save_tmp_path, "md")
|
|
|
-
|
|
|
-pdf_info_dict = parse_pdf_by_ocr(
|
|
|
- pdf_path,
|
|
|
- None,
|
|
|
- ocr_pdf_info,
|
|
|
- save_path,
|
|
|
- book_name,
|
|
|
- debug_mode=True)
|
|
|
-data_list = []
|
|
|
-for page in pdf_info_dict.values():
|
|
|
- page_list = []
|
|
|
- blocks = page.get("preproc_blocks")
|
|
|
- for block in blocks:
|
|
|
- lines = block.get("lines")
|
|
|
- for line in lines:
|
|
|
- spans = line.get("spans")
|
|
|
- for span in spans:
|
|
|
- page_list.append(span["bbox"])
|
|
|
- data_list.append(page_list)
|
|
|
-# 对每个页面进行处理
|
|
|
-for i, page in enumerate(doc):
|
|
|
- # 获取当前页面的数据
|
|
|
- page_data = data_list[i]
|
|
|
- for img in page_data:
|
|
|
- x0, y0, x1, y1 = img
|
|
|
+from magic_pdf.libs.commons import fitz # PyMuPDF
|
|
|
+
|
|
|
def draw_bbox(i, bbox_list, page, rgb_config):
    """Outline every bounding box stored for page index ``i`` on ``page``.

    Args:
        i: Index into ``bbox_list`` selecting the boxes to draw.
        bbox_list: Sequence of per-page box collections; each box unpacks
            into ``(x0, y0, x1, y1)``.
        page: Object whose ``draw_rect`` method is called once per box.
        rgb_config: Colour channels; each value is converted with ``float``
            and divided by 255 before being used as the stroke colour.
    """
    # Scale the channels once; the same colour is used for every rectangle.
    stroke = [float(channel) / 255 for channel in rgb_config]
    for x0, y0, x1, y1 in bbox_list[i]:
        outline = fitz.Rect(x0, y0, x1, y1)
        # Stroke only (no fill), drawn on top of the existing page content.
        page.draw_rect(outline, color=stroke, fill=None, width=0.5, overlay=True)
|
|
|
+
|
|
|
+
|
|
|
def draw_layout_bbox(pdf_info_dict, input_path, out_path):
    """Save a copy of a PDF with each page's layout boxes outlined and numbered.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts. Each value
            must provide ``'layout_bboxes'``, an iterable of dicts with a
            ``'layout_bbox'`` entry that unpacks into ``(x0, y0, x1, y1)``.
            Values are paired with PDF pages by iteration order.
        input_path: Path of the PDF opened with ``fitz.open``.
        out_path: Directory prefix; the result is written to
            ``{out_path}/layout.pdf``.

    Raises:
        IndexError: If the PDF has more pages than ``pdf_info_dict`` has
            values.
    """
    layout_bbox_list = [
        [layout['layout_bbox'] for layout in page['layout_bboxes']]
        for page in pdf_info_dict.values()
    ]

    doc = fitz.open(input_path)
    try:
        for i, page in enumerate(doc):
            # Boxes are labelled 1-based, in the order they appear in the data.
            for label, (x0, y0, x1, y1) in enumerate(layout_bbox_list[i], start=1):
                rect_coords = fitz.Rect(x0, y0, x1, y1)
                page.draw_rect(rect_coords, color=(1, 0, 0), fill=None, width=0.5, overlay=True)
                # Put the index at the top-left corner of the rectangle.
                page.insert_text((x0, y0), str(label), fontsize=10, color=(1, 0, 0))
        doc.save(f"{out_path}/layout.pdf")
    finally:
        # Release the underlying file even when drawing or saving fails.
        doc.close()
|
|
|
+
|
|
|
def draw_text_bbox(pdf_info_dict, input_path, out_path):
    """Save a copy of a PDF with span boxes outlined, colour-coded by span type.

    Text spans are drawn in red, inline equations in green and displayed
    equations in blue. Spans of any other type are ignored.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts. Each value
            must provide ``'preproc_blocks'``; every block has ``'lines'``,
            every line has ``'spans'``, and every span has ``'type'`` and
            (for the drawn types) ``'bbox'``. Values are paired with PDF
            pages by iteration order.
        input_path: Path of the PDF opened with ``fitz.open``.
        out_path: Directory prefix; the result is written to
            ``{out_path}/text.pdf``.

    Raises:
        IndexError: If the PDF has more pages than ``pdf_info_dict`` has
            values (raised from ``draw_bbox``).
    """
    text_list = []
    inline_equation_list = []
    displayed_equation_list = []
    for page in pdf_info_dict.values():
        # One bucket per span type that gets drawn; other types fall through.
        page_boxes = {'text': [], 'inline_equation': [], 'displayed_equation': []}
        for block in page['preproc_blocks']:
            for line in block['lines']:
                for span in line['spans']:
                    bucket = page_boxes.get(span['type'])
                    if bucket is not None:
                        bucket.append(span['bbox'])
        text_list.append(page_boxes['text'])
        inline_equation_list.append(page_boxes['inline_equation'])
        displayed_equation_list.append(page_boxes['displayed_equation'])

    doc = fitz.open(input_path)
    try:
        for i, page in enumerate(doc):
            draw_bbox(i, text_list, page, [255, 0, 0])
            draw_bbox(i, inline_equation_list, page, [0, 255, 0])
            draw_bbox(i, displayed_equation_list, page, [0, 0, 255])
        doc.save(f"{out_path}/text.pdf")
    finally:
        # Release the underlying file even when drawing or saving fails.
        doc.close()
|