liukaiwen 1 жил өмнө
parent
commit
f9f36c10cf

+ 3 - 3
demo/draw_bbox.py

@@ -15,12 +15,12 @@ def read_json_file(file_path):
 
 
 # PDF文件路径
-pdf_path = "D:\\projects\\Magic-PDF\\ocr_demo\\ocr_0_org.pdf"
+pdf_path = "D:\\projects\\Magic-PDF\\ocr_demo\\ocr_1_org.pdf"
 
 doc = fitz.open(pdf_path)  # Open the PDF
 # 你的数据
 data = [[[-2, 0, 603, 80, 24]], [[-3, 0, 602, 80, 24]]]
-ocr_json_file_path = r"D:\projects\Magic-PDF\ocr_demo\ocr_0.json"
+ocr_json_file_path = r"D:\projects\Magic-PDF\ocr_demo\ocr_1.json"
 ocr_pdf_info = read_json_file(ocr_json_file_path)
 
 pth = Path(ocr_json_file_path)
@@ -56,4 +56,4 @@ for i, page in enumerate(doc):
         page.draw_rect(rect_coords, color=(1, 0, 0), fill=None, width=1.5, overlay=True)  # Draw the rectangle
 
 # Save the PDF
-doc.save("D:\\projects\\Magic-PDF\\ocr_demo\\ocr_0_new1.pdf")
+doc.save("D:\\projects\\Magic-PDF\\ocr_demo\\ocr_1_new2.pdf")

+ 1 - 1
magic_pdf/pre_proc/ocr_dict_merge.py

@@ -9,7 +9,7 @@ def remove_overlaps_min_spans(spans):
     for span1 in spans.copy():
         for span2 in spans.copy():
             if span1 != span2:
-                overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.8)
+                overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.5)
                 if overlap_box is not None:
                     bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
                     if bbox_to_remove is not None: