1 year ago · 83deab214e
--- a/demo/draw_bbox.py
+++ b/demo/draw_bbox.py
@@ -1,20 +1,45 @@
 
															 from magic_pdf.libs.commons import fitz  # PyMuPDF
														
 
															+from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
														
 
															+import json
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+def read_json_file(file_path):
														
 
															+    with open(file_path, 'r') as f:
														
 
															+        data = json.load(f)
														
 
															+    return data
														
 
															+
														
 
															 # PDF文件路径
														
 
															-pdf_path = "D:\\project\\20231108code-clean\\code-clean\\tmp\\unittest\\download-pdfs\\scihub\\scihub_53700000\\libgen.scimag53724000-53724999.zip_10.1097\\00129191-200509000-00018.pdf"
														
 
															+pdf_path = "D:\\projects\\Magic-PDF\\ocr_demo\\ocr_0_org.pdf"
														
 
															 doc = fitz.open(pdf_path)  # Open the PDF
														
 
															 # 你的数据
														
 
															 data = [[[-2, 0, 603, 80, 24]], [[-3, 0, 602, 80, 24]]]
														
 
															-
														
 
															+ocr_json_file_path = r"D:\projects\Magic-PDF\ocr_demo\ocr_0.json"
														
 
															+ocr_pdf_info = read_json_file(ocr_json_file_path)
														
 
															+pdf_info_dict = parse_pdf_by_ocr(ocr_pdf_info)
														
 
															+data_list = []
														
 
															+for page in pdf_info_dict.values():
														
 
															+    page_list = []
														
 
															+    blocks = page.get("preproc_blocks")
														
 
															+    for block in blocks:
														
 
															+        lines = block.get("lines")
														
 
															+        for line in lines:
														
 
															+            spans = line.get("spans")
														
 
															+            for span in spans:
														
 
															+                page_list.append(span["bbox"])
														
 
															+    data_list.append(page_list)
														
 
															 # 对每个页面进行处理
														
 
															 for i, page in enumerate(doc):
														
 
															     # 获取当前页面的数据
														
 
															-    page_data = data[i]
														
 
															+    page_data = data_list[i]
														
 
															     for img in page_data:
														
 
															-        x0, y0, x1, y1, _ = img
														
 
															+        x0, y0, x1, y1 = img
														
 
															         rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
														
 
															         page.draw_rect(rect_coords, color=(1, 0, 0), fill=None, width=1.5, overlay=True)  # Draw the rectangle
														
 
															 # Save the PDF
														
 
															-doc.save("D:\\project\\20231108code-clean\\code-clean\\tmp\\unittest\\download-pdfs\\scihub\\scihub_53700000\\libgen.scimag53724000-53724999.zip_10.1097\\00129191-200509000-00018_new.pdf")
														
 
															+doc.save("D:\\projects\\Magic-PDF\\ocr_demo\\ocr_0_new.pdf")
														
--- a/magic_pdf/libs/ocr_dict_merge.py
+++ b/magic_pdf/libs/ocr_dict_merge.py
@@ -74,6 +74,7 @@ def modify_y_axis(spans: list):
 
															     current_line = [spans[0]]
														
 
															     if spans[0]["type"] in ["displayed_equation", "image", "table"]:
														
 
															         displayed_list.append(spans[0])
														
 
															+
														
 
															     line_first_y0 = spans[0]["bbox"][1]
														
 
															     line_first_y = spans[0]["bbox"][3]
														
 
															     #用于给行间公式搜索
														
@@ -89,15 +90,16 @@ def modify_y_axis(spans: list):
 
															             # 则开始新行
														
 
															             lines.append(current_line)
														
 
															             current_line = [span]
														
 
															-            line_first_y0 = spans[0]["bbox"][1]
														
 
															-            line_first_y = spans[0]["bbox"][3]
														
 
															+            line_first_y0 = span["bbox"][1]
														
 
															+            line_first_y = span["bbox"][3]
														
 
															             continue
														
 
															         # 如果当前的span与当前行的最后一个span在y轴上重叠，则添加到当前行
														
 
															         if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
														
 
															-
														
 
															-            span["bbox"][1] = line_first_y0
														
 
															-            span["bbox"][3] = line_first_y
														
 
															+            if span["bbox"][1] < line_first_y0:
														
 
															+                line_first_y0 = span["bbox"][1]
														
 
															+            if span["bbox"][3] > line_first_y:
														
 
															+                line_first_y = span["bbox"][3]
														
 
															             current_line.append(span)
														
 
															         else:
														
@@ -111,18 +113,41 @@ def modify_y_axis(spans: list):
 
															         # 添加最后一行
														
 
															     if current_line:
														
 
															         lines.append(current_line)
														
 
															+        if len(current_line)>1 or current_line[0]["type"] in ["text", "inline_equation"]:
														
 
															+            text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
														
 
															     for line in text_inline_lines:
														
 
															         # 按照x0坐标排序
														
 
															-        line.sort(key=lambda span: span[0]['bbox'][0])
														
 
															-
														
 
															+        current_line = line[0]
														
 
															+        current_line.sort(key=lambda span: span['bbox'][0])
														
 
															+    #调整每一个文字行内bbox统一
														
 
															+    for line in text_inline_lines:
														
 
															+        current_line, (line_first_y0, line_first_y) = line
														
 
															+        for span in current_line:
														
 
															+            span["bbox"][1] = line_first_y0
														
 
															+            span["bbox"][3] = line_first_y
														
 
															     #错误行间公式转行内公式
														
 
															+    j = 0
														
 
															     for i in range(len(displayed_list)):
														
 
															         span = displayed_list[i]
														
 
															+        span_y0, span_y = span["bbox"][1], span["bbox"][3]
														
 
															+        while j < len(text_inline_lines):
														
 
															+            text_line = text_inline_lines[j]
														
 
															+            y0, y1 = text_line[1]
														
 
															+            if span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1 and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
														
 
															+                span["bbox"][1] = y0
														
 
															+                span["bbox"][3] = y1
														
 
															+                if span["type"] == "displayed_equation":
														
 
															+                    span["type"] = "inline_equation"
														
 
															+                break
														
 
															+            elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
														
 
															+                break
														
 
															+            else:
														
 
															+                j += 1
														
 
															-
														
 
															+    return spans
														
--- a/magic_pdf/pdf_parse_by_ocr.py
+++ b/magic_pdf/pdf_parse_by_ocr.py
@@ -1,6 +1,6 @@
 
															 from loguru import logger
														
 
															-from magic_pdf.libs.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans
														
 
															+from magic_pdf.libs.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans, modify_y_axis
														
 
															 def construct_page_component(page_id, blocks):
														
@@ -68,7 +68,7 @@ def parse_pdf_by_ocr(
 
															         spans = remove_overlaps_min_spans(spans)
														
 
															         # 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
														
 
															-
														
 
															+        #spans = modify_y_axis(spans)
														
 
															         # 将spans合并成line(从上到下,从左到右)
														
 
															         lines = merge_spans_to_line(spans)