liukaiwen 1 year ago
parent
commit
83deab214e
3 changed files with 65 additions and 15 deletions
  1. 30 5
      demo/draw_bbox.py
  2. 33 8
      magic_pdf/libs/ocr_dict_merge.py
  3. 2 2
      magic_pdf/pdf_parse_by_ocr.py

+ 30 - 5
demo/draw_bbox.py

@@ -1,20 +1,45 @@
 from magic_pdf.libs.commons import fitz  # PyMuPDF
 from magic_pdf.libs.commons import fitz  # PyMuPDF
+from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
+import json
+
+
+
+
+
+def read_json_file(file_path):
+    with open(file_path, 'r') as f:
+        data = json.load(f)
+    return data
+
 
 
 # PDF文件路径
 # PDF文件路径
-pdf_path = "D:\\project\\20231108code-clean\\code-clean\\tmp\\unittest\\download-pdfs\\scihub\\scihub_53700000\\libgen.scimag53724000-53724999.zip_10.1097\\00129191-200509000-00018.pdf"
+pdf_path = "D:\\projects\\Magic-PDF\\ocr_demo\\ocr_0_org.pdf"
 
 
 doc = fitz.open(pdf_path)  # Open the PDF
 doc = fitz.open(pdf_path)  # Open the PDF
 # 你的数据
 # 你的数据
 data = [[[-2, 0, 603, 80, 24]], [[-3, 0, 602, 80, 24]]]
 data = [[[-2, 0, 603, 80, 24]], [[-3, 0, 602, 80, 24]]]
-
+ocr_json_file_path = r"D:\projects\Magic-PDF\ocr_demo\ocr_0.json"
+ocr_pdf_info = read_json_file(ocr_json_file_path)
+pdf_info_dict = parse_pdf_by_ocr(ocr_pdf_info)
+data_list = []
+for page in pdf_info_dict.values():
+    page_list = []
+    blocks = page.get("preproc_blocks")
+    for block in blocks:
+        lines = block.get("lines")
+        for line in lines:
+            spans = line.get("spans")
+            for span in spans:
+                page_list.append(span["bbox"])
+    data_list.append(page_list)
 # 对每个页面进行处理
 # 对每个页面进行处理
 for i, page in enumerate(doc):
 for i, page in enumerate(doc):
     # 获取当前页面的数据
     # 获取当前页面的数据
-    page_data = data[i]
+    page_data = data_list[i]
     for img in page_data:
     for img in page_data:
-        x0, y0, x1, y1, _ = img
+        x0, y0, x1, y1 = img
         rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
         rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
         page.draw_rect(rect_coords, color=(1, 0, 0), fill=None, width=1.5, overlay=True)  # Draw the rectangle
         page.draw_rect(rect_coords, color=(1, 0, 0), fill=None, width=1.5, overlay=True)  # Draw the rectangle
 
 
 # Save the PDF
 # Save the PDF
-doc.save("D:\\project\\20231108code-clean\\code-clean\\tmp\\unittest\\download-pdfs\\scihub\\scihub_53700000\\libgen.scimag53724000-53724999.zip_10.1097\\00129191-200509000-00018_new.pdf")
+doc.save("D:\\projects\\Magic-PDF\\ocr_demo\\ocr_0_new.pdf")

+ 33 - 8
magic_pdf/libs/ocr_dict_merge.py

@@ -74,6 +74,7 @@ def modify_y_axis(spans: list):
     current_line = [spans[0]]
     current_line = [spans[0]]
     if spans[0]["type"] in ["displayed_equation", "image", "table"]:
     if spans[0]["type"] in ["displayed_equation", "image", "table"]:
         displayed_list.append(spans[0])
         displayed_list.append(spans[0])
+
     line_first_y0 = spans[0]["bbox"][1]
     line_first_y0 = spans[0]["bbox"][1]
     line_first_y = spans[0]["bbox"][3]
     line_first_y = spans[0]["bbox"][3]
     #用于给行间公式搜索
     #用于给行间公式搜索
@@ -89,15 +90,16 @@ def modify_y_axis(spans: list):
             # 则开始新行
             # 则开始新行
             lines.append(current_line)
             lines.append(current_line)
             current_line = [span]
             current_line = [span]
-            line_first_y0 = spans[0]["bbox"][1]
-            line_first_y = spans[0]["bbox"][3]
+            line_first_y0 = span["bbox"][1]
+            line_first_y = span["bbox"][3]
             continue
             continue
 
 
         # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
         # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
         if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
         if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
-
-            span["bbox"][1] = line_first_y0
-            span["bbox"][3] = line_first_y
+            if span["bbox"][1] < line_first_y0:
+                line_first_y0 = span["bbox"][1]
+            if span["bbox"][3] > line_first_y:
+                line_first_y = span["bbox"][3]
             current_line.append(span)
             current_line.append(span)
 
 
         else:
         else:
@@ -111,18 +113,41 @@ def modify_y_axis(spans: list):
         # 添加最后一行
         # 添加最后一行
     if current_line:
     if current_line:
         lines.append(current_line)
         lines.append(current_line)
+        if len(current_line)>1 or current_line[0]["type"] in ["text", "inline_equation"]:
+            text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
 
 
     for line in text_inline_lines:
     for line in text_inline_lines:
         # 按照x0坐标排序
         # 按照x0坐标排序
-        line.sort(key=lambda span: span[0]['bbox'][0])
-
+        current_line = line[0]
+        current_line.sort(key=lambda span: span['bbox'][0])
 
 
 
 
+    #调整每一个文字行内bbox统一
+    for line in text_inline_lines:
+        current_line, (line_first_y0, line_first_y) = line
+        for span in current_line:
+            span["bbox"][1] = line_first_y0
+            span["bbox"][3] = line_first_y
     #错误行间公式转行内公式
     #错误行间公式转行内公式
+    j = 0
     for i in range(len(displayed_list)):
     for i in range(len(displayed_list)):
         span = displayed_list[i]
         span = displayed_list[i]
+        span_y0, span_y = span["bbox"][1], span["bbox"][3]
+        while j < len(text_inline_lines):
+            text_line = text_inline_lines[j]
+            y0, y1 = text_line[1]
+            if span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1 and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
+                span["bbox"][1] = y0
+                span["bbox"][3] = y1
+                if span["type"] == "displayed_equation":
+                    span["type"] = "inline_equation"
+                break
+            elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
+                break
+            else:
+                j += 1
 
 
-
+    return spans
 
 
 
 
 
 

+ 2 - 2
magic_pdf/pdf_parse_by_ocr.py

@@ -1,6 +1,6 @@
 from loguru import logger
 from loguru import logger
 
 
-from magic_pdf.libs.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans
+from magic_pdf.libs.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans, modify_y_axis
 
 
 
 
 def construct_page_component(page_id, blocks):
 def construct_page_component(page_id, blocks):
@@ -68,7 +68,7 @@ def parse_pdf_by_ocr(
         spans = remove_overlaps_min_spans(spans)
         spans = remove_overlaps_min_spans(spans)
 
 
         # 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
         # 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
-
+        #spans = modify_y_axis(spans)
 
 
         # 将spans合并成line(从上到下,从左到右)
         # 将spans合并成line(从上到下,从左到右)
         lines = merge_spans_to_line(spans)
         lines = merge_spans_to_line(spans)