1 年之前 · 94a7ba3d23
--- a/demo/draw_bbox.py
+++ b/demo/draw_bbox.py
@@ -1,7 +1,9 @@
 
				-from magic_pdf.libs.commons import fitz  # PyMuPDF
			
 
				+from pathlib import Path
			
 
				+
			
 
				+from magic_pdf.libs.commons import fitz, join_path  # PyMuPDF
			
 
				 from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
			
 
				 import json
			
 
				-
			
 
				+import os
			
 
				 
			
 
				 
			
 
				 
			
@@ -20,7 +22,19 @@ doc = fitz.open(pdf_path)  # Open the PDF
 
				 data = [[[-2, 0, 603, 80, 24]], [[-3, 0, 602, 80, 24]]]
			
 
				 ocr_json_file_path = r"D:\projects\Magic-PDF\ocr_demo\ocr_0.json"
			
 
				 ocr_pdf_info = read_json_file(ocr_json_file_path)
			
 
				-pdf_info_dict = parse_pdf_by_ocr(ocr_pdf_info)
			
 
				+
			
 
				+pth = Path(ocr_json_file_path)
			
 
				+book_name = pth.name
			
 
				+save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
			
 
				+save_path = join_path(save_tmp_path, "md")
			
 
				+
			
 
				+pdf_info_dict = parse_pdf_by_ocr(
			
 
				+            pdf_path,
			
 
				+            None,
			
 
				+            ocr_pdf_info,
			
 
				+            save_path,
			
 
				+            book_name,
			
 
				+            debug_mode=True)
			
 
				 data_list = []
			
 
				 for page in pdf_info_dict.values():
			
 
				     page_list = []
			
--- a/magic_pdf/pdf_parse_by_ocr.py
+++ b/magic_pdf/pdf_parse_by_ocr.py
@@ -4,7 +4,6 @@ import time
 
				 
			
 
				 from loguru import logger
			
 
				 
			
 
				-from magic_pdf.libs.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans, modify_y_axis
			
 
				 from magic_pdf.libs.commons import read_file, join_path, fitz, get_img_s3_client, get_delta_time, get_docx_model_output
			
 
				 from magic_pdf.libs.coordinate_transform import get_scale_ratio
			
 
				 from magic_pdf.libs.safe_filename import sanitize_filename
			
@@ -14,7 +13,7 @@ from magic_pdf.pre_proc.detect_header import parse_headers
 
				 from magic_pdf.pre_proc.detect_page_number import parse_pageNos
			
 
				 from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table
			
 
				 from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
			
 
				-from magic_pdf.pre_proc.ocr_dict_merge import remove_overlaps_min_spans, merge_spans_to_line_by_layout
			
 
				+from magic_pdf.pre_proc.ocr_dict_merge import remove_overlaps_min_spans, merge_spans_to_line_by_layout, modify_y_axis
			
 
				 from magic_pdf.pre_proc.ocr_remove_spans import remove_spans_by_bboxes
			
 
				 
			
 
				 
			
@@ -150,7 +149,7 @@ def parse_pdf_by_ocr(
 
				         spans = remove_overlaps_min_spans(spans)
			
 
				 
			
 
				         # 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
			
 
				-        # spans = modify_y_axis(spans)
			
 
				+        spans = modify_y_axis(spans)
			
 
				 
			
 
				         # 删除remove_span_block_bboxes中的bbox
			
 
				         spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
			
--- a/magic_pdf/pre_proc/ocr_dict_merge.py
+++ b/magic_pdf/pre_proc/ocr_dict_merge.py
@@ -113,15 +113,19 @@ def modify_y_axis(spans: list):
 
				     #用于给行间公式搜索
			
 
				     text_inline_lines = []
			
 
				     for span in spans[1:]:
			
 
				+        if span.get("content","") == "78.":
			
 
				+            print("debug")
			
 
				         # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
			
 
				         # image和table类型，同上
			
 
				         if span['type'] in ["displayed_equation", "image", "table"] or any(
			
 
				                 s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
			
 
				             #传入
			
 
				-            if spans[0]["type"] in ["displayed_equation", "image", "table"]:
			
 
				+            if span["type"] in ["displayed_equation", "image", "table"]:
			
 
				                 displayed_list.append(span)
			
 
				             # 则开始新行
			
 
				             lines.append(current_line)
			
 
				+            if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
			
 
				+                text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
			
 
				             current_line = [span]
			
 
				             line_first_y0 = span["bbox"][1]
			
 
				             line_first_y = span["bbox"][3]
			
@@ -140,15 +144,14 @@ def modify_y_axis(spans: list):
 
				             lines.append(current_line)
			
 
				             text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
			
 
				             current_line = [span]
			
 
				-            line_first_y0 = spans[0]["bbox"][1]
			
 
				-            line_first_y = spans[0]["bbox"][3]
			
 
				+            line_first_y0 = span["bbox"][1]
			
 
				+            line_first_y = span["bbox"][3]
			
 
				 
			
 
				         # 添加最后一行
			
 
				     if current_line:
			
 
				         lines.append(current_line)
			
 
				-        if len(current_line)>1 or current_line[0]["type"] in ["text", "inline_equation"]:
			
 
				+        if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
			
 
				             text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
			
 
				-
			
 
				     for line in text_inline_lines:
			
 
				         # 按照x0坐标排序
			
 
				         current_line = line[0]
			
@@ -164,14 +167,17 @@ def modify_y_axis(spans: list):
 
				     #错误行间公式转行内公式
			
 
				     j = 0
			
 
				     for i in range(len(displayed_list)):
			
 
				+        if i == 8:
			
 
				+            print("debug")
			
 
				         span = displayed_list[i]
			
 
				         span_y0, span_y = span["bbox"][1], span["bbox"][3]
			
 
				+
			
 
				         while j < len(text_inline_lines):
			
 
				             text_line = text_inline_lines[j]
			
 
				             y0, y1 = text_line[1]
			
 
				-            if span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1 and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
			
 
				+            if (span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1) and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
			
 
				                 span["bbox"][1] = y0
			
 
				-                span["bbox"][3] = y1
			
 
				+                # span["bbox"][3] = y1
			
 
				                 if span["type"] == "displayed_equation":
			
 
				                     span["type"] = "inline_equation"
			
 
				                 break