1 year ago · 26c2378271
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -1,3 +1,6 @@
 
															+from magic_pdf.libs.ocr_content_type import ContentType
														
 
															+
														
 
															+
														
 
															 def mk_nlp_markdown(pdf_info_dict: dict):
														
 
															     markdown = []
														
@@ -12,9 +15,9 @@ def mk_nlp_markdown(pdf_info_dict: dict):
 
															                     if not span.get('content'):
														
 
															                         continue
														
 
															                     content = span['content'].replace('$', '\$')  # 转义$
														
 
															-                    if span['type'] == 'inline_equation':
														
 
															+                    if span['type'] == ContentType.InlineEquation:
														
 
															                         content = f"${content}$"
														
 
															-                    elif span['type'] == 'displayed_equation':
														
 
															+                    elif span['type'] == ContentType.InterlineEquation:
														
 
															                         content = f"$$\n{content}\n$$"
														
 
															                     line_text += content + ' '
														
 
															                 # 在行末添加两个空格以强制换行
														
@@ -41,9 +44,9 @@ def mk_mm_markdown(pdf_info_dict: dict):
 
															                             content = f"![]({span['image_path']})"
														
 
															                     else:
														
 
															                         content = span['content'].replace('$', '\$')  # 转义$
														
 
															-                        if span['type'] == 'inline_equation':
														
 
															+                        if span['type'] == ContentType.InlineEquation:
														
 
															                             content = f"${content}$"
														
 
															-                        elif span['type'] == 'displayed_equation':
														
 
															+                        elif span['type'] == ContentType.InterlineEquation:
														
 
															                             content = f"$$\n{content}\n$$"
														
 
															                     line_text += content + ' '
														
 
															                 # 在行末添加两个空格以强制换行
														
--- a/magic_pdf/libs/draw_bbox.py
+++ b/magic_pdf/libs/draw_bbox.py
@@ -1,4 +1,6 @@
 
															 from magic_pdf.libs.commons import fitz  # PyMuPDF
														
 
															+from magic_pdf.libs.ocr_content_type import ContentType
														
 
															+
														
 
															 def draw_bbox_without_number(i, bbox_list, page, rgb_config):
														
 
															     new_rgb = []
														
@@ -49,30 +51,30 @@ def draw_layout_bbox(pdf_info_dict, input_path, out_path):
 
															 def draw_text_bbox(pdf_info_dict, input_path, out_path):
														
 
															     text_list = []
														
 
															     inline_equation_list = []
														
 
															-    displayed_equation_list = []
														
 
															+    interline_equation_list = []
														
 
															     for page in pdf_info_dict.values():
														
 
															         page_text_list = []
														
 
															         page_inline_equation_list = []
														
 
															-        page_displayed_equation_list = []
														
 
															+        page_interline_equation_list = []
														
 
															         for block in page['preproc_blocks']:
														
 
															             for line in block['lines']:
														
 
															                 for span in line['spans']:
														
 
															-                    if span['type'] == 'text':
														
 
															+                    if span['type'] == ContentType.Text:
														
 
															                         page_text_list.append(span['bbox'])
														
 
															-                    elif span['type'] == 'inline_equation':
														
 
															+                    elif span['type'] == ContentType.InlineEquation:
														
 
															                         page_inline_equation_list.append(span['bbox'])
														
 
															-                    elif span['type'] == 'displayed_equation':
														
 
															-                        page_displayed_equation_list.append(span['bbox'])
														
 
															+                    elif span['type'] == ContentType.InterlineEquation:
														
 
															+                        page_interline_equation_list.append(span['bbox'])
														
 
															         text_list.append(page_text_list)
														
 
															         inline_equation_list.append(page_inline_equation_list)
														
 
															-        displayed_equation_list.append(page_displayed_equation_list)
														
 
															+        interline_equation_list.append(page_interline_equation_list)
														
 
															     doc = fitz.open(input_path)
														
 
															     for i, page in enumerate(doc):
														
 
															         # 获取当前页面的数据
														
 
															         draw_bbox_without_number(i, text_list, page, [255, 0, 0])
														
 
															         draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0])
														
 
															-        draw_bbox_without_number(i, displayed_equation_list, page, [0, 0, 255])
														
 
															+        draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255])
														
 
															     # Save the PDF
														
 
															     doc.save(f"{out_path}/text.pdf")
														
--- a/magic_pdf/libs/ocr_content_type.py
+++ b/magic_pdf/libs/ocr_content_type.py
@@ -0,0 +1,7 @@
 
															+class ContentType:
														
 
															+    Image = "image"
														
 
															+    Table = "table"
														
 
															+    Text = "text"
														
 
															+    InlineEquation = "inline_equation"
														
 
															+    InterlineEquation = "interline_equation"
														
 
															+
														
--- a/magic_pdf/pdf_parse_by_ocr.py
+++ b/magic_pdf/pdf_parse_by_ocr.py
@@ -14,6 +14,7 @@ from magic_pdf.libs.commons import (
 
															     get_docx_model_output,
														
 
															 )
														
 
															 from magic_pdf.libs.coordinate_transform import get_scale_ratio
														
 
															+from magic_pdf.libs.ocr_content_type import ContentType
														
 
															 from magic_pdf.libs.safe_filename import sanitize_filename
														
 
															 from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
														
 
															 from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
														
@@ -44,10 +45,10 @@ def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, lay
 
															         'tables': tables,
														
 
															         'interline_equations': interline_equations,
														
 
															         'inline_equations': inline_equations,
														
 
															-        'dropped_text_block': dropped_text_block,
														
 
															-        'dropped_image_block': dropped_image_block,
														
 
															-        'dropped_table_block': dropped_table_block,
														
 
															-        'dropped_bboxes': need_remove_spans_bboxes_dict,
														
 
															+        'droped_text_block': dropped_text_block,
														
 
															+        'droped_image_block': dropped_image_block,
														
 
															+        'droped_table_block': dropped_table_block,
														
 
															+        'droped_bboxes': need_remove_spans_bboxes_dict,
														
 
															     }
														
 
															     return return_dict
														
@@ -164,7 +165,7 @@ def parse_pdf_by_ocr(
 
															                 #  1: 'image', # 图片
														
 
															                 #  7: 'table',       # 表格
														
 
															                 #  13: 'inline_equation',     # 行内公式
														
 
															-                #  14: 'displayed_equation',      # 行间公式
														
 
															+                #  14: 'interline_equation',      # 行间公式
														
 
															                 #  15: 'text',      # ocr识别文本
														
 
															                 """layout信息"""
														
 
															                 #  11: 'full column',   # 单栏
														
@@ -173,20 +174,20 @@ def parse_pdf_by_ocr(
 
															                     "bbox": bbox,
														
 
															                 }
														
 
															                 if category_id == 1:
														
 
															-                    span["type"] = "image"
														
 
															+                    span["type"] = ContentType.Image
														
 
															                 elif category_id == 7:
														
 
															-                    span["type"] = "table"
														
 
															+                    span["type"] = ContentType.Table
														
 
															                 elif category_id == 13:
														
 
															                     span["content"] = layout_det["latex"]
														
 
															-                    span["type"] = "inline_equation"
														
 
															+                    span["type"] = ContentType.InlineEquation
														
 
															                 elif category_id == 14:
														
 
															                     span["content"] = layout_det["latex"]
														
 
															-                    span["type"] = "displayed_equation"
														
 
															+                    span["type"] = ContentType.InterlineEquation
														
 
															                 elif category_id == 15:
														
 
															                     span["content"] = layout_det["text"]
														
 
															-                    span["type"] = "text"
														
 
															+                    span["type"] = ContentType.Text
														
 
															                 # print(span)
														
 
															                 spans.append(span)
														
 
															             else:
														
@@ -213,7 +214,7 @@ def parse_pdf_by_ocr(
 
															         # bbox去除粘连
														
 
															         spans = remove_overlap_between_bbox(spans)
														
 
															-        # 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
														
 
															+        # 对type=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
														
 
															         spans = adjust_bbox_for_standalone_block(spans)
														
--- a/magic_pdf/pre_proc/ocr_cut_image.py
+++ b/magic_pdf/pre_proc/ocr_cut_image.py
@@ -1,4 +1,5 @@
 
															 from magic_pdf.libs.commons import join_path
														
 
															+from magic_pdf.libs.ocr_content_type import ContentType
														
 
															 from magic_pdf.libs.pdf_image_tools import cut_image
														
@@ -11,9 +12,9 @@ def cut_image_and_table(spans, page, page_id, book_name, save_path):
 
															     for span in spans:
														
 
															         span_type = span['type']
														
 
															-        if span_type == 'image':
														
 
															+        if span_type == ContentType.Image:
														
 
															             span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'))
														
 
															-        elif span_type == 'table':
														
 
															+        elif span_type == ContentType.Table:
														
 
															             span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'))
														
 
															     return spans
														
--- a/magic_pdf/pre_proc/ocr_dict_merge.py
+++ b/magic_pdf/pre_proc/ocr_dict_merge.py
@@ -2,6 +2,7 @@ from loguru import logger
 
															 from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
														
 
															     calculate_overlap_area_in_bbox1_area_ratio
														
 
															+from magic_pdf.libs.ocr_content_type import ContentType
														
 
															 # 将每一个line中的span从左到右排序
														
@@ -29,10 +30,10 @@ def merge_spans_to_line(spans):
 
															     lines = []
														
 
															     current_line = [spans[0]]
														
 
															     for span in spans[1:]:
														
 
															-        # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
														
 
															+        # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
														
 
															         # image和table类型，同上
														
 
															-        if span['type'] in ["displayed_equation", "image", "table"] or any(
														
 
															-                s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
														
 
															+        if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
														
 
															+                s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
														
 
															             # 则开始新行
														
 
															             lines.append(current_line)
														
 
															             current_line = [span]
														
--- a/magic_pdf/pre_proc/ocr_span_list_modify.py
+++ b/magic_pdf/pre_proc/ocr_span_list_modify.py
@@ -2,6 +2,7 @@ from loguru import logger
 
															 from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
														
 
															     __is_overlaps_y_exceeds_threshold
														
 
															+from magic_pdf.libs.ocr_content_type import ContentType
														
 
															 def remove_overlaps_min_spans(spans):
														
@@ -49,22 +50,22 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
 
															         for span in need_remove_spans:
														
 
															             spans.remove(span)
														
 
															             span['tag'] = drop_tag
														
 
															-            if span['type'] in ['text', 'inline_equation', 'displayed_equation']:
														
 
															+            if span['type'] in [ContentType.Text, ContentType.InlineEquation, ContentType.InterlineEquation]:
														
 
															                 dropped_text_block.append(span)
														
 
															-            elif span['type'] == 'image':
														
 
															+            elif span['type'] == ContentType.Image:
														
 
															                 dropped_image_block.append(span)
														
 
															-            elif span['type'] == 'table':
														
 
															+            elif span['type'] == ContentType.Table:
														
 
															                 dropped_table_block.append(span)
														
 
															     return spans, dropped_text_block, dropped_image_block, dropped_table_block
														
 
															 def adjust_bbox_for_standalone_block(spans):
														
 
															-    # 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
														
 
															+    # 对type=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
														
 
															     for sb_span in spans:
														
 
															-        if sb_span['type'] in ["displayed_equation", "image", "table"]:
														
 
															+        if sb_span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
														
 
															             for text_span in spans:
														
 
															-                if text_span['type'] in ['text', 'inline_equation']:
														
 
															+                if text_span['type'] in [ContentType.Text, ContentType.InlineEquation]:
														
 
															                     # 判断span2的纵向高度是否被span所覆盖
														
 
															                     if sb_span['bbox'][1] < text_span['bbox'][1] and sb_span['bbox'][3] > text_span['bbox'][3]:
														
 
															                         # 判断span2是否在span左边
														
@@ -81,7 +82,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
 
															     lines = []
														
 
															     current_line = [spans[0]]
														
 
															-    if spans[0]["type"] in ["displayed_equation", "image", "table"]:
														
 
															+    if spans[0]["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
														
 
															         displayed_list.append(spans[0])
														
 
															     line_first_y0 = spans[0]["bbox"][1]
														
@@ -91,16 +92,16 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
 
															     for span in spans[1:]:
														
 
															         # if span.get("content","") == "78.":
														
 
															         #     print("debug")
														
 
															-        # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
														
 
															+        # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
														
 
															         # image和table类型，同上
														
 
															-        if span['type'] in ["displayed_equation", "image", "table"] or any(
														
 
															-                s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
														
 
															+        if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
														
 
															+                s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
														
 
															             # 传入
														
 
															-            if span["type"] in ["displayed_equation", "image", "table"]:
														
 
															+            if span["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
														
 
															                 displayed_list.append(span)
														
 
															             # 则开始新行
														
 
															             lines.append(current_line)
														
 
															-            if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
														
 
															+            if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
														
 
															                 text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
														
 
															             current_line = [span]
														
 
															             line_first_y0 = span["bbox"][1]
														
@@ -125,7 +126,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
 
															         # 添加最后一行
														
 
															     if current_line:
														
 
															         lines.append(current_line)
														
 
															-        if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
														
 
															+        if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
														
 
															             text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
														
 
															     for line in text_inline_lines:
														
 
															         # 按照x0坐标排序
														
@@ -159,10 +160,10 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
 
															                     span['bbox'], (0, y0, 0, y1)):
														
 
															                 # 调整公式类型
														
 
															-                if span["type"] == "displayed_equation":
														
 
															+                if span["type"] == ContentType.InterlineEquation:
														
 
															                     # 最后一行是行间公式
														
 
															                     if j + 1 >= len(text_inline_lines):
														
 
															-                        span["type"] = "inline_equation"
														
 
															+                        span["type"] = ContentType.InlineEquation
														
 
															                         span["bbox"][1] = y0
														
 
															                         span["bbox"][3] = y1
														
 
															                     else:
														
@@ -170,7 +171,7 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
 
															                         y0_next, y1_next = text_inline_lines[j + 1][1]
														
 
															                         if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3 * (
														
 
															                                 y1 - y0) > span_y - span_y0:
														
 
															-                            span["type"] = "inline_equation"
														
 
															+                            span["type"] = ContentType.InlineEquation
														
 
															                             span["bbox"][1] = y0
														
 
															                             span["bbox"][3] = y1
														
 
															                 break
														
@@ -193,13 +194,13 @@ def get_qa_need_list(blocks):
 
															     for block in blocks:
														
 
															         for line in block["lines"]:
														
 
															             for span in line["spans"]:
														
 
															-                if span["type"] == "image":
														
 
															+                if span["type"] == ContentType.Image:
														
 
															                     images.append(span)
														
 
															-                elif span["type"] == "table":
														
 
															+                elif span["type"] == ContentType.Table:
														
 
															                     tables.append(span)
														
 
															-                elif span["type"] == "inline_equation":
														
 
															+                elif span["type"] == ContentType.InlineEquation:
														
 
															                     inline_equations.append(span)
														
 
															-                elif span["type"] == "displayed_equation":
														
 
															+                elif span["type"] == ContentType.InterlineEquation:
														
 
															                     interline_equations.append(span)
														
 
															                 else:
														
 
															                     continue