Browse Source

ocr模式下content type 抽象

赵小蒙 1 year ago
parent
commit
26c2378271

+ 7 - 4
magic_pdf/dict2md/ocr_mkcontent.py

@@ -1,3 +1,6 @@
+from magic_pdf.libs.ocr_content_type import ContentType
+
+
 def mk_nlp_markdown(pdf_info_dict: dict):
 def mk_nlp_markdown(pdf_info_dict: dict):
     markdown = []
     markdown = []
 
 
@@ -12,9 +15,9 @@ def mk_nlp_markdown(pdf_info_dict: dict):
                     if not span.get('content'):
                     if not span.get('content'):
                         continue
                         continue
                     content = span['content'].replace('$', '\$')  # 转义$
                     content = span['content'].replace('$', '\$')  # 转义$
-                    if span['type'] == 'inline_equation':
+                    if span['type'] == ContentType.InlineEquation:
                         content = f"${content}$"
                         content = f"${content}$"
-                    elif span['type'] == 'displayed_equation':
+                    elif span['type'] == ContentType.InterlineEquation:
                         content = f"$$\n{content}\n$$"
                         content = f"$$\n{content}\n$$"
                     line_text += content + ' '
                     line_text += content + ' '
                 # 在行末添加两个空格以强制换行
                 # 在行末添加两个空格以强制换行
@@ -41,9 +44,9 @@ def mk_mm_markdown(pdf_info_dict: dict):
                             content = f"![]({span['image_path']})"
                             content = f"![]({span['image_path']})"
                     else:
                     else:
                         content = span['content'].replace('$', '\$')  # 转义$
                         content = span['content'].replace('$', '\$')  # 转义$
-                        if span['type'] == 'inline_equation':
+                        if span['type'] == ContentType.InlineEquation:
                             content = f"${content}$"
                             content = f"${content}$"
-                        elif span['type'] == 'displayed_equation':
+                        elif span['type'] == ContentType.InterlineEquation:
                             content = f"$$\n{content}\n$$"
                             content = f"$$\n{content}\n$$"
                     line_text += content + ' '
                     line_text += content + ' '
                 # 在行末添加两个空格以强制换行
                 # 在行末添加两个空格以强制换行

+ 10 - 8
magic_pdf/libs/draw_bbox.py

@@ -1,4 +1,6 @@
 from magic_pdf.libs.commons import fitz  # PyMuPDF
 from magic_pdf.libs.commons import fitz  # PyMuPDF
+from magic_pdf.libs.ocr_content_type import ContentType
+
 
 
 def draw_bbox_without_number(i, bbox_list, page, rgb_config):
 def draw_bbox_without_number(i, bbox_list, page, rgb_config):
     new_rgb = []
     new_rgb = []
@@ -49,30 +51,30 @@ def draw_layout_bbox(pdf_info_dict, input_path, out_path):
 def draw_text_bbox(pdf_info_dict, input_path, out_path):
 def draw_text_bbox(pdf_info_dict, input_path, out_path):
     text_list = []
     text_list = []
     inline_equation_list = []
     inline_equation_list = []
-    displayed_equation_list = []
+    interline_equation_list = []
     for page in pdf_info_dict.values():
     for page in pdf_info_dict.values():
         page_text_list = []
         page_text_list = []
         page_inline_equation_list = []
         page_inline_equation_list = []
-        page_displayed_equation_list = []
+        page_interline_equation_list = []
         for block in page['preproc_blocks']:
         for block in page['preproc_blocks']:
             for line in block['lines']:
             for line in block['lines']:
                 for span in line['spans']:
                 for span in line['spans']:
-                    if span['type'] == 'text':
+                    if span['type'] == ContentType.Text:
                         page_text_list.append(span['bbox'])
                         page_text_list.append(span['bbox'])
-                    elif span['type'] == 'inline_equation':
+                    elif span['type'] == ContentType.InlineEquation:
                         page_inline_equation_list.append(span['bbox'])
                         page_inline_equation_list.append(span['bbox'])
-                    elif span['type'] == 'displayed_equation':
-                        page_displayed_equation_list.append(span['bbox'])
+                    elif span['type'] == ContentType.InterlineEquation:
+                        page_interline_equation_list.append(span['bbox'])
         text_list.append(page_text_list)
         text_list.append(page_text_list)
         inline_equation_list.append(page_inline_equation_list)
         inline_equation_list.append(page_inline_equation_list)
-        displayed_equation_list.append(page_displayed_equation_list)
+        interline_equation_list.append(page_interline_equation_list)
 
 
     doc = fitz.open(input_path)
     doc = fitz.open(input_path)
     for i, page in enumerate(doc):
     for i, page in enumerate(doc):
         # 获取当前页面的数据
         # 获取当前页面的数据
         draw_bbox_without_number(i, text_list, page, [255, 0, 0])
         draw_bbox_without_number(i, text_list, page, [255, 0, 0])
         draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0])
         draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0])
-        draw_bbox_without_number(i, displayed_equation_list, page, [0, 0, 255])
+        draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255])
 
 
     # Save the PDF
     # Save the PDF
     doc.save(f"{out_path}/text.pdf")
     doc.save(f"{out_path}/text.pdf")

+ 7 - 0
magic_pdf/libs/ocr_content_type.py

@@ -0,0 +1,7 @@
+class ContentType:
+    Image = "image"
+    Table = "table"
+    Text = "text"
+    InlineEquation = "inline_equation"
+    InterlineEquation = "interline_equation"
+

+ 12 - 11
magic_pdf/pdf_parse_by_ocr.py

@@ -14,6 +14,7 @@ from magic_pdf.libs.commons import (
     get_docx_model_output,
     get_docx_model_output,
 )
 )
 from magic_pdf.libs.coordinate_transform import get_scale_ratio
 from magic_pdf.libs.coordinate_transform import get_scale_ratio
+from magic_pdf.libs.ocr_content_type import ContentType
 from magic_pdf.libs.safe_filename import sanitize_filename
 from magic_pdf.libs.safe_filename import sanitize_filename
 from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
 from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
 from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
 from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
@@ -44,10 +45,10 @@ def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, lay
         'tables': tables,
         'tables': tables,
         'interline_equations': interline_equations,
         'interline_equations': interline_equations,
         'inline_equations': inline_equations,
         'inline_equations': inline_equations,
-        'dropped_text_block': dropped_text_block,
-        'dropped_image_block': dropped_image_block,
-        'dropped_table_block': dropped_table_block,
-        'dropped_bboxes': need_remove_spans_bboxes_dict,
+        'droped_text_block': dropped_text_block,
+        'droped_image_block': dropped_image_block,
+        'droped_table_block': dropped_table_block,
+        'droped_bboxes': need_remove_spans_bboxes_dict,
     }
     }
     return return_dict
     return return_dict
 
 
@@ -164,7 +165,7 @@ def parse_pdf_by_ocr(
                 #  1: 'image', # 图片
                 #  1: 'image', # 图片
                 #  7: 'table',       # 表格
                 #  7: 'table',       # 表格
                 #  13: 'inline_equation',     # 行内公式
                 #  13: 'inline_equation',     # 行内公式
-                #  14: 'displayed_equation',      # 行间公式
+                #  14: 'interline_equation',      # 行间公式
                 #  15: 'text',      # ocr识别文本
                 #  15: 'text',      # ocr识别文本
                 """layout信息"""
                 """layout信息"""
                 #  11: 'full column',   # 单栏
                 #  11: 'full column',   # 单栏
@@ -173,20 +174,20 @@ def parse_pdf_by_ocr(
                     "bbox": bbox,
                     "bbox": bbox,
                 }
                 }
                 if category_id == 1:
                 if category_id == 1:
-                    span["type"] = "image"
+                    span["type"] = ContentType.Image
 
 
                 elif category_id == 7:
                 elif category_id == 7:
-                    span["type"] = "table"
+                    span["type"] = ContentType.Table
 
 
                 elif category_id == 13:
                 elif category_id == 13:
                     span["content"] = layout_det["latex"]
                     span["content"] = layout_det["latex"]
-                    span["type"] = "inline_equation"
+                    span["type"] = ContentType.InlineEquation
                 elif category_id == 14:
                 elif category_id == 14:
                     span["content"] = layout_det["latex"]
                     span["content"] = layout_det["latex"]
-                    span["type"] = "displayed_equation"
+                    span["type"] = ContentType.InterlineEquation
                 elif category_id == 15:
                 elif category_id == 15:
                     span["content"] = layout_det["text"]
                     span["content"] = layout_det["text"]
-                    span["type"] = "text"
+                    span["type"] = ContentType.Text
                 # print(span)
                 # print(span)
                 spans.append(span)
                 spans.append(span)
             else:
             else:
@@ -213,7 +214,7 @@ def parse_pdf_by_ocr(
         # bbox去除粘连
         # bbox去除粘连
         spans = remove_overlap_between_bbox(spans)
         spans = remove_overlap_between_bbox(spans)
 
 
-        # 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
+        # 对type=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
         spans = adjust_bbox_for_standalone_block(spans)
         spans = adjust_bbox_for_standalone_block(spans)
 
 
 
 

+ 3 - 2
magic_pdf/pre_proc/ocr_cut_image.py

@@ -1,4 +1,5 @@
 from magic_pdf.libs.commons import join_path
 from magic_pdf.libs.commons import join_path
+from magic_pdf.libs.ocr_content_type import ContentType
 from magic_pdf.libs.pdf_image_tools import cut_image
 from magic_pdf.libs.pdf_image_tools import cut_image
 
 
 
 
@@ -11,9 +12,9 @@ def cut_image_and_table(spans, page, page_id, book_name, save_path):
 
 
     for span in spans:
     for span in spans:
         span_type = span['type']
         span_type = span['type']
-        if span_type == 'image':
+        if span_type == ContentType.Image:
             span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'))
             span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'))
-        elif span_type == 'table':
+        elif span_type == ContentType.Table:
             span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'))
             span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'))
 
 
     return spans
     return spans

+ 4 - 3
magic_pdf/pre_proc/ocr_dict_merge.py

@@ -2,6 +2,7 @@ from loguru import logger
 
 
 from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
 from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
     calculate_overlap_area_in_bbox1_area_ratio
     calculate_overlap_area_in_bbox1_area_ratio
+from magic_pdf.libs.ocr_content_type import ContentType
 
 
 
 
 # 将每一个line中的span从左到右排序
 # 将每一个line中的span从左到右排序
@@ -29,10 +30,10 @@ def merge_spans_to_line(spans):
     lines = []
     lines = []
     current_line = [spans[0]]
     current_line = [spans[0]]
     for span in spans[1:]:
     for span in spans[1:]:
-        # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
+        # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
         # image和table类型,同上
         # image和table类型,同上
-        if span['type'] in ["displayed_equation", "image", "table"] or any(
-                s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
+        if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
+                s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
             # 则开始新行
             # 则开始新行
             lines.append(current_line)
             lines.append(current_line)
             current_line = [span]
             current_line = [span]

+ 21 - 20
magic_pdf/pre_proc/ocr_span_list_modify.py

@@ -2,6 +2,7 @@ from loguru import logger
 
 
 from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
 from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
     __is_overlaps_y_exceeds_threshold
     __is_overlaps_y_exceeds_threshold
+from magic_pdf.libs.ocr_content_type import ContentType
 
 
 
 
 def remove_overlaps_min_spans(spans):
 def remove_overlaps_min_spans(spans):
@@ -49,22 +50,22 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
         for span in need_remove_spans:
         for span in need_remove_spans:
             spans.remove(span)
             spans.remove(span)
             span['tag'] = drop_tag
             span['tag'] = drop_tag
-            if span['type'] in ['text', 'inline_equation', 'displayed_equation']:
+            if span['type'] in [ContentType.Text, ContentType.InlineEquation, ContentType.InterlineEquation]:
                 dropped_text_block.append(span)
                 dropped_text_block.append(span)
-            elif span['type'] == 'image':
+            elif span['type'] == ContentType.Image:
                 dropped_image_block.append(span)
                 dropped_image_block.append(span)
-            elif span['type'] == 'table':
+            elif span['type'] == ContentType.Table:
                 dropped_table_block.append(span)
                 dropped_table_block.append(span)
 
 
     return spans, dropped_text_block, dropped_image_block, dropped_table_block
     return spans, dropped_text_block, dropped_image_block, dropped_table_block
 
 
 
 
 def adjust_bbox_for_standalone_block(spans):
 def adjust_bbox_for_standalone_block(spans):
-    # 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
+    # 对type=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
     for sb_span in spans:
     for sb_span in spans:
-        if sb_span['type'] in ["displayed_equation", "image", "table"]:
+        if sb_span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
             for text_span in spans:
             for text_span in spans:
-                if text_span['type'] in ['text', 'inline_equation']:
+                if text_span['type'] in [ContentType.Text, ContentType.InlineEquation]:
                     # 判断span2的纵向高度是否被span所覆盖
                     # 判断span2的纵向高度是否被span所覆盖
                     if sb_span['bbox'][1] < text_span['bbox'][1] and sb_span['bbox'][3] > text_span['bbox'][3]:
                     if sb_span['bbox'][1] < text_span['bbox'][1] and sb_span['bbox'][3] > text_span['bbox'][3]:
                         # 判断span2是否在span左边
                         # 判断span2是否在span左边
@@ -81,7 +82,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
 
 
     lines = []
     lines = []
     current_line = [spans[0]]
     current_line = [spans[0]]
-    if spans[0]["type"] in ["displayed_equation", "image", "table"]:
+    if spans[0]["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
         displayed_list.append(spans[0])
         displayed_list.append(spans[0])
 
 
     line_first_y0 = spans[0]["bbox"][1]
     line_first_y0 = spans[0]["bbox"][1]
@@ -91,16 +92,16 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
     for span in spans[1:]:
     for span in spans[1:]:
         # if span.get("content","") == "78.":
         # if span.get("content","") == "78.":
         #     print("debug")
         #     print("debug")
-        # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
+        # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
         # image和table类型,同上
         # image和table类型,同上
-        if span['type'] in ["displayed_equation", "image", "table"] or any(
-                s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
+        if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
+                s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
             # 传入
             # 传入
-            if span["type"] in ["displayed_equation", "image", "table"]:
+            if span["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
                 displayed_list.append(span)
                 displayed_list.append(span)
             # 则开始新行
             # 则开始新行
             lines.append(current_line)
             lines.append(current_line)
-            if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
+            if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
                 text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
                 text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
             current_line = [span]
             current_line = [span]
             line_first_y0 = span["bbox"][1]
             line_first_y0 = span["bbox"][1]
@@ -125,7 +126,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
         # 添加最后一行
         # 添加最后一行
     if current_line:
     if current_line:
         lines.append(current_line)
         lines.append(current_line)
-        if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
+        if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
             text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
             text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
     for line in text_inline_lines:
     for line in text_inline_lines:
         # 按照x0坐标排序
         # 按照x0坐标排序
@@ -159,10 +160,10 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
                     span['bbox'], (0, y0, 0, y1)):
                     span['bbox'], (0, y0, 0, y1)):
 
 
                 # 调整公式类型
                 # 调整公式类型
-                if span["type"] == "displayed_equation":
+                if span["type"] == ContentType.InterlineEquation:
                     # 最后一行是行间公式
                     # 最后一行是行间公式
                     if j + 1 >= len(text_inline_lines):
                     if j + 1 >= len(text_inline_lines):
-                        span["type"] = "inline_equation"
+                        span["type"] = ContentType.InlineEquation
                         span["bbox"][1] = y0
                         span["bbox"][1] = y0
                         span["bbox"][3] = y1
                         span["bbox"][3] = y1
                     else:
                     else:
@@ -170,7 +171,7 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
                         y0_next, y1_next = text_inline_lines[j + 1][1]
                         y0_next, y1_next = text_inline_lines[j + 1][1]
                         if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3 * (
                         if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3 * (
                                 y1 - y0) > span_y - span_y0:
                                 y1 - y0) > span_y - span_y0:
-                            span["type"] = "inline_equation"
+                            span["type"] = ContentType.InlineEquation
                             span["bbox"][1] = y0
                             span["bbox"][1] = y0
                             span["bbox"][3] = y1
                             span["bbox"][3] = y1
                 break
                 break
@@ -193,13 +194,13 @@ def get_qa_need_list(blocks):
     for block in blocks:
     for block in blocks:
         for line in block["lines"]:
         for line in block["lines"]:
             for span in line["spans"]:
             for span in line["spans"]:
-                if span["type"] == "image":
+                if span["type"] == ContentType.Image:
                     images.append(span)
                     images.append(span)
-                elif span["type"] == "table":
+                elif span["type"] == ContentType.Table:
                     tables.append(span)
                     tables.append(span)
-                elif span["type"] == "inline_equation":
+                elif span["type"] == ContentType.InlineEquation:
                     inline_equations.append(span)
                     inline_equations.append(span)
-                elif span["type"] == "displayed_equation":
+                elif span["type"] == ContentType.InterlineEquation:
                     interline_equations.append(span)
                     interline_equations.append(span)
                 else:
                 else:
                     continue
                     continue