1 year ago · 85587b257b
--- a/magic_pdf/pdf_parse_by_ocr.py
+++ b/magic_pdf/pdf_parse_by_ocr.py
@@ -22,20 +22,30 @@ from magic_pdf.pre_proc.detect_page_number import parse_pageNos
 
				 from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table
			
 
				 from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
			
 
				 from magic_pdf.pre_proc.ocr_dict_merge import (
			
 
				-    merge_spans_to_line_by_layout,
			
 
				+    merge_spans_to_line_by_layout, merge_lines_to_block,
			
 
				 )
			
 
				 from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remove_overlaps_min_spans, \
			
 
				-    adjust_bbox_for_standalone_block,modify_y_axis,modify_inline_equation
			
 
				+    adjust_bbox_for_standalone_block, modify_y_axis, modify_inline_equation, get_qa_need_list, \
			
 
				+    remove_spans_by_bboxes_dict
			
 
				 from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
			
 
				 
			
 
				 
			
 
				def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
				                             images, tables, interline_equations, inline_equations,
				                             dropped_text_block, dropped_image_block, dropped_table_block):
				    """Bundle the per-page parsing results into one page-info dict.

				    Every argument is stored by reference under a fixed key; nothing is
				    copied or validated. ``page_w`` and ``page_h`` are combined into the
				    two-element list stored under ``'page_size'``.

				    Returns:
				        dict with keys ``preproc_blocks``, ``layout_bboxes``, ``page_idx``,
				        ``page_size``, ``_layout_tree``, ``images``, ``tables``,
				        ``interline_equations``, ``inline_equations``,
				        ``dropped_text_block``, ``dropped_image_block`` and
				        ``dropped_table_block`` (in that insertion order).
				    """
				    # Core layout information for the page.
				    page_info = {
				        'preproc_blocks': blocks,
				        'layout_bboxes': layout_bboxes,
				        'page_idx': page_id,
				        'page_size': [page_w, page_h],
				        '_layout_tree': layout_tree,
				    }
				    # Extra lists exposed alongside the blocks (extracted and dropped spans).
				    page_info.update({
				        'images': images,
				        'tables': tables,
				        'interline_equations': interline_equations,
				        'inline_equations': inline_equations,
				        'dropped_text_block': dropped_text_block,
				        'dropped_image_block': dropped_image_block,
				        'dropped_table_block': dropped_table_block,
				    })
				    return page_info
			
 
				 
			
@@ -79,7 +89,6 @@ def parse_pdf_by_ocr(
 
				 
			
 
				     start_time = time.time()
			
 
				 
			
 
				-    remove_bboxes = []
			
 
				 
			
 
				     end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
			
 
				     for page_id in range(start_page_id, end_page_id + 1):
			
@@ -111,11 +120,19 @@ def parse_pdf_by_ocr(
 
				         )
			
 
				 
			
 
				         # 构建需要remove的bbox列表
			
 
				-        need_remove_spans_bboxes = []
			
 
				-        need_remove_spans_bboxes.extend(page_no_bboxes)
			
 
				-        need_remove_spans_bboxes.extend(header_bboxes)
			
 
				-        need_remove_spans_bboxes.extend(footer_bboxes)
			
 
				-        need_remove_spans_bboxes.extend(footnote_bboxes)
			
 
				+        # need_remove_spans_bboxes = []
			
 
				+        # need_remove_spans_bboxes.extend(page_no_bboxes)
			
 
				+        # need_remove_spans_bboxes.extend(header_bboxes)
			
 
				+        # need_remove_spans_bboxes.extend(footer_bboxes)
			
 
				+        # need_remove_spans_bboxes.extend(footnote_bboxes)
			
 
				+
			
 
				+        # 构建需要remove的bbox字典
			
 
				+        need_remove_spans_bboxes_dict = {
			
 
				+            "page_no": page_no_bboxes,
			
 
				+            "header": header_bboxes,
			
 
				+            "footer": footer_bboxes,
			
 
				+            "footnote": footnote_bboxes,
			
 
				+        }
			
 
				 
			
 
				         layout_dets = ocr_page_info["layout_dets"]
			
 
				         spans = []
			
@@ -177,7 +194,9 @@ def parse_pdf_by_ocr(
 
				         spans = remove_overlaps_min_spans(spans)
			
 
				 
			
 
				         # 删除remove_span_block_bboxes中的bbox
			
 
				-        spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
			
 
				+        # spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
			
 
				+        # 按qa要求，增加drop相关数据
			
 
				+        spans, dropped_text_block, dropped_image_block, dropped_table_block = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict)
			
 
				 
			
 
				         # 对image和table截图
			
 
				         spans = cut_image_and_table(spans, page, page_id, book_name, save_path)
			
@@ -202,18 +221,19 @@ def parse_pdf_by_ocr(
 
				         # 将spans合并成line(在layout内,从上到下,从左到右)
			
 
				         lines = merge_spans_to_line_by_layout(spans, layout_bboxes)
			
 
				 
			
 
				-        # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
			
 
				-        blocks = []
			
 
				-        for line in lines:
			
 
				-            blocks.append(
			
 
				-                {
			
 
				-                    "bbox": line["bbox"],
			
 
				-                    "lines": [line],
			
 
				-                }
			
 
				-            )
			
 
				+        # 将lines合并成block
			
 
				+        blocks = merge_lines_to_block(lines)
			
 
				+
			
 
				+        # 根据block合并段落
			
 
				+
			
 
				+
			
 
				+        # 获取QA需要外置的list
			
 
				+        images, tables, interline_equations, inline_equations = get_qa_need_list(blocks)
			
 
				 
			
 
				         # 构造pdf_info_dict
			
 
				-        page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree)
			
 
				+        page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
			
 
				+                                             images, tables, interline_equations, inline_equations,
			
 
				+                                             dropped_text_block, dropped_image_block, dropped_table_block)
			
 
				         pdf_info_dict[f"page_{page_id}"] = page_info
			
 
				 
			
 
				     # 在测试时,保存调试信息
			
--- a/magic_pdf/pre_proc/ocr_dict_merge.py
+++ b/magic_pdf/pre_proc/ocr_dict_merge.py
@@ -80,6 +80,19 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
 
				     return lines
			
 
				 
			
 
				 
			
 
				def merge_lines_to_block(lines):
				    """Wrap every line in its own single-line block.

				    No real block merging is performed yet; this only establishes the
				    block structure. Each block holds exactly one line, and the block's
				    ``"bbox"`` is that line's ``"bbox"`` (the same object, not a copy).

				    Args:
				        lines: iterable of line dicts, each with a ``"bbox"`` entry.

				    Returns:
				        A new list of ``{"bbox": ..., "lines": [line]}`` dicts, in the
				        same order as ``lines``.
				    """
				    return [{"bbox": single_line["bbox"], "lines": [single_line]} for single_line in lines]
			
 
				+
			
 
				+
			
 
				 
			
 
				 
			
 
				 
			
--- a/magic_pdf/pre_proc/ocr_span_list_modify.py
+++ b/magic_pdf/pre_proc/ocr_span_list_modify.py
@@ -1,3 +1,5 @@
 
				+from loguru import logger
			
 
				+
			
 
				 from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
			
 
				     __is_overlaps_y_exceeds_threshold
			
 
				 
			
@@ -31,6 +33,32 @@ def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
 
				     return spans
			
 
				 
			
 
				 
			
 
				def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
				    """Drop spans lying inside "to-remove" regions and report what was dropped.

				    A span is dropped for a tag when
				    ``calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], bbox)``
				    exceeds 0.5 for at least one bbox listed under that tag (presumably
				    the share of the span's own area covered by the region; confirm
				    against the helper). Each dropped span gets ``span['tag']`` set to the
				    tag it was dropped for and is sorted into one of three lists by
				    ``span['type']``.

				    Args:
				        spans: list of span dicts with ``'bbox'`` and ``'type'`` entries.
				            Mutated in place: dropped spans are removed from it.
				        need_remove_spans_bboxes_dict: mapping of tag name (e.g.
				            ``"header"``) to a list of bboxes to clear.

				    Returns:
				        ``(spans, dropped_text_block, dropped_image_block,
				        dropped_table_block)`` where ``spans`` is the same list object that
				        was passed in.

				    Note:
				        A dropped span whose type is not text / inline_equation /
				        displayed_equation / image / table is still removed from ``spans``
				        and tagged, but is not reported in any of the returned lists.
				    """
				    dropped_text_block = []
				    dropped_image_block = []
				    dropped_table_block = []
				    # Which output list receives a dropped span of a given type.
				    dropped_by_type = {
				        'text': dropped_text_block,
				        'inline_equation': dropped_text_block,
				        'displayed_equation': dropped_text_block,
				        'image': dropped_image_block,
				        'table': dropped_table_block,
				    }
				    for tag, removed_bboxes in need_remove_spans_bboxes_dict.items():
				        # Single pass that partitions spans into kept and dropped, instead of
				        # calling list.remove() (a linear equality search) per dropped span.
				        kept_spans = []
				        for span in spans:
				            is_covered = any(
				                calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5
				                for removed_bbox in removed_bboxes
				            )
				            if not is_covered:
				                kept_spans.append(span)
				                continue
				            span['tag'] = tag
				            bucket = dropped_by_type.get(span['type'])
				            if bucket is not None:
				                bucket.append(span)
				        # Slice assignment keeps the caller's list object updated in place,
				        # matching the original in-place removal.
				        spans[:] = kept_spans

				    return spans, dropped_text_block, dropped_image_block, dropped_table_block
			
 
				+
			
 
				+
			
 
				 def adjust_bbox_for_standalone_block(spans):
			
 
				     # 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
			
 
				     for sb_span in spans:
			
@@ -46,7 +74,6 @@ def adjust_bbox_for_standalone_block(spans):
 
				     return spans
			
 
				 
			
 
				 
			
 
				-
			
 
				 def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
			
 
				     # displayed_list = []
			
 
				 
			
@@ -105,8 +132,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
 
				         current_line = line[0]
			
 
				         current_line.sort(key=lambda span: span['bbox'][0])
			
 
				 
			
 
				-
			
 
				-    #调整每一个文字行内bbox统一
			
 
				+    # 调整每一个文字行内bbox统一
			
 
				     for line in text_inline_lines:
			
 
				         current_line, (line_first_y0, line_first_y) = line
			
 
				         for span in current_line:
			
@@ -115,8 +141,9 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
 
				 
			
 
				     # return spans, displayed_list, text_inline_lines
			
 
				 
			
 
				+
			
 
				 def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
			
 
				-    #错误行间公式转行内公式
			
 
				+    # 错误行间公式转行内公式
			
 
				     j = 0
			
 
				     for i in range(len(displayed_list)):
			
 
				         # if i == 8:
			
@@ -127,26 +154,53 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
 
				         while j < len(text_inline_lines):
			
 
				             text_line = text_inline_lines[j]
			
 
				             y0, y1 = text_line[1]
			
 
				-            if (span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1) and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
			
 
				+            if (
			
 
				+                    span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1) and __is_overlaps_y_exceeds_threshold(
			
 
				+                    span['bbox'], (0, y0, 0, y1)):
			
 
				 
			
 
				-                #调整公式类型
			
 
				+                # 调整公式类型
			
 
				                 if span["type"] == "displayed_equation":
			
 
				-                    #最后一行是行间公式
			
 
				-                    if j+1 >= len(text_inline_lines):
			
 
				+                    # 最后一行是行间公式
			
 
				+                    if j + 1 >= len(text_inline_lines):
			
 
				                         span["type"] = "inline_equation"
			
 
				                         span["bbox"][1] = y0
			
 
				                         span["bbox"][3] = y1
			
 
				                     else:
			
 
				-                        #行间公式旁边有多行文字或者行间公式比文字高3倍则不转换
			
 
				+                        # 行间公式旁边有多行文字或者行间公式比文字高3倍则不转换
			
 
				                         y0_next, y1_next = text_inline_lines[j + 1][1]
			
 
				-                        if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3*(y1-y0) > span_y - span_y0:
			
 
				+                        if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3 * (
			
 
				+                                y1 - y0) > span_y - span_y0:
			
 
				                             span["type"] = "inline_equation"
			
 
				                             span["bbox"][1] = y0
			
 
				                             span["bbox"][3] = y1
			
 
				                 break
			
 
				-            elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
			
 
				+            elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'],
			
 
				+                                                                                                       (0, y0, 0, y1)):
			
 
				                 break
			
 
				             else:
			
 
				                 j += 1
			
 
				 
			
 
				-    return spans
			
 
				+    return spans
			
 
				+
			
 
				+
			
 
				def get_qa_need_list(blocks):
				    """Collect the spans that QA wants listed separately, grouped by type.

				    Walks every span of every line of every block and sorts it by
				    ``span["type"]``: ``"image"``, ``"table"``, ``"displayed_equation"``
				    (returned as interline equations) and ``"inline_equation"``. Spans of
				    any other type are ignored.

				    The returned lists are new, but they hold references to the original
				    span dicts rather than copies.

				    Args:
				        blocks: iterable of block dicts, each with a ``"lines"`` list whose
				            items have a ``"spans"`` list.

				    Returns:
				        ``(images, tables, interline_equations, inline_equations)``, each
				        in document traversal order.
				    """
				    # Flatten block -> line -> span once, preserving traversal order.
				    every_span = [
				        span
				        for block in blocks
				        for line in block["lines"]
				        for span in line["spans"]
				    ]

				    def spans_of_type(wanted_type):
				        return [span for span in every_span if span["type"] == wanted_type]

				    images = spans_of_type("image")
				    tables = spans_of_type("table")
				    interline_equations = spans_of_type("displayed_equation")
				    inline_equations = spans_of_type("inline_equation")
				    return images, tables, interline_equations, inline_equations