Эх сурвалжийг харах

refactor(pre_proc): clean up OCR processing code

- Remove commented-out code, the stale inline notes around it, and the fix_block_spans_v2 docstring in ocr_dict_merge.py
- Improve imports and code organization in ocr_detect_all_bboxes.py
- Delete unnecessary empty lines and improve code readability
myhloli 11 сар өмнө
parent
commit
a46b12e967

+ 5 - 5
magic_pdf/pre_proc/ocr_detect_all_bboxes.py

@@ -1,11 +1,11 @@
-
 from magic_pdf.config.ocr_content_type import BlockType
 from magic_pdf.libs.boxbase import (
-    calculate_iou, calculate_overlap_area_in_bbox1_area_ratio,
+    calculate_iou,
+    calculate_overlap_area_in_bbox1_area_ratio,
     calculate_vertical_projection_overlap_ratio,
-    get_minbox_if_overlap_by_ratio)
-from magic_pdf.pre_proc.remove_bbox_overlap import \
-    remove_overlap_between_bbox_for_block
+    get_minbox_if_overlap_by_ratio
+)
+from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
 
 
 def add_bboxes(blocks, block_type, bboxes):

+ 0 - 12
magic_pdf/pre_proc/ocr_dict_merge.py

@@ -1,4 +1,3 @@
-
 from magic_pdf.config.ocr_content_type import BlockType, ContentType
 from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, calculate_overlap_area_in_bbox1_area_ratio
 
@@ -82,14 +81,6 @@ def fill_spans_in_blocks(blocks, spans, radio):
             if calculate_overlap_area_in_bbox1_area_ratio(
                     span_bbox, block_bbox) > radio:
                 block_spans.append(span)
-        '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
-        # displayed_list = []
-        # text_inline_lines = []
-        # modify_y_axis(block_spans, displayed_list, text_inline_lines)
-        '''模型识别错误的行间公式, type类型转换成行内公式'''
-        # block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
-        '''bbox去除粘连'''  # 去粘连会影响span的bbox,导致后续fill的时候出错
-        # block_spans = remove_overlap_between_bbox_for_span(block_spans)
 
         block_dict['spans'] = block_spans
         block_with_spans.append(block_dict)
@@ -103,9 +94,6 @@ def fill_spans_in_blocks(blocks, spans, radio):
 
 
 def fix_block_spans_v2(block_with_spans):
-    """1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系
-    需要将caption和footnote的text_span放入相应img_block和table_block内的
-    caption_block和footnote_block中 2、同时需要删除block中的spans字段."""
     fix_blocks = []
     for block in block_with_spans:
         block_type = block['type']