Преглед изворни кода

refactor(drawing): simplify draw bbox functions and adjust debug config

Refactor the bounding-box drawing functions by removing unused imports and
simplifying the logic that draws layout and line-sort bounding boxes. Adjust the
debug configuration to enable content-list dumping and disable markdown-making mode.
myhloli пре 1 година
родитељ
комит
b2790f6f45
3 измењених фајлова са 9 додато и 26 уклоњено
  1. 2 7
      magic_pdf/libs/draw_bbox.py
  2. 4 16
      magic_pdf/pdf_parse_union_core_v2.py
  3. 3 3
      magic_pdf/tools/common.py

+ 2 - 7
magic_pdf/libs/draw_bbox.py

@@ -1,7 +1,3 @@
-import time
-
-import torch
-
 from magic_pdf.libs.commons import fitz  # PyMuPDF
 from magic_pdf.libs.Constants import CROSS_PAGE
 from magic_pdf.libs.ocr_content_type import BlockType, CategoryId, ContentType
@@ -335,16 +331,15 @@ def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
 def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
     layout_bbox_list = []
 
-    from loguru import logger
     for page in pdf_info:
         page_line_list = []
         for block in page['preproc_blocks']:
-            if block['type'] == 'text' or block['type'] == 'title' or block['type'] == 'interline_equation':
+            if block['type'] in ['text', 'title', 'interline_equation']:
                 for line in block['lines']:
                     bbox = line['bbox']
                     index = line['index']
                     page_line_list.append({'index': index, 'bbox': bbox})
-            if block['type'] == 'table' or block['type'] == 'image':
+            if block['type'] in ['table', 'image']:
                 bbox = block['bbox']
                 index = block['index']
                 page_line_list.append({'index': index, 'bbox': bbox})

+ 4 - 16
magic_pdf/pdf_parse_union_core_v2.py

@@ -14,7 +14,6 @@ from magic_pdf.libs.hash_utils import compute_md5
 from magic_pdf.libs.local_math import float_equal
 from magic_pdf.libs.ocr_content_type import ContentType
 from magic_pdf.model.magic_model import MagicModel
-from magic_pdf.para.para_split_v2 import para_split
 from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
 from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
 from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
@@ -153,10 +152,6 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
             img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
             interline_equations, page_w, page_h)
 
-    # if len(drop_reasons) > 0:
-    #     need_drop = True
-    #     drop_reason.append(DropReason.OVERLAP_BLOCKS_CAN_NOT_SEPARATION)
-
     '''先处理不需要排版的discarded_blocks'''
     discarded_block_with_spans, spans = fill_spans_in_blocks(all_discarded_blocks, spans, 0.4)
     fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)
@@ -177,11 +172,11 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
     '''获取所有line并对line排序'''
     page_line_list = []
     for block in fix_blocks:
-        if block['type'] == 'text' or block['type'] == 'title' or block['type'] == 'interline_equation':
+        if block['type'] in ['text', 'title', 'interline_equation']:
             for line in block['lines']:
                 bbox = line['bbox']
                 page_line_list.append(bbox)
-        elif block['type'] == 'table' or block['type'] == 'image':  # 简单的把表和图都当成一个line处理
+        elif block['type'] in ['table', 'image']:  # 简单的把表和图都当成一个line处理
             bbox = block['bbox']
             page_line_list.append(bbox)
 
@@ -201,32 +196,25 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
         boxes.append([left, top, right, bottom])
     layoutreader_start = time.time()
     orders = do_predict(boxes)
-    # if torch.cuda.is_available():
-    #     torch.cuda.empty_cache()
-    # print(orders)
     logger.info(f"layoutreader cost time{time.time() - layoutreader_start}")
     sorted_bboxes = [page_line_list[i] for i in orders]
 
     '''根据line的中位数算block的序列关系'''
     block_without_lines = []
     for block in fix_blocks:
-        if block['type'] == 'text' or block['type'] == 'title' or block['type'] == 'interline_equation':
+        if block['type'] in ['text', 'title', 'interline_equation']:
             line_index_list = []
             if len(block['lines']) == 0:
                 block_without_lines.append(block)
                 continue
             else:
                 for line in block['lines']:
-                    # for line_bbox in sorted_bboxes:
-                    #     if line['bbox'] == line_bbox:
                     line['index'] = sorted_bboxes.index(line['bbox'])
                     line_index_list.append(line['index'])
                 median_value = statistics.median(line_index_list)
                 block['index'] = median_value
 
-        elif block['type'] == 'table' or block['type'] == 'image':
-            # for line_bbox in sorted_bboxes:
-            #     if block['bbox'] == line_bbox:
+        elif block['type'] in ['table', 'image']:
             block['index'] = sorted_bboxes.index(block['bbox'])
 
     '''移除没有line的block'''

+ 3 - 3
magic_pdf/tools/common.py

@@ -7,7 +7,7 @@ from loguru import logger
 
 import magic_pdf.model as model_config
 from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_span_bbox,
-                                      draw_model_bbox, draw_layout_sort_bbox, draw_line_sort_bbox)
+                                      draw_model_bbox, draw_line_sort_bbox)
 from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
 from magic_pdf.pipe.OCRPipe import OCRPipe
 from magic_pdf.pipe.TXTPipe import TXTPipe
@@ -39,7 +39,7 @@ def do_parse(
     f_dump_middle_json=True,
     f_dump_model_json=True,
     f_dump_orig_pdf=True,
-    f_dump_content_list=False,
+    f_dump_content_list=True,
     f_make_md_mode=MakeMode.MM_MD,
     f_draw_model_bbox=False,
     f_draw_line_sort_bbox=False,
@@ -49,7 +49,7 @@ def do_parse(
 ):
     if debug_able:
         logger.warning('debug mode is on')
-        f_dump_content_list = True
+        # f_dump_content_list = True
         f_draw_model_bbox = True
         f_draw_line_sort_bbox = True