1 年之前 · 34f8965007
--- a/magic_pdf/libs/draw_bbox.py
+++ b/magic_pdf/libs/draw_bbox.py
@@ -334,7 +334,7 @@ def do_predict(boxes: List[List[int]]) -> List[int]:
 
				     return parse_logits(logits, len(boxes))
			
 
				 
			
 
				 
			
 
				-def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
			
 
				+def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
			
 
				     layout_bbox_list = []
			
 
				 
			
 
				     from loguru import logger
			
@@ -344,35 +344,30 @@ def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
 
				             if block['type'] == 'text' or block['type'] == 'title' or block['type'] == 'interline_equation':
			
 
				                 for line in block['lines']:
			
 
				                     bbox = line['bbox']
			
 
				-                    page_line_list.append(bbox)
			
 
				+                    index = line['index']
			
 
				+                    page_line_list.append({'index': index, 'bbox': bbox})
			
 
				             if block['type'] == 'table' or block['type'] == 'image':
			
 
				                 bbox = block['bbox']
			
 
				-                page_line_list.append(bbox)
			
 
				-
			
 
				-        # 使用layoutreader排序
			
 
				-        page_size = page['page_size']
			
 
				-        x_scale = 1000.0 / page_size[0]
			
 
				-        y_scale = 1000.0 / page_size[1]
			
 
				-        boxes = []
			
 
				-        logger.info(f"Scale: {x_scale}, {y_scale}, Boxes len: {len(page_line_list)}")
			
 
				-        for left, top, right, bottom in page_line_list:
			
 
				-            left = round(left * x_scale)
			
 
				-            top = round(top * y_scale)
			
 
				-            right = round(right * x_scale)
			
 
				-            bottom = round(bottom * y_scale)
			
 
				-            assert (
			
 
				-                    1000 >= right >= left >= 0 and 1000 >= bottom >= top >= 0
			
 
				-            ), f"Invalid box. right: {right}, left: {left}, bottom: {bottom}, top: {top}"
			
 
				-            boxes.append([left, top, right, bottom])
			
 
				-        logger.info("layoutreader start")
			
 
				-        start = time.time()
			
 
				-        orders = do_predict(boxes)
			
 
				-        if torch.cuda.is_available():
			
 
				-            torch.cuda.empty_cache()
			
 
				-        print(orders)
			
 
				-        logger.info(f"layoutreader end, cos time{time.time() - start}")
			
 
				-        sorted_bboxes = [page_line_list[i] for i in orders]
			
 
				-        layout_bbox_list.append(sorted_bboxes)
			
 
				+                index = block['index']
			
 
				+                page_line_list.append({'index': index, 'bbox': bbox})
			
 
				+        sorted_bboxes = sorted(page_line_list, key=lambda x: x['index'])
			
 
				+        layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes)
			
 
				+    pdf_docs = fitz.open('pdf', pdf_bytes)
			
 
				+    for i, page in enumerate(pdf_docs):
			
 
				+        draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
			
 
				+
			
 
				+    pdf_docs.save(f'{out_path}/{filename}_line_sort.pdf')
			
 
				+
			
 
				+
			
 
				+def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
			
 
				+    layout_bbox_list = []
			
 
				+
			
 
				+    for page in pdf_info:
			
 
				+        page_block_list = []
			
 
				+        for block in page['para_blocks']:
			
 
				+            bbox = block['bbox']
			
 
				+            page_block_list.append(bbox)
			
 
				+        layout_bbox_list.append(page_block_list)
			
 
				     pdf_docs = fitz.open('pdf', pdf_bytes)
			
 
				     for i, page in enumerate(pdf_docs):
			
 
				         draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
			
--- a/magic_pdf/pdf_parse_union_core_v2.py
+++ b/magic_pdf/pdf_parse_union_core_v2.py
@@ -99,7 +99,7 @@ def do_predict(boxes: List[List[int]]) -> List[int]:
 
				     from transformers import LayoutLMv3ForTokenClassification
			
 
				     from magic_pdf.v3.helpers import prepare_inputs, boxes2inputs, parse_logits
			
 
				     model = LayoutLMv3ForTokenClassification.from_pretrained("hantian/layoutreader")
			
 
				-    model.to("cuda")
			
 
				+    # model.to("cuda")
			
 
				     inputs = boxes2inputs(boxes)
			
 
				     inputs = prepare_inputs(inputs, model)
			
 
				     logits = model(**inputs).logits.cpu().squeeze(0)
			
@@ -145,17 +145,17 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
 
				     # interline_equation_blocks参数不够准，后面切换到interline_equations上
			
 
				     interline_equation_blocks = []
			
 
				     if len(interline_equation_blocks) > 0:
			
 
				-        all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split_v2(
			
 
				+        all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
			
 
				             img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
			
 
				             interline_equation_blocks, page_w, page_h)
			
 
				     else:
			
 
				-        all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split_v2(
			
 
				+        all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
			
 
				             img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
			
 
				             interline_equations, page_w, page_h)
			
 
				 
			
 
				-    if len(drop_reasons) > 0:
			
 
				-        need_drop = True
			
 
				-        drop_reason.append(DropReason.OVERLAP_BLOCKS_CAN_NOT_SEPARATION)
			
 
				+    # if len(drop_reasons) > 0:
			
 
				+    #     need_drop = True
			
 
				+    #     drop_reason.append(DropReason.OVERLAP_BLOCKS_CAN_NOT_SEPARATION)
			
 
				 
			
 
				     '''先处理不需要排版的discarded_blocks'''
			
 
				     discarded_block_with_spans, spans = fill_spans_in_blocks(all_discarded_blocks, spans, 0.4)
			
@@ -208,20 +208,31 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
 
				     sorted_bboxes = [page_line_list[i] for i in orders]
			
 
				 
			
 
				     '''根据line的中位数算block的序列关系'''
			
 
				-    for line_index, bbox in enumerate(sorted_bboxes):
			
 
				-        for block in fix_blocks:
			
 
				-            if block['type'] == 'text' or block['type'] == 'title' or block['type'] == 'interline_equation':
			
 
				-                line_index_list = []
			
 
				+    block_without_lines = []
			
 
				+    for block in fix_blocks:
			
 
				+        if block['type'] == 'text' or block['type'] == 'title' or block['type'] == 'interline_equation':
			
 
				+            line_index_list = []
			
 
				+            if len(block['lines']) == 0:
			
 
				+                block_without_lines.append(block)
			
 
				+                continue
			
 
				+            else:
			
 
				                 for line in block['lines']:
			
 
				-                    if line['bbox'] == bbox:
			
 
				-                        line['index'] = line_index
			
 
				-                        line_index_list.append(line_index)
			
 
				+                    # for line_bbox in sorted_bboxes:
			
 
				+                    #     if line['bbox'] == line_bbox:
			
 
				+                    line['index'] = sorted_bboxes.index(line['bbox'])
			
 
				+                    line_index_list.append(line['index'])
			
 
				                 median_value = statistics.median(line_index_list)
			
 
				                 block['index'] = median_value
			
 
				 
			
 
				-            elif block['type'] == 'table' or block['type'] == 'image':
			
 
				-                if block['bbox'] == bbox:
			
 
				-                    block['index'] = line_index
			
 
				+        elif block['type'] == 'table' or block['type'] == 'image':
			
 
				+            # for line_bbox in sorted_bboxes:
			
 
				+            #     if block['bbox'] == line_bbox:
			
 
				+            block['index'] = sorted_bboxes.index(block['bbox'])
			
 
				+
			
 
				+    '''移除没有line的block'''
			
 
				+    for block in block_without_lines:
			
 
				+        fix_blocks.remove(block)
			
 
				+
			
 
				     '''重排block'''
			
 
				     sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])
			
 
				 
			
@@ -292,7 +303,9 @@ def pdf_parse_union(pdf_bytes,
 
				         pdf_info_dict[f"page_{page_id}"] = page_info
			
 
				 
			
 
				     """分段"""
			
 
				-    para_split(pdf_info_dict, debug_mode=debug_mode)
			
 
				+    # para_split(pdf_info_dict, debug_mode=debug_mode)
			
 
				+    for page_num, page in pdf_info_dict.items():
			
 
				+        page['para_blocks'] = page['preproc_blocks']
			
 
				 
			
 
				     """dict转list"""
			
 
				     pdf_info_list = dict_to_list(pdf_info_dict)
			
--- a/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+++ b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
@@ -108,9 +108,9 @@ def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_b
 
				     all_bboxes = remove_overlaps_min_blocks(all_bboxes)
			
 
				     all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
			
 
				     '''将剩余的bbox做分离处理，防止后面分layout时出错'''
			
 
				-    all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
			
 
				+    # all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
			
 
				 
			
 
				-    return all_bboxes, all_discarded_blocks, drop_reasons
			
 
				+    return all_bboxes, all_discarded_blocks
			
 
				 
			
 
				 
			
 
				 def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
			
--- a/magic_pdf/tools/common.py
+++ b/magic_pdf/tools/common.py
@@ -7,7 +7,7 @@ from loguru import logger
 
				 
			
 
				 import magic_pdf.model as model_config
			
 
				 from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_span_bbox,
			
 
				-                                      draw_model_bbox, draw_layout_sort_bbox)
			
 
				+                                      draw_model_bbox, draw_layout_sort_bbox, draw_line_sort_bbox)
			
 
				 from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
			
 
				 from magic_pdf.pipe.OCRPipe import OCRPipe
			
 
				 from magic_pdf.pipe.TXTPipe import TXTPipe
			
@@ -94,6 +94,8 @@ def do_parse(
 
				 
			
 
				     draw_layout_sort_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
			
 
				 
			
 
				+    draw_line_sort_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
			
 
				+
			
 
				     md_content = pipe.pipe_mk_markdown(image_dir,
			
 
				                                        drop_mode=DropMode.NONE,
			
 
				                                        md_make_mode=f_make_md_mode)