3 ヶ月前 · c3db578247
--- a/mineru/cli/common.py
+++ b/mineru/cli/common.py
@@ -9,7 +9,7 @@ import pypdfium2 as pdfium
 
				 from loguru import logger
			
 
				 
			
 
				 from mineru.data.data_reader_writer import FileBasedDataWriter
			
 
				-from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
			
 
				+from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox, draw_line_sort_bbox
			
 
				 from mineru.utils.enum_class import MakeMode
			
 
				 from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
			
 
				 from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
			
@@ -102,6 +102,7 @@ def _process_output(
 
				         model_output=None,
			
 
				         is_pipeline=True
			
 
				 ):
			
 
				+    f_draw_line_sort_bbox = False
			
 
				     from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
			
 
				     """处理输出文件"""
			
 
				     if f_draw_layout_bbox:
			
@@ -116,6 +117,9 @@ def _process_output(
 
				             pdf_bytes,
			
 
				         )
			
 
				 
			
 
				+    if f_draw_line_sort_bbox:
			
 
				+        draw_line_sort_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_line_sort.pdf")
			
 
				+
			
 
				     image_dir = str(os.path.basename(local_image_dir))
			
 
				 
			
 
				     if f_dump_md:
			
--- a/mineru/utils/draw_bbox.py
+++ b/mineru/utils/draw_bbox.py
@@ -381,6 +381,87 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
 
				         output_pdf.write(f)
			
 
				 
			
 
				 
			
 
				+def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
			
 
				+    layout_bbox_list = []
			
 
				+
			
 
				+    for page in pdf_info:
			
 
				+        page_line_list = []
			
 
				+        for block in page['preproc_blocks']:
			
 
				+            if block['type'] in [BlockType.TEXT]:
			
 
				+                for line in block['lines']:
			
 
				+                    bbox = line['bbox']
			
 
				+                    index = line['index']
			
 
				+                    page_line_list.append({'index': index, 'bbox': bbox})
			
 
				+            elif block['type'] in [BlockType.TITLE, BlockType.INTERLINE_EQUATION]:
			
 
				+                if 'virtual_lines' in block:
			
 
				+                    if len(block['virtual_lines']) > 0 and block['virtual_lines'][0].get('index', None) is not None:
			
 
				+                        for line in block['virtual_lines']:
			
 
				+                            bbox = line['bbox']
			
 
				+                            index = line['index']
			
 
				+                            page_line_list.append({'index': index, 'bbox': bbox})
			
 
				+                else:
			
 
				+                    for line in block['lines']:
			
 
				+                        bbox = line['bbox']
			
 
				+                        index = line['index']
			
 
				+                        page_line_list.append({'index': index, 'bbox': bbox})
			
 
				+            elif block['type'] in [BlockType.IMAGE, BlockType.TABLE]:
			
 
				+                for sub_block in block['blocks']:
			
 
				+                    if sub_block['type'] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY]:
			
 
				+                        if len(sub_block['virtual_lines']) > 0 and sub_block['virtual_lines'][0].get('index', None) is not None:
			
 
				+                            for line in sub_block['virtual_lines']:
			
 
				+                                bbox = line['bbox']
			
 
				+                                index = line['index']
			
 
				+                                page_line_list.append({'index': index, 'bbox': bbox})
			
 
				+                        else:
			
 
				+                            for line in sub_block['lines']:
			
 
				+                                bbox = line['bbox']
			
 
				+                                index = line['index']
			
 
				+                                page_line_list.append({'index': index, 'bbox': bbox})
			
 
				+                    elif sub_block['type'] in [BlockType.IMAGE_CAPTION, BlockType.TABLE_CAPTION, BlockType.IMAGE_FOOTNOTE, BlockType.TABLE_FOOTNOTE]:
			
 
				+                        for line in sub_block['lines']:
			
 
				+                            bbox = line['bbox']
			
 
				+                            index = line['index']
			
 
				+                            page_line_list.append({'index': index, 'bbox': bbox})
			
 
				+        sorted_bboxes = sorted(page_line_list, key=lambda x: x['index'])
			
 
				+        layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes)
			
 
				+    pdf_bytes_io = BytesIO(pdf_bytes)
			
 
				+    pdf_docs = PdfReader(pdf_bytes_io)
			
 
				+    output_pdf = PdfWriter()
			
 
				+
			
 
				+    for i, page in enumerate(pdf_docs.pages):
			
 
				+        # 获取原始页面尺寸
			
 
				+        page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])
			
 
				+        custom_page_size = (page_width, page_height)
			
 
				+
			
 
				+        packet = BytesIO()
			
 
				+        # 使用原始PDF的尺寸创建canvas
			
 
				+        c = canvas.Canvas(packet, pagesize=custom_page_size)
			
 
				+
			
 
				+        # 获取当前页面的数据
			
 
				+        draw_bbox_with_number(i, layout_bbox_list, page, c, [255, 0, 0], False)
			
 
				+
			
 
				+        c.save()
			
 
				+        packet.seek(0)
			
 
				+        overlay_pdf = PdfReader(packet)
			
 
				+
			
 
				+        # 添加检查确保overlay_pdf.pages不为空
			
 
				+        if len(overlay_pdf.pages) > 0:
			
 
				+            new_page = PageObject(pdf=None)
			
 
				+            new_page.update(page)
			
 
				+            page = new_page
			
 
				+            page.merge_page(overlay_pdf.pages[0])
			
 
				+        else:
			
 
				+            # 记录日志并继续处理下一个页面
			
 
				+            # logger.warning(f"span.pdf: 第{i + 1}页未能生成有效的overlay PDF")
			
 
				+            pass
			
 
				+
			
 
				+        output_pdf.add_page(page)
			
 
				+
			
 
				+    # Save the PDF
			
 
				+    with open(f"{out_path}/{filename}", "wb") as f:
			
 
				+        output_pdf.write(f)
			
 
				+
			
 
				+
			
 
				 if __name__ == "__main__":
			
 
				     # 读取PDF文件
			
 
				     pdf_path = "examples/demo1.pdf"