浏览代码

feat(pdf_parse): add internal block sorting for images and tables

- Implement block sorting within image and table blocks
- Ensure correct order of captions and footnotes within blocks
- Improve overall document structure and parsing accuracy
myhloli 10 个月之前
父节点
当前提交
3f93b895bc
共有 1 个文件被更改,包括 5 次插入和 0 次删除
  1. 5 0
      magic_pdf/pdf_parse_union_core_v2.py

+ 5 - 0
magic_pdf/pdf_parse_union_core_v2.py

@@ -768,6 +768,11 @@ def parse_page_core(
     """重排block"""
     sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])
 
+    """block内重排(img和table的block内多个caption或footnote的排序)"""
+    for block in sorted_blocks:
+        if block['type'] in [BlockType.Image, BlockType.Table]:
+            block['blocks'] = sorted(block['blocks'], key=lambda b: b['index'])
+
     """获取QA需要外置的list"""
     images, tables, interline_equations = get_qa_need_list_v2(sorted_blocks)