Преглед на файлове

refactor: simplify content list handling by consolidating layout and discarded paragraphs

myhloli преди 2 седмици
родител
ревизия
23d75bac09
променени са 2 файла, в които са добавени 4 реда и са изтрити 6 реда
  1. + 2 - 3
      mineru/backend/pipeline/pipeline_middle_json_mkcontent.py
  2. + 2 - 3
      mineru/backend/vlm/vlm_middle_json_mkcontent.py

+ 2 - 3
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py

@@ -286,9 +286,8 @@ def union_make(pdf_info_dict: list,
             page_markdown = make_blocks_to_markdown(paras_of_layout, make_mode, img_buket_path)
             output_content.extend(page_markdown)
         elif make_mode == MakeMode.CONTENT_LIST:
-            if not paras_of_layout + paras_of_discarded:
-                continue
-            for para_block in paras_of_layout + paras_of_discarded:
+            para_blocks = (paras_of_layout or []) + (paras_of_discarded or [])
+            for para_block in para_blocks:
                 para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
                 if para_content:
                     output_content.append(para_content)

+ 2 - 3
mineru/backend/vlm/vlm_middle_json_mkcontent.py

@@ -254,9 +254,8 @@ def union_make(pdf_info_dict: list,
             page_markdown = mk_blocks_to_markdown(paras_of_layout, make_mode, formula_enable, table_enable, img_buket_path)
             output_content.extend(page_markdown)
         elif make_mode == MakeMode.CONTENT_LIST:
-            if not paras_of_layout + paras_of_discarded:
-                continue
-            for para_block in paras_of_layout + paras_of_discarded:
+            para_blocks = (paras_of_layout or []) + (paras_of_discarded or [])
+            for para_block in para_blocks:
                 para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
                 output_content.append(para_content)