Jelajahi Sumber

fix: improve block filtering and cleanup in para_split.py

myhloli 4 bulan lalu
induk
komit
05c6f0b152
1 file diubah dengan 6 tambahan dan 6 penghapusan
  1. 6 6
      mineru/backend/pipeline/para_split.py

+ 6 - 6
mineru/backend/pipeline/para_split.py

@@ -365,12 +365,12 @@ def para_split(page_info_list):
     for page_info in page_info_list:
         page_info['para_blocks'] = []
         for block in all_blocks:
-            if block['page_num'] == page_info['page_idx']:
-                page_info['para_blocks'].append(block)
-
-            # 从block中删除不需要的page_num和page_size字段
-            del block['page_num']
-            del block['page_size']
+            if 'page_num' in block:
+                if block['page_num'] == page_info['page_idx']:
+                    page_info['para_blocks'].append(block)
+                    # 从block中删除不需要的page_num和page_size字段
+                    del block['page_num']
+                    del block['page_size']
 
 
 if __name__ == '__main__':