许瑞 1 жил өмнө
parent
commit
4f1f7d62d5

+ 1 - 1
magic_pdf/pipeline.py

@@ -620,9 +620,9 @@ def parse_pdf_for_model_train(jso: dict, start_page_id=0, debug_mode=False) -> d
                 jso["need_drop"] = True
                 jso["drop_reason"] = pdf_info_dict["drop_reason"]
             else:  # 正常返回,将 pdf_info_dict 压缩并存储
+                jso["parsed_results"] = convert_to_train_format(pdf_info_dict)
                 pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
                 jso["pdf_intermediate_dict"] = pdf_info_dict
-                jso["parsed_results"] = convert_to_train_format(pdf_info_dict)
             end_time = time.time()  # 记录完成时间
             parse_time = int(end_time - start_time)  # 计算执行时间
             # 解析完成后打印一下book_name和耗时

+ 3 - 2
magic_pdf/train_utils/convert_to_train_format.py

@@ -1,8 +1,8 @@
-
-
 def convert_to_train_format(jso: dict) -> []:
     pages = []
     for k, v in jso.items():
+        if not k.startswith("page_"):
+            continue
         page_idx = v["page_idx"]
         width, height = v["page_size"]
 
@@ -47,6 +47,7 @@ def convert_to_train_format(jso: dict) -> []:
             bboxes.append(n_bbox)
 
         info["bboxes"] = bboxes
+        info["layout_tree"] = v["layout_bboxes"]
         pages.append(info)
 
     return pages