Kaynağa Gözat

feat: process title and footnote

xu rui 1 yıl önce
ebeveyn
işleme
432e1ae5e3

+ 2 - 1
magic_pdf/pdf_parse_for_train.py

@@ -253,7 +253,8 @@ def parse_pdf_for_train(
         # isSimpleLayout_flag, fullColumn_cnt, subColumn_cnt, curPage_loss = evaluate_pdf_layout(page_id, page, model_output_json)
         接下来开始进行预处理过程
         """
-
+        title_bboxs = parse_titles(page_id, page, model_output_json)
+        
         """去掉每页的页码、页眉、页脚"""
         page_no_bboxs = parse_pageNos(page_id, page, model_output_json)
         header_bboxs = parse_headers(page_id, page, model_output_json)

+ 14 - 2
magic_pdf/train_utils/convert_to_train_format.py

@@ -35,8 +35,16 @@ def convert_to_train_format(jso: dict) -> []:
 
         # 脚注, 目前没有看到例子
         for para in v["para_blocks"]:
-            n_bbox = {"category_id": 2, "bbox": para["bbox"]}
-            bboxes.append(n_bbox)
+            if "paras" in para:
+                paras = para["paras"]
+                for para_key, para_content in paras.items():
+                    para_bbox = para_content["para_bbox"]
+                    is_para_title = para_content["is_para_title"]
+                    if is_para_title:
+                        n_bbox = {"category_id": 0, "bbox": para_bbox}
+                    else:
+                        n_bbox = {"category_id": 2, "bbox": para_bbox}
+                    bboxes.append(n_bbox)
 
         for inline_equation in v["inline_equations"]:
             n_bbox = {"category_id": 13, "bbox": inline_equation["bbox"]}
@@ -46,6 +54,10 @@ def convert_to_train_format(jso: dict) -> []:
             n_bbox = {"category_id": 10, "bbox": inter_equation["bbox"]}
             bboxes.append(n_bbox)
 
+        for footnote in v['footnote_bboxes_tmp']:
+            n_bbox = {"category_id": 5, "bbox": footnote["bbox"]}
+            bboxes.append(n_bbox)
+
         info["bboxes"] = bboxes
         info["layout_tree"] = v["layout_bboxes"]
         pages.append(info)