Explorar o código

Merge remote-tracking branch 'origin/master'

赵小蒙 hai 1 ano
pai
achega
59bc15e004

+ 36 - 0
.gitignore

@@ -0,0 +1,36 @@
+*.tar
+*.tar.gz
+venv*/
+envs/
+slurm_logs/
+
+sync1.sh
+data_preprocess_pj1
+data-preparation1
+__pycache__
+*.log
+*.pyc
+.vscode
+debug/
+*.ipynb
+.idea
+spark/__init__.py
+
+# vscode history
+.history
+
+.DS_Store
+.env
+
+bad_words/
+bak/
+
+app/tests/*
+temp/
+tmp/
+tmp
+.vscode
+.vscode/
+/test/
+
+/app/pdf_toolbox/test/test_bookname.txt

+ 1 - 2
pipeline/pdf_parse_by_model.py

@@ -271,9 +271,8 @@ def parse_pdf_by_model(
         """"以下进入到公式替换环节 """
         char_level_text_blocks = page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)['blocks']
         remain_text_blocks = combine_chars_to_pymudict(remain_text_blocks, char_level_text_blocks)# 合并chars
-        remain_text_blocks = remove_citation_marker(remain_text_blocks) # 先把角标去掉
-        
         remain_text_blocks = replace_equations_in_textblock(remain_text_blocks, inline_eq_info, interline_eq_info)
+        remain_text_blocks = remove_citation_marker(remain_text_blocks) # 公式替换之后去角标,防止公式无法替换成功。但是这样也会带来个问题就是把角标当公式。各有优劣。
         remain_text_blocks = remove_chars_in_text_blocks(remain_text_blocks) # 减少中间态数据体积
         #debug_show_bbox(pdf_docs, page_id, [b['bbox'] for b in inline_eq_info], [b['bbox'] for b in interline_eq_info], [], join_path(save_path, book_name, f"{book_name}_debug.pdf"), 3)
 

+ 8 - 1
pre_proc/citationmarker_remove.py

@@ -114,12 +114,16 @@ def remove_citation_marker(with_char_text_blcoks):
 
             # 找到高度最高的span作为位置比较的基准
             max_hi_span = line['spans'][0]['bbox']
-            min_font_sz = 10000
+            min_font_sz = 10000 # line里最小的字体
+            max_font_sz = 0   # line里最大的字体
+                
             for s in line['spans']:
                 if max_hi_span[3]-max_hi_span[1]<s['bbox'][3]-s['bbox'][1]:
                     max_hi_span = s['bbox']
                 if min_font_sz>s['size']:
                     min_font_sz = s['size']
+                if max_font_sz<s['size']:
+                    max_font_sz = s['size']
                         
             base_span_mid_y = (max_hi_span[3]+max_hi_span[1])/2
             
@@ -130,6 +134,9 @@ def remove_citation_marker(with_char_text_blcoks):
                 span_mid_y = (span['bbox'][3]+span['bbox'][1])/2
                 span_font_sz = span['size']
                 
+                if max_font_sz-span_font_sz<1: # 先以字体过滤正文,如果是正文就不再继续判断了
+                    continue
+                
                 if (base_span_mid_y-span_mid_y)/span_hi>0.2 or (base_span_mid_y-span_mid_y>0 and abs(span_font_sz-min_font_sz)/min_font_sz<0.1):
                     """
                     1. 它的前一个char如果是句号或者逗号的话,那么肯定是角标而不是公式

+ 29 - 2
pre_proc/detect_equation.py

@@ -1,12 +1,38 @@
 import os                   
 import collections      # 统计库
-import re               # 正则
+import re
+from libs.boxbase import _is_in               # 正则
 from libs.commons import fitz             # pyMuPDF库
 import json             # json
 from pathlib import Path
 
 
 
+def __solve_contain_bboxs(all_bbox_list: list):
+
+    """将两个公式的bbox做判断是否有包含关系,若有的话则删掉较小的bbox"""
+
+    dump_list = []
+    for i in range(len(all_bbox_list)):
+        for j in range(i + 1, len(all_bbox_list)):
+            # 获取当前两个值
+            bbox1 = all_bbox_list[i][:4]
+            bbox2 = all_bbox_list[j][:4]
+            
+            # 删掉较小的框
+            if _is_in(bbox1, bbox2):
+                dump_list.append(all_bbox_list[i])
+            elif _is_in(bbox2, bbox1):
+                dump_list.append(all_bbox_list[j])
+    
+    # 遍历需要删除的列表中的每个元素
+    for item in dump_list:
+        
+        while item in all_bbox_list:
+            all_bbox_list.remove(item)
+    return all_bbox_list
+
+
 def parse_equations(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
     """
     :param page_ID: int类型,当前page在当前pdf文档中是第page_D页。
@@ -101,4 +127,5 @@ def parse_equations(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict
         for eq_box in equationIsolated_from_DocXChain_bboxs:
             eq_box = [eq_box[0]+cropbox[0], eq_box[1]+cropbox[1], eq_box[2]+cropbox[0], eq_box[3]+cropbox[1], eq_box[4]]
         
-    return equationEmbedding_from_DocXChain_bboxs, equationIsolated_from_DocXChain_bboxs
+    deduped_embedding_eq_bboxes = __solve_contain_bboxs(equationEmbedding_from_DocXChain_bboxs)
+    return deduped_embedding_eq_bboxes, equationIsolated_from_DocXChain_bboxs