
Merge branch 'master' of github.com:papayalove/Magic-PDF

liukaiwen 1 year ago
parent
commit
dbdbaf58be
3 changed files with 91 additions and 71 deletions
  1. 68 additions, 65 deletions
      magic_pdf/pre_proc/citationmarker_remove.py
  2. 23 additions, 3 deletions
      magic_pdf/user_api.py
  3. 0 additions, 3 deletions
      requirements.txt

+ 68 - 65
magic_pdf/pre_proc/citationmarker_remove.py

@@ -3,10 +3,10 @@
 https://aicarrier.feishu.cn/wiki/YLOPwo1PGiwFRdkwmyhcZmr0n3d
 """
 import re
-from magic_pdf.libs.nlp_utils import NLPModels
+# from magic_pdf.libs.nlp_utils import NLPModels
 
 
-__NLP_MODEL = NLPModels()
+# __NLP_MODEL = NLPModels()
 
 def check_1(spans, cur_span_i):
     """寻找前一个char,如果是句号,逗号,那么就是角标"""
@@ -20,68 +20,68 @@ def check_1(spans, cur_span_i):
     return False
 
 
-def check_2(spans, cur_span_i):
-    """检查前面一个span的最后一个单词,如果长度大于5,全都是字母,并且不含大写,就是角标"""
-    pattern = r'\b[A-Z]\.\s[A-Z][a-z]*\b' # 形如A. Bcde, L. Bcde, 人名的缩写
-    
-    if cur_span_i==0 and len(spans)>1:
-        next_span = spans[cur_span_i+1]
-        next_txt = "".join([c['c'] for c in next_span['chars']])
-        result = __NLP_MODEL.detect_entity_catgr_using_nlp(next_txt)
-        if result in ["PERSON", "GPE", "ORG"]:
-            return True
-
-        if re.findall(pattern, next_txt):
-            return True
-        
-        return False # not a citation marker
-    elif cur_span_i==0 and len(spans)==1: # marker takes up the whole line? delete with caution
-        return False
-    
-    # if this span is the last span
-    if cur_span_i==len(spans)-1:
-        pre_span = spans[cur_span_i-1]
-        pre_txt = "".join([c['c'] for c in pre_span['chars']])
-        pre_word = pre_txt.split(' ')[-1]
-        result = __NLP_MODEL.detect_entity_catgr_using_nlp(pre_txt)
-        if result in ["PERSON", "GPE", "ORG"]:
-            return True
-        
-        if re.findall(pattern, pre_txt):
-            return True
-        
-        return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower()
-    else: # neither the first nor the last span: check whether the marker is closer to the preceding or the following word; it belongs to the closer one
-        pre_span = spans[cur_span_i-1]
-        next_span = spans[cur_span_i+1]
-        cur_span = spans[cur_span_i]
-        # find the nearest word in the previous and the next span
-        pre_distance = 10000 # a very large number
-        next_distance = 10000 # a very large number
-        for c in pre_span['chars'][::-1]:
-            if c['c'].isalpha():
-                pre_distance = cur_span['bbox'][0] - c['bbox'][2]
-                break
-        for c in next_span['chars']:
-            if c['c'].isalpha():
-                next_distance = c['bbox'][0] - cur_span['bbox'][2]
-                break
-        
-        if pre_distance<next_distance:
-            belong_to_span = pre_span
-        else:
-            belong_to_span = next_span
-            
-        txt = "".join([c['c'] for c in belong_to_span['chars']])
-        pre_word = txt.split(' ')[-1]
-        result = __NLP_MODEL.detect_entity_catgr_using_nlp(txt)
-        if result in ["PERSON", "GPE", "ORG"]:
-            return True
-        
-        if re.findall(pattern, txt):
-            return True
-    
-        return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower()
+# def check_2(spans, cur_span_i):
+#     """检查前面一个span的最后一个单词,如果长度大于5,全都是字母,并且不含大写,就是角标"""
+#     pattern = r'\b[A-Z]\.\s[A-Z][a-z]*\b' # 形如A. Bcde, L. Bcde, 人名的缩写
+#
+#     if cur_span_i==0 and len(spans)>1:
+#         next_span = spans[cur_span_i+1]
+#         next_txt = "".join([c['c'] for c in next_span['chars']])
+#         result = __NLP_MODEL.detect_entity_catgr_using_nlp(next_txt)
+#         if result in ["PERSON", "GPE", "ORG"]:
+#             return True
+#
+#         if re.findall(pattern, next_txt):
+#             return True
+#
+#         return False # not a citation marker
+#     elif cur_span_i==0 and len(spans)==1: # marker takes up the whole line? delete with caution
+#         return False
+#
+#     # if this span is the last span
+#     if cur_span_i==len(spans)-1:
+#         pre_span = spans[cur_span_i-1]
+#         pre_txt = "".join([c['c'] for c in pre_span['chars']])
+#         pre_word = pre_txt.split(' ')[-1]
+#         result = __NLP_MODEL.detect_entity_catgr_using_nlp(pre_txt)
+#         if result in ["PERSON", "GPE", "ORG"]:
+#             return True
+#
+#         if re.findall(pattern, pre_txt):
+#             return True
+#
+#         return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower()
+#     else: # neither the first nor the last span: check whether the marker is closer to the preceding or the following word; it belongs to the closer one
+#         pre_span = spans[cur_span_i-1]
+#         next_span = spans[cur_span_i+1]
+#         cur_span = spans[cur_span_i]
+#         # find the nearest word in the previous and the next span
+#         pre_distance = 10000 # a very large number
+#         next_distance = 10000 # a very large number
+#         for c in pre_span['chars'][::-1]:
+#             if c['c'].isalpha():
+#                 pre_distance = cur_span['bbox'][0] - c['bbox'][2]
+#                 break
+#         for c in next_span['chars']:
+#             if c['c'].isalpha():
+#                 next_distance = c['bbox'][0] - cur_span['bbox'][2]
+#                 break
+#
+#         if pre_distance<next_distance:
+#             belong_to_span = pre_span
+#         else:
+#             belong_to_span = next_span
+#
+#         txt = "".join([c['c'] for c in belong_to_span['chars']])
+#         pre_word = txt.split(' ')[-1]
+#         result = __NLP_MODEL.detect_entity_catgr_using_nlp(txt)
+#         if result in ["PERSON", "GPE", "ORG"]:
+#             return True
+#
+#         if re.findall(pattern, txt):
+#             return True
+#
+#         return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower()
 
 
 def check_3(spans, cur_span_i):
@@ -143,7 +143,10 @@ def remove_citation_marker(with_char_text_blcoks):
                     3. If the superscript contains digits with commas, or digits plus an asterisk, or square brackets, it is almost certainly a citation marker
                     4. Whether the marker belongs to the preceding or the following text is decided by distance: if it is very close to the preceding text it belongs there, otherwise it belongs to the following text
                     """
-                    if check_1(line['spans'], i) or check_2(line['spans'], i) or check_3(line['spans'], i):
+                    if (check_1(line['spans'], i) or
+                        # check_2(line['spans'], i) or
+                        check_3(line['spans'], i)
+                    ):
                         """删除掉这个角标:删除这个span, 同时还要更新line的text"""
                         span_to_del.append(span)
             if len(span_to_del)>0:
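A note on the check_2 removal above: the function is commented out rather than deleted because only one of its signals, the __NLP_MODEL entity lookup, required spacy; the initials regex and the lowercase-last-word test are dependency-free. A minimal sketch of what a spacy-free variant could look like (hypothetical, not part of this commit):

    import re

    # matches abbreviated person names such as "A. Bcde"
    _INITIALS = re.compile(r'\b[A-Z]\.\s[A-Z][a-z]*\b')

    def check_2_no_nlp(spans, cur_span_i):
        """check_2 minus the spacy entity lookup (hypothetical sketch)."""
        if cur_span_i == 0:
            if len(spans) == 1:
                return False  # marker fills the whole line? delete with caution
            next_txt = "".join(c['c'] for c in spans[1]['chars'])
            return bool(_INITIALS.search(next_txt))
        if cur_span_i == len(spans) - 1:
            belong_to_span = spans[cur_span_i - 1]
        else:
            # assign the marker to whichever neighbor's nearest letter is closer
            cur, pre, nxt = spans[cur_span_i], spans[cur_span_i - 1], spans[cur_span_i + 1]
            pre_d = next((cur['bbox'][0] - c['bbox'][2]
                          for c in reversed(pre['chars']) if c['c'].isalpha()), 10000)
            next_d = next((c['bbox'][0] - cur['bbox'][2]
                           for c in nxt['chars'] if c['c'].isalpha()), 10000)
            belong_to_span = pre if pre_d < next_d else nxt
        txt = "".join(c['c'] for c in belong_to_span['chars'])
        last_word = txt.split(' ')[-1]
        return bool(_INITIALS.search(txt)) or (
            len(last_word) > 5 and last_word.isalpha() and last_word.islower())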

+ 23 - 3
magic_pdf/user_api.py

@@ -12,6 +12,8 @@
 Everything else, such as constructing the s3cli and obtaining the ak/sk, is implemented in code-clean. Do not introduce reverse dependencies!!!
 
 """
+import re
+
 from loguru import logger
 
 from magic_pdf.rw import AbsReaderWriter
@@ -78,9 +80,27 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
             return None
 
     pdf_info_dict = parse_pdf(parse_pdf_by_txt)
-
-    if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
-        logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
+    text_all = ""
+    for page_dict in (pdf_info_dict or {}).get('pdf_info', []):  # guard: parse_pdf may return None on failure
+        for para_block in page_dict['para_blocks']:
+            if para_block['type'] in ['title', 'text']:
+                for line in para_block['lines']:
+                    for span in line['spans']:
+                        text_all += span['content']
+
+    def calculate_garbled_rate(text):
+        garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
+        # count the characters classified as garbled
+        garbage_count = len(garbage_regex.findall(text))
+        total = len(text)
+        if total == 0:
+            return 0  # avoid division by zero
+        return garbage_count / total
+
+    garbled_rate = calculate_garbled_rate(text_all)
+
+    if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False) or garbled_rate > 0.8:
+        logger.warning("parse_pdf_by_txt dropped or errored, or garbled_rate too high; switching to parse_pdf_by_ocr")
         pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
         if pdf_info_dict is None:
             raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
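A note on the new garbled-rate gate: the regex whitelists CJK ideographs (\u4e00-\u9fa5), ASCII digits and letters, CJK punctuation (\u3000-\u303f), and halfwidth/fullwidth forms (\uff00-\uffef); everything outside those ranges, including plain spaces and ASCII punctuation, counts toward the rate, which is why the 0.8 threshold is deliberately loose. A quick standalone check (sample strings are made up):

    import re

    def calculate_garbled_rate(text):
        # chars outside the whitelisted Unicode ranges count as garbage
        garbage_regex = re.compile(
            r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
        total = len(text)
        if total == 0:
            return 0  # avoid division by zero
        return len(garbage_regex.findall(text)) / total

    print(calculate_garbled_rate("正常中文Text123"))          # 0.0: all chars whitelisted
    print(calculate_garbled_rate("\ufffd\ufffd\ufffd abc"))  # ~0.57: U+FFFD and the space count as garbage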

+ 0 - 3
requirements.txt

@@ -9,11 +9,8 @@ numpy>=1.21.6
 pandas>=1.3.5
 pycld2>=0.41
 regex>=2023.12.25
-spacy>=3.7.4
 termcolor>=2.4.0
 wordninja>=2.0.0
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
-zh_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.7.0/zh_core_web_sm-3.7.0-py3-none-any.whl
 scikit-learn>=1.0.2
 nltk==3.8.1
 s3pathlib>=2.1.1