citationmarker_remove.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
  1. """
  2. 去掉正文的引文引用marker
  3. https://aicarrier.feishu.cn/wiki/YLOPwo1PGiwFRdkwmyhcZmr0n3d
  4. """
  5. import re
  6. # from magic_pdf.libs.nlp_utils import NLPModels
  7. # __NLP_MODEL = NLPModels()
  8. def check_1(spans, cur_span_i):
  9. """寻找前一个char,如果是句号,逗号,那么就是角标"""
  10. if cur_span_i==0:
  11. return False # 不是角标
  12. pre_span = spans[cur_span_i-1]
  13. pre_char = pre_span['chars'][-1]['c']
  14. if pre_char in ['。', ',', '.', ',']:
  15. return True
  16. return False
  17. # def check_2(spans, cur_span_i):
  18. # """检查前面一个span的最后一个单词,如果长度大于5,全都是字母,并且不含大写,就是角标"""
  19. # pattern = r'\b[A-Z]\.\s[A-Z][a-z]*\b' # 形如A. Bcde, L. Bcde, 人名的缩写
  20. #
  21. # if cur_span_i==0 and len(spans)>1:
  22. # next_span = spans[cur_span_i+1]
  23. # next_txt = "".join([c['c'] for c in next_span['chars']])
  24. # result = __NLP_MODEL.detect_entity_catgr_using_nlp(next_txt)
  25. # if result in ["PERSON", "GPE", "ORG"]:
  26. # return True
  27. #
  28. # if re.findall(pattern, next_txt):
  29. # return True
  30. #
  31. # return False # 不是角标
  32. # elif cur_span_i==0 and len(spans)==1: # 角标占用了整行?谨慎删除
  33. # return False
  34. #
  35. # # 如果这个span是最后一个span,
  36. # if cur_span_i==len(spans)-1:
  37. # pre_span = spans[cur_span_i-1]
  38. # pre_txt = "".join([c['c'] for c in pre_span['chars']])
  39. # pre_word = pre_txt.split(' ')[-1]
  40. # result = __NLP_MODEL.detect_entity_catgr_using_nlp(pre_txt)
  41. # if result in ["PERSON", "GPE", "ORG"]:
  42. # return True
  43. #
  44. # if re.findall(pattern, pre_txt):
  45. # return True
  46. #
  47. # return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower()
  48. # else: # 既不是第一个span,也不是最后一个span,那么此时检查一下这个角标距离前后哪个单词更近就属于谁的角标
  49. # pre_span = spans[cur_span_i-1]
  50. # next_span = spans[cur_span_i+1]
  51. # cur_span = spans[cur_span_i]
  52. # # 找到前一个和后一个span里的距离最近的单词
  53. # pre_distance = 10000 # 一个很大的数
  54. # next_distance = 10000 # 一个很大的数
  55. # for c in pre_span['chars'][::-1]:
  56. # if c['c'].isalpha():
  57. # pre_distance = cur_span['bbox'][0] - c['bbox'][2]
  58. # break
  59. # for c in next_span['chars']:
  60. # if c['c'].isalpha():
  61. # next_distance = c['bbox'][0] - cur_span['bbox'][2]
  62. # break
  63. #
  64. # if pre_distance<next_distance:
  65. # belong_to_span = pre_span
  66. # else:
  67. # belong_to_span = next_span
  68. #
  69. # txt = "".join([c['c'] for c in belong_to_span['chars']])
  70. # pre_word = txt.split(' ')[-1]
  71. # result = __NLP_MODEL.detect_entity_catgr_using_nlp(txt)
  72. # if result in ["PERSON", "GPE", "ORG"]:
  73. # return True
  74. #
  75. # if re.findall(pattern, txt):
  76. # return True
  77. #
  78. # return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower()
  79. def check_3(spans, cur_span_i):
  80. """上标里有[], 有*, 有-, 有逗号"""
  81. # 如[2-3],[22]
  82. # 如 2,3,4
  83. cur_span_txt = ''.join(c['c'] for c in spans[cur_span_i]['chars']).strip()
  84. bad_char = ['[', ']', '*', ',']
  85. if any([c in cur_span_txt for c in bad_char]) and any(character.isdigit() for character in cur_span_txt):
  86. return True
  87. # 如2-3, a-b
  88. patterns = [r'\d+-\d+', r'[a-zA-Z]-[a-zA-Z]', r'[a-zA-Z],[a-zA-Z]']
  89. for pattern in patterns:
  90. match = re.match(pattern, cur_span_txt)
  91. if match is not None:
  92. return True
  93. return False
  94. def remove_citation_marker(with_char_text_blcoks):
  95. for blk in with_char_text_blcoks:
  96. for line in blk['lines']:
  97. # 如果span里的个数少于2个,那只能忽略,角标不可能自己独占一行
  98. if len(line['spans'])<=1:
  99. continue
  100. # 找到高度最高的span作为位置比较的基准
  101. max_hi_span = line['spans'][0]['bbox']
  102. min_font_sz = 10000 # line里最小的字体
  103. max_font_sz = 0 # line里最大的字体
  104. for s in line['spans']:
  105. if max_hi_span[3]-max_hi_span[1]<s['bbox'][3]-s['bbox'][1]:
  106. max_hi_span = s['bbox']
  107. if min_font_sz>s['size']:
  108. min_font_sz = s['size']
  109. if max_font_sz<s['size']:
  110. max_font_sz = s['size']
  111. base_span_mid_y = (max_hi_span[3]+max_hi_span[1])/2
  112. span_to_del = []
  113. for i, span in enumerate(line['spans']):
  114. span_hi = span['bbox'][3]-span['bbox'][1]
  115. span_mid_y = (span['bbox'][3]+span['bbox'][1])/2
  116. span_font_sz = span['size']
  117. if max_font_sz-span_font_sz<1: # 先以字体过滤正文,如果是正文就不再继续判断了
  118. continue
  119. # 对被除数为0的情况进行过滤
  120. if span_hi==0 or min_font_sz==0:
  121. continue
  122. if (base_span_mid_y-span_mid_y)/span_hi>0.2 or (base_span_mid_y-span_mid_y>0 and abs(span_font_sz-min_font_sz)/min_font_sz<0.1):
  123. """
  124. 1. 它的前一个char如果是句号或者逗号的话,那么肯定是角标而不是公式
  125. 2. 如果这个角标的前面是一个单词(长度大于5)而不是任何大写或小写的短字母的话 应该也是角标
  126. 3. 上标里有数字和逗号或者数字+星号的组合,方括号,一般肯定就是角标了
  127. 4. 这个角标属于前文还是后文要根据距离来判断,如果距离前面的文本太近,那么就是前面的角标,否则就是后面的角标
  128. """
  129. if (check_1(line['spans'], i) or
  130. # check_2(line['spans'], i) or
  131. check_3(line['spans'], i)
  132. ):
  133. """删除掉这个角标:删除这个span, 同时还要更新line的text"""
  134. span_to_del.append(span)
  135. if len(span_to_del)>0:
  136. for span in span_to_del:
  137. line['spans'].remove(span)
  138. line['text'] = ''.join([c['c'] for s in line['spans'] for c in s['chars']])
  139. return with_char_text_blcoks