check_inline_formula.py 4.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273
  1. # 最终版:把那种text_block有重叠,且inline_formula位置在重叠部分的,认定整个页面都有问题,所有的inline_formula都改成no_check
  2. from magic_pdf.libs import fitz
  3. def check_inline_formula(page, inline_formula_boxes):
  4. """
  5. :param page :fitz读取的当前页的内容
  6. :param inline_formula_boxes: list类型,每一个元素是一个元祖 (L, U, R, D)
  7. :return: inline_formula_check: list类型,每一个元素是一个类别,其顺序对应输入的inline_formula_boxes,给每个行内公式打一个标签,包括:
  8. - nocheck_inline_formula:这个公式框没有与任何span相交,有可能存在问题
  9. - wrong_text_block:这个公式框同时存在多个block里,可能页面的text block存在问题
  10. - false_inline_formula:只涉及一个span并且只占据这个span的小部分面积,判断可能不是公式
  11. - true_inline_formula:两种情况判断为公式,一是横跨多个span,二是只涉及一个span但是几乎占据了这个span大部分的面积
  12. """
  13. # count = defaultdict(int)
  14. ## ------------------------ Text --------------------------------------------
  15. blocks = page.get_text(
  16. "dict",
  17. flags=fitz.TEXTFLAGS_TEXT,
  18. #clip=clip,
  19. )["blocks"]
  20. # iterate over the bboxes
  21. inline_formula_check = []
  22. for result in inline_formula_boxes:
  23. (x1, y1, x2, y2) = (result[0], result[1], result[2], result[3])
  24. ## 逐个block##
  25. in_block = 0
  26. for bbox in blocks:
  27. # image = cv2.rectangle(image, (int(bbox['bbox'][0]), int(bbox['bbox'][1])), (int(bbox['bbox'][2]), int(bbox['bbox'][3])), (0, 255, 0), 1)
  28. if (y1 >= bbox['bbox'][1] and y2 <= bbox['bbox'][3]) and (x1 >= bbox['bbox'][0] and x2 <= bbox['bbox'][2]): # 判定公式在哪一个block
  29. in_block += 1
  30. intersect = []
  31. # ## 逐个span###
  32. for line in bbox['lines']:
  33. if line['bbox'][1] <= ((y2 - y1) / 2) + y1 <= line['bbox'][3]: # 判断公式在哪一行
  34. for item in line['spans']:
  35. (t_x1, t_y1, t_x2, t_y2) = item['bbox']
  36. if not ((t_x1 < x1 and t_x2 < x1) or (t_x1 > x2 and t_x2 > x2) or (t_y1 < y1 and t_y2 < y1) or (t_y1 > y2 and t_y2 > y2)): # 判断是否相交
  37. intersect.append(item['bbox'])
  38. # image = cv2.rectangle(image, (int(t_x1), int(t_y1)), (int(t_x2), int(t_y2)), (0, 255, 0), 1) # 可视化涉及到的span
  39. # 可视化公式的分类
  40. if len(intersect) == 0: # 没有与任何一个span有相交,这个span或者这个inline_formula_box可能有问题
  41. # print(f'Wrong location, check {img_path}')
  42. inline_formula_check_result = "nocheck_inline_formula"
  43. # count['not_in_line'] += 1
  44. elif len(intersect) == 1:
  45. if abs((intersect[0][2] - intersect[0][0]) - (x2 - x1)) < (x2 - x1)*0.5: # 只涉及一个span但是几乎占据了这个span大部分的面积,判定为公式
  46. # image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 1)
  47. inline_formula_check_result = "true_inline_formula"
  48. # count['one_span_large'] += 1
  49. else: # 只涉及一个span并且只占据这个span的小部分面积,判断可能不是公式
  50. # image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (0, 0, 255), 1)
  51. inline_formula_check_result = "false_inline_formula"
  52. # count['fail'] += 1
  53. else: # 横跨多个span,判定为公式
  54. # image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (255, 0, 0), 1)
  55. inline_formula_check_result = "true_inline_formula"
  56. # count['multi_span'] += 1
  57. if in_block == 0: # 这个公式没有在任何的block里,这个公式可能有问题
  58. # image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (255, 255, 0), 1)
  59. inline_formula_check_result = "nocheck_inline_formula"
  60. # count['not_in_block'] += 1
  61. elif in_block > 1: # 这个公式存在于多个block里,这个页面可能有问题
  62. inline_formula_check_result = "wrong_text_block"
  63. inline_formula_check.append(inline_formula_check_result)
  64. return inline_formula_check