# commons.py
# Shared helpers: PDF opening, colored console printing, bbox geometry checks,
# and small text/list predicates.
  1. import sys
  2. from magic_pdf.libs.commons import fitz
  3. from termcolor import cprint
  4. if sys.version_info[0] >= 3:
  5. sys.stdout.reconfigure(encoding="utf-8") # type: ignore
  6. def open_pdf(pdf_path):
  7. try:
  8. pdf_document = fitz.open(pdf_path) # type: ignore
  9. return pdf_document
  10. except Exception as e:
  11. print(f"无法打开PDF文件:{pdf_path}。原因是:{e}")
  12. raise e
  13. def print_green_on_red(text):
  14. cprint(text, "green", "on_red", attrs=["bold"], end="\n\n")
  15. def print_green(text):
  16. print()
  17. cprint(text, "green", attrs=["bold"], end="\n\n")
  18. def print_red(text):
  19. print()
  20. cprint(text, "red", attrs=["bold"], end="\n\n")
  21. def print_yellow(text):
  22. print()
  23. cprint(text, "yellow", attrs=["bold"], end="\n\n")
  24. def safe_get(dict_obj, key, default):
  25. val = dict_obj.get(key)
  26. if val is None:
  27. return default
  28. else:
  29. return val
  30. def is_bbox_overlap(bbox1, bbox2):
  31. """
  32. This function checks if bbox1 and bbox2 overlap or not
  33. Parameters
  34. ----------
  35. bbox1 : list
  36. bbox1
  37. bbox2 : list
  38. bbox2
  39. Returns
  40. -------
  41. bool
  42. True if bbox1 and bbox2 overlap, else False
  43. """
  44. x0_1, y0_1, x1_1, y1_1 = bbox1
  45. x0_2, y0_2, x1_2, y1_2 = bbox2
  46. if x0_1 > x1_2 or x0_2 > x1_1:
  47. return False
  48. if y0_1 > y1_2 or y0_2 > y1_1:
  49. return False
  50. return True
  51. def is_in_bbox(bbox1, bbox2):
  52. """
  53. This function checks if bbox1 is in bbox2
  54. Parameters
  55. ----------
  56. bbox1 : list
  57. bbox1
  58. bbox2 : list
  59. bbox2
  60. Returns
  61. -------
  62. bool
  63. True if bbox1 is in bbox2, else False
  64. """
  65. x0_1, y0_1, x1_1, y1_1 = bbox1
  66. x0_2, y0_2, x1_2, y1_2 = bbox2
  67. if x0_1 >= x0_2 and y0_1 >= y0_2 and x1_1 <= x1_2 and y1_1 <= y1_2:
  68. return True
  69. else:
  70. return False
  71. def calculate_para_bbox(lines):
  72. """
  73. This function calculates the minimum bbox of the paragraph
  74. Parameters
  75. ----------
  76. lines : list
  77. lines
  78. Returns
  79. -------
  80. para_bbox : list
  81. bbox of the paragraph
  82. """
  83. x0 = min(line["bbox"][0] for line in lines)
  84. y0 = min(line["bbox"][1] for line in lines)
  85. x1 = max(line["bbox"][2] for line in lines)
  86. y1 = max(line["bbox"][3] for line in lines)
  87. return [x0, y0, x1, y1]
  88. def is_line_right_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
  89. """
  90. This function checks if the line is right aligned from its neighbors
  91. Parameters
  92. ----------
  93. curr_line_bbox : list
  94. bbox of the current line
  95. prev_line_bbox : list
  96. bbox of the previous line
  97. next_line_bbox : list
  98. bbox of the next line
  99. avg_char_width : float
  100. average of char widths
  101. direction : int
  102. 0 for prev, 1 for next, 2 for both
  103. Returns
  104. -------
  105. bool
  106. True if the line is right aligned from its neighbors, False otherwise.
  107. """
  108. horizontal_ratio = 0.5
  109. horizontal_thres = horizontal_ratio * avg_char_width
  110. _, _, x1, _ = curr_line_bbox
  111. _, _, prev_x1, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0)
  112. _, _, next_x1, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
  113. if direction == 0:
  114. return abs(x1 - prev_x1) < horizontal_thres
  115. elif direction == 1:
  116. return abs(x1 - next_x1) < horizontal_thres
  117. elif direction == 2:
  118. return abs(x1 - prev_x1) < horizontal_thres and abs(x1 - next_x1) < horizontal_thres
  119. else:
  120. return False
  121. def is_line_left_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
  122. """
  123. This function checks if the line is left aligned from its neighbors
  124. Parameters
  125. ----------
  126. curr_line_bbox : list
  127. bbox of the current line
  128. prev_line_bbox : list
  129. bbox of the previous line
  130. next_line_bbox : list
  131. bbox of the next line
  132. avg_char_width : float
  133. average of char widths
  134. direction : int
  135. 0 for prev, 1 for next, 2 for both
  136. Returns
  137. -------
  138. bool
  139. True if the line is left aligned from its neighbors, False otherwise.
  140. """
  141. horizontal_ratio = 0.5
  142. horizontal_thres = horizontal_ratio * avg_char_width
  143. x0, _, _, _ = curr_line_bbox
  144. prev_x0, _, _, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0)
  145. next_x0, _, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
  146. if direction == 0:
  147. return abs(x0 - prev_x0) < horizontal_thres
  148. elif direction == 1:
  149. return abs(x0 - next_x0) < horizontal_thres
  150. elif direction == 2:
  151. return abs(x0 - prev_x0) < horizontal_thres and abs(x0 - next_x0) < horizontal_thres
  152. else:
  153. return False
  154. def end_with_punctuation(line_text):
  155. """
  156. This function checks if the line ends with punctuation marks
  157. """
  158. english_end_puncs = [".", "?", "!"]
  159. chinese_end_puncs = ["。", "?", "!"]
  160. end_puncs = english_end_puncs + chinese_end_puncs
  161. last_non_space_char = None
  162. for ch in line_text[::-1]:
  163. if not ch.isspace():
  164. last_non_space_char = ch
  165. break
  166. if last_non_space_char is None:
  167. return False
  168. return last_non_space_char in end_puncs
  169. def is_nested_list(lst):
  170. if isinstance(lst, list):
  171. return any(isinstance(sub, list) for sub in lst)
  172. return False