ocr_utils.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306
  1. import cv2
  2. import numpy as np
  3. from loguru import logger
  4. from io import BytesIO
  5. from PIL import Image
  6. import base64
  7. from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold
  8. from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line
  9. from ppocr.utils.utility import check_and_read
  10. def img_decode(content: bytes):
  11. np_arr = np.frombuffer(content, dtype=np.uint8)
  12. return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)
  13. def check_img(img):
  14. if isinstance(img, bytes):
  15. img = img_decode(img)
  16. if isinstance(img, str):
  17. image_file = img
  18. img, flag_gif, flag_pdf = check_and_read(image_file)
  19. if not flag_gif and not flag_pdf:
  20. with open(image_file, 'rb') as f:
  21. img_str = f.read()
  22. img = img_decode(img_str)
  23. if img is None:
  24. try:
  25. buf = BytesIO()
  26. image = BytesIO(img_str)
  27. im = Image.open(image)
  28. rgb = im.convert('RGB')
  29. rgb.save(buf, 'jpeg')
  30. buf.seek(0)
  31. image_bytes = buf.read()
  32. data_base64 = str(base64.b64encode(image_bytes),
  33. encoding="utf-8")
  34. image_decode = base64.b64decode(data_base64)
  35. img_array = np.frombuffer(image_decode, np.uint8)
  36. img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
  37. except:
  38. logger.error("error in loading image:{}".format(image_file))
  39. return None
  40. if img is None:
  41. logger.error("error in loading image:{}".format(image_file))
  42. return None
  43. if isinstance(img, np.ndarray) and len(img.shape) == 2:
  44. img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
  45. return img
  46. def bbox_to_points(bbox):
  47. """ 将bbox格式转换为四个顶点的数组 """
  48. x0, y0, x1, y1 = bbox
  49. return np.array([[x0, y0], [x1, y0], [x1, y1], [x0, y1]]).astype('float32')
  50. def points_to_bbox(points):
  51. """ 将四个顶点的数组转换为bbox格式 """
  52. x0, y0 = points[0]
  53. x1, _ = points[1]
  54. _, y1 = points[2]
  55. return [x0, y0, x1, y1]
  56. def merge_intervals(intervals):
  57. # Sort the intervals based on the start value
  58. intervals.sort(key=lambda x: x[0])
  59. merged = []
  60. for interval in intervals:
  61. # If the list of merged intervals is empty or if the current
  62. # interval does not overlap with the previous, simply append it.
  63. if not merged or merged[-1][1] < interval[0]:
  64. merged.append(interval)
  65. else:
  66. # Otherwise, there is overlap, so we merge the current and previous intervals.
  67. merged[-1][1] = max(merged[-1][1], interval[1])
  68. return merged
  69. def remove_intervals(original, masks):
  70. # Merge all mask intervals
  71. merged_masks = merge_intervals(masks)
  72. result = []
  73. original_start, original_end = original
  74. for mask in merged_masks:
  75. mask_start, mask_end = mask
  76. # If the mask starts after the original range, ignore it
  77. if mask_start > original_end:
  78. continue
  79. # If the mask ends before the original range starts, ignore it
  80. if mask_end < original_start:
  81. continue
  82. # Remove the masked part from the original range
  83. if original_start < mask_start:
  84. result.append([original_start, mask_start - 1])
  85. original_start = max(mask_end + 1, original_start)
  86. # Add the remaining part of the original range, if any
  87. if original_start <= original_end:
  88. result.append([original_start, original_end])
  89. return result
  90. def update_det_boxes(dt_boxes, mfd_res):
  91. new_dt_boxes = []
  92. angle_boxes_list = []
  93. for text_box in dt_boxes:
  94. if calculate_is_angle(text_box):
  95. angle_boxes_list.append(text_box)
  96. continue
  97. text_bbox = points_to_bbox(text_box)
  98. masks_list = []
  99. for mf_box in mfd_res:
  100. mf_bbox = mf_box['bbox']
  101. if __is_overlaps_y_exceeds_threshold(text_bbox, mf_bbox):
  102. masks_list.append([mf_bbox[0], mf_bbox[2]])
  103. text_x_range = [text_bbox[0], text_bbox[2]]
  104. text_remove_mask_range = remove_intervals(text_x_range, masks_list)
  105. temp_dt_box = []
  106. for text_remove_mask in text_remove_mask_range:
  107. temp_dt_box.append(bbox_to_points([text_remove_mask[0], text_bbox[1], text_remove_mask[1], text_bbox[3]]))
  108. if len(temp_dt_box) > 0:
  109. new_dt_boxes.extend(temp_dt_box)
  110. new_dt_boxes.extend(angle_boxes_list)
  111. return new_dt_boxes
  112. def merge_overlapping_spans(spans):
  113. """
  114. Merges overlapping spans on the same line.
  115. :param spans: A list of span coordinates [(x1, y1, x2, y2), ...]
  116. :return: A list of merged spans
  117. """
  118. # Return an empty list if the input spans list is empty
  119. if not spans:
  120. return []
  121. # Sort spans by their starting x-coordinate
  122. spans.sort(key=lambda x: x[0])
  123. # Initialize the list of merged spans
  124. merged = []
  125. for span in spans:
  126. # Unpack span coordinates
  127. x1, y1, x2, y2 = span
  128. # If the merged list is empty or there's no horizontal overlap, add the span directly
  129. if not merged or merged[-1][2] < x1:
  130. merged.append(span)
  131. else:
  132. # If there is horizontal overlap, merge the current span with the previous one
  133. last_span = merged.pop()
  134. # Update the merged span's top-left corner to the smaller (x1, y1) and bottom-right to the larger (x2, y2)
  135. x1 = min(last_span[0], x1)
  136. y1 = min(last_span[1], y1)
  137. x2 = max(last_span[2], x2)
  138. y2 = max(last_span[3], y2)
  139. # Add the merged span back to the list
  140. merged.append((x1, y1, x2, y2))
  141. # Return the list of merged spans
  142. return merged
  143. def merge_det_boxes(dt_boxes):
  144. """
  145. Merge detection boxes.
  146. This function takes a list of detected bounding boxes, each represented by four corner points.
  147. The goal is to merge these bounding boxes into larger text regions.
  148. Parameters:
  149. dt_boxes (list): A list containing multiple text detection boxes, where each box is defined by four corner points.
  150. Returns:
  151. list: A list containing the merged text regions, where each region is represented by four corner points.
  152. """
  153. # Convert the detection boxes into a dictionary format with bounding boxes and type
  154. dt_boxes_dict_list = []
  155. angle_boxes_list = []
  156. for text_box in dt_boxes:
  157. text_bbox = points_to_bbox(text_box)
  158. if calculate_is_angle(text_box):
  159. angle_boxes_list.append(text_box)
  160. continue
  161. text_box_dict = {
  162. 'bbox': text_bbox,
  163. 'type': 'text',
  164. }
  165. dt_boxes_dict_list.append(text_box_dict)
  166. # Merge adjacent text regions into lines
  167. lines = merge_spans_to_line(dt_boxes_dict_list)
  168. # Initialize a new list for storing the merged text regions
  169. new_dt_boxes = []
  170. for line in lines:
  171. line_bbox_list = []
  172. for span in line:
  173. line_bbox_list.append(span['bbox'])
  174. # Merge overlapping text regions within the same line
  175. merged_spans = merge_overlapping_spans(line_bbox_list)
  176. # Convert the merged text regions back to point format and add them to the new detection box list
  177. for span in merged_spans:
  178. new_dt_boxes.append(bbox_to_points(span))
  179. new_dt_boxes.extend(angle_boxes_list)
  180. return new_dt_boxes
  181. def get_adjusted_mfdetrec_res(single_page_mfdetrec_res, useful_list):
  182. paste_x, paste_y, xmin, ymin, xmax, ymax, new_width, new_height = useful_list
  183. # Adjust the coordinates of the formula area
  184. adjusted_mfdetrec_res = []
  185. for mf_res in single_page_mfdetrec_res:
  186. mf_xmin, mf_ymin, mf_xmax, mf_ymax = mf_res["bbox"]
  187. # Adjust the coordinates of the formula area to the coordinates relative to the cropping area
  188. x0 = mf_xmin - xmin + paste_x
  189. y0 = mf_ymin - ymin + paste_y
  190. x1 = mf_xmax - xmin + paste_x
  191. y1 = mf_ymax - ymin + paste_y
  192. # Filter formula blocks outside the graph
  193. if any([x1 < 0, y1 < 0]) or any([x0 > new_width, y0 > new_height]):
  194. continue
  195. else:
  196. adjusted_mfdetrec_res.append({
  197. "bbox": [x0, y0, x1, y1],
  198. })
  199. return adjusted_mfdetrec_res
  200. def get_ocr_result_list(ocr_res, useful_list):
  201. paste_x, paste_y, xmin, ymin, xmax, ymax, new_width, new_height = useful_list
  202. ocr_result_list = []
  203. for box_ocr_res in ocr_res:
  204. if len(box_ocr_res) == 2:
  205. p1, p2, p3, p4 = box_ocr_res[0]
  206. text, score = box_ocr_res[1]
  207. # logger.info(f"text: {text}, score: {score}")
  208. if score < 0.6: # 过滤低置信度的结果
  209. continue
  210. else:
  211. p1, p2, p3, p4 = box_ocr_res
  212. text, score = "", 1
  213. # average_angle_degrees = calculate_angle_degrees(box_ocr_res[0])
  214. # if average_angle_degrees > 0.5:
  215. poly = [p1, p2, p3, p4]
  216. if calculate_is_angle(poly):
  217. # logger.info(f"average_angle_degrees: {average_angle_degrees}, text: {text}")
  218. # 与x轴的夹角超过0.5度,对边界做一下矫正
  219. # 计算几何中心
  220. x_center = sum(point[0] for point in poly) / 4
  221. y_center = sum(point[1] for point in poly) / 4
  222. new_height = ((p4[1] - p1[1]) + (p3[1] - p2[1])) / 2
  223. new_width = p3[0] - p1[0]
  224. p1 = [x_center - new_width / 2, y_center - new_height / 2]
  225. p2 = [x_center + new_width / 2, y_center - new_height / 2]
  226. p3 = [x_center + new_width / 2, y_center + new_height / 2]
  227. p4 = [x_center - new_width / 2, y_center + new_height / 2]
  228. # Convert the coordinates back to the original coordinate system
  229. p1 = [p1[0] - paste_x + xmin, p1[1] - paste_y + ymin]
  230. p2 = [p2[0] - paste_x + xmin, p2[1] - paste_y + ymin]
  231. p3 = [p3[0] - paste_x + xmin, p3[1] - paste_y + ymin]
  232. p4 = [p4[0] - paste_x + xmin, p4[1] - paste_y + ymin]
  233. ocr_result_list.append({
  234. 'category_id': 15,
  235. 'poly': p1 + p2 + p3 + p4,
  236. 'score': float(round(score, 2)),
  237. 'text': text,
  238. })
  239. return ocr_result_list
  240. def calculate_is_angle(poly):
  241. p1, p2, p3, p4 = poly
  242. height = ((p4[1] - p1[1]) + (p3[1] - p2[1])) / 2
  243. if 0.8 * height <= (p3[1] - p1[1]) <= 1.2 * height:
  244. return False
  245. else:
  246. # logger.info((p3[1] - p1[1])/height)
  247. return True