# self_modify.py
  1. import time
  2. import copy
  3. import base64
  4. import cv2
  5. import numpy as np
  6. from io import BytesIO
  7. from PIL import Image
  8. from paddleocr import PaddleOCR
  9. from paddleocr.ppocr.utils.logging import get_logger
  10. from paddleocr.ppocr.utils.utility import check_and_read, alpha_to_color, binarize_img
  11. from paddleocr.tools.infer.utility import draw_ocr_box_txt, get_rotate_crop_image, get_minarea_rect_crop
# Shared module-level logger obtained from PaddleOCR's logging utilities.
logger = get_logger()
  13. def img_decode(content: bytes):
  14. np_arr = np.frombuffer(content, dtype=np.uint8)
  15. return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)
  16. def check_img(img):
  17. if isinstance(img, bytes):
  18. img = img_decode(img)
  19. if isinstance(img, str):
  20. image_file = img
  21. img, flag_gif, flag_pdf = check_and_read(image_file)
  22. if not flag_gif and not flag_pdf:
  23. with open(image_file, 'rb') as f:
  24. img_str = f.read()
  25. img = img_decode(img_str)
  26. if img is None:
  27. try:
  28. buf = BytesIO()
  29. image = BytesIO(img_str)
  30. im = Image.open(image)
  31. rgb = im.convert('RGB')
  32. rgb.save(buf, 'jpeg')
  33. buf.seek(0)
  34. image_bytes = buf.read()
  35. data_base64 = str(base64.b64encode(image_bytes),
  36. encoding="utf-8")
  37. image_decode = base64.b64decode(data_base64)
  38. img_array = np.frombuffer(image_decode, np.uint8)
  39. img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
  40. except:
  41. logger.error("error in loading image:{}".format(image_file))
  42. return None
  43. if img is None:
  44. logger.error("error in loading image:{}".format(image_file))
  45. return None
  46. if isinstance(img, np.ndarray) and len(img.shape) == 2:
  47. img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
  48. return img
  49. def sorted_boxes(dt_boxes):
  50. """
  51. Sort text boxes in order from top to bottom, left to right
  52. args:
  53. dt_boxes(array):detected text boxes with shape [4, 2]
  54. return:
  55. sorted boxes(array) with shape [4, 2]
  56. """
  57. num_boxes = dt_boxes.shape[0]
  58. sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))
  59. _boxes = list(sorted_boxes)
  60. for i in range(num_boxes - 1):
  61. for j in range(i, -1, -1):
  62. if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \
  63. (_boxes[j + 1][0][0] < _boxes[j][0][0]):
  64. tmp = _boxes[j]
  65. _boxes[j] = _boxes[j + 1]
  66. _boxes[j + 1] = tmp
  67. else:
  68. break
  69. return _boxes
  70. def formula_in_text(mf_bbox, text_bbox):
  71. x1, y1, x2, y2 = mf_bbox
  72. x3, y3 = text_bbox[0]
  73. x4, y4 = text_bbox[2]
  74. left_box, right_box = None, None
  75. same_line = abs((y1+y2)/2 - (y3+y4)/2) / abs(y4-y3) < 0.2
  76. if not same_line:
  77. return False, left_box, right_box
  78. else:
  79. drop_origin = False
  80. left_x = x1 - 1
  81. right_x = x2 + 1
  82. if x3 < x1 and x2 < x4:
  83. drop_origin = True
  84. left_box = np.array([text_bbox[0], [left_x, text_bbox[1][1]], [left_x, text_bbox[2][1]], text_bbox[3]]).astype('float32')
  85. right_box = np.array([[right_x, text_bbox[0][1]], text_bbox[1], text_bbox[2], [right_x, text_bbox[3][1]]]).astype('float32')
  86. if x3 < x1 and x1 <= x4 <= x2:
  87. drop_origin = True
  88. left_box = np.array([text_bbox[0], [left_x, text_bbox[1][1]], [left_x, text_bbox[2][1]], text_bbox[3]]).astype('float32')
  89. if x1 <= x3 <= x2 and x2 < x4:
  90. drop_origin = True
  91. right_box = np.array([[right_x, text_bbox[0][1]], text_bbox[1], text_bbox[2], [right_x, text_bbox[3][1]]]).astype('float32')
  92. if x1 <= x3 < x4 <= x2:
  93. drop_origin = True
  94. return drop_origin, left_box, right_box
  95. def update_det_boxes(dt_boxes, mfdetrec_res):
  96. new_dt_boxes = dt_boxes
  97. for mf_box in mfdetrec_res:
  98. flag, left_box, right_box = False, None, None
  99. for idx, text_box in enumerate(new_dt_boxes):
  100. ret, left_box, right_box = formula_in_text(mf_box['bbox'], text_box)
  101. if ret:
  102. new_dt_boxes.pop(idx)
  103. if left_box is not None:
  104. new_dt_boxes.append(left_box)
  105. if right_box is not None:
  106. new_dt_boxes.append(right_box)
  107. break
  108. return new_dt_boxes
  109. class ModifiedPaddleOCR(PaddleOCR):
  110. def ocr(self, img, det=True, rec=True, cls=True, bin=False, inv=False, mfd_res=None, alpha_color=(255, 255, 255)):
  111. """
  112. OCR with PaddleOCR
  113. args:
  114. img: img for OCR, support ndarray, img_path and list or ndarray
  115. det: use text detection or not. If False, only rec will be exec. Default is True
  116. rec: use text recognition or not. If False, only det will be exec. Default is True
  117. cls: use angle classifier or not. Default is True. If True, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False.
  118. bin: binarize image to black and white. Default is False.
  119. inv: invert image colors. Default is False.
  120. alpha_color: set RGB color Tuple for transparent parts replacement. Default is pure white.
  121. """
  122. assert isinstance(img, (np.ndarray, list, str, bytes))
  123. if isinstance(img, list) and det == True:
  124. logger.error('When input a list of images, det must be false')
  125. exit(0)
  126. if cls == True and self.use_angle_cls == False:
  127. logger.warning(
  128. 'Since the angle classifier is not initialized, it will not be used during the forward process'
  129. )
  130. img = check_img(img)
  131. # for infer pdf file
  132. if isinstance(img, list):
  133. if self.page_num > len(img) or self.page_num == 0:
  134. self.page_num = len(img)
  135. imgs = img[:self.page_num]
  136. else:
  137. imgs = [img]
  138. def preprocess_image(_image):
  139. _image = alpha_to_color(_image, alpha_color)
  140. if inv:
  141. _image = cv2.bitwise_not(_image)
  142. if bin:
  143. _image = binarize_img(_image)
  144. return _image
  145. if det and rec:
  146. ocr_res = []
  147. for idx, img in enumerate(imgs):
  148. img = preprocess_image(img)
  149. dt_boxes, rec_res, _ = self.__call__(img, cls, mfd_res=mfd_res)
  150. if not dt_boxes and not rec_res:
  151. ocr_res.append(None)
  152. continue
  153. tmp_res = [[box.tolist(), res]
  154. for box, res in zip(dt_boxes, rec_res)]
  155. ocr_res.append(tmp_res)
  156. return ocr_res
  157. elif det and not rec:
  158. ocr_res = []
  159. for idx, img in enumerate(imgs):
  160. img = preprocess_image(img)
  161. dt_boxes, elapse = self.text_detector(img)
  162. if not dt_boxes:
  163. ocr_res.append(None)
  164. continue
  165. tmp_res = [box.tolist() for box in dt_boxes]
  166. ocr_res.append(tmp_res)
  167. return ocr_res
  168. else:
  169. ocr_res = []
  170. cls_res = []
  171. for idx, img in enumerate(imgs):
  172. if not isinstance(img, list):
  173. img = preprocess_image(img)
  174. img = [img]
  175. if self.use_angle_cls and cls:
  176. img, cls_res_tmp, elapse = self.text_classifier(img)
  177. if not rec:
  178. cls_res.append(cls_res_tmp)
  179. rec_res, elapse = self.text_recognizer(img)
  180. ocr_res.append(rec_res)
  181. if not rec:
  182. return cls_res
  183. return ocr_res
  184. def __call__(self, img, cls=True, mfd_res=None):
  185. time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
  186. if img is None:
  187. logger.debug("no valid image provided")
  188. return None, None, time_dict
  189. start = time.time()
  190. ori_im = img.copy()
  191. dt_boxes, elapse = self.text_detector(img)
  192. time_dict['det'] = elapse
  193. if dt_boxes is None:
  194. logger.debug("no dt_boxes found, elapsed : {}".format(elapse))
  195. end = time.time()
  196. time_dict['all'] = end - start
  197. return None, None, time_dict
  198. else:
  199. logger.debug("dt_boxes num : {}, elapsed : {}".format(
  200. len(dt_boxes), elapse))
  201. img_crop_list = []
  202. dt_boxes = sorted_boxes(dt_boxes)
  203. if mfd_res:
  204. bef = time.time()
  205. dt_boxes = update_det_boxes(dt_boxes, mfd_res)
  206. aft = time.time()
  207. logger.debug("split text box by formula, new dt_boxes num : {}, elapsed : {}".format(
  208. len(dt_boxes), aft-bef))
  209. for bno in range(len(dt_boxes)):
  210. tmp_box = copy.deepcopy(dt_boxes[bno])
  211. if self.args.det_box_type == "quad":
  212. img_crop = get_rotate_crop_image(ori_im, tmp_box)
  213. else:
  214. img_crop = get_minarea_rect_crop(ori_im, tmp_box)
  215. img_crop_list.append(img_crop)
  216. if self.use_angle_cls and cls:
  217. img_crop_list, angle_list, elapse = self.text_classifier(
  218. img_crop_list)
  219. time_dict['cls'] = elapse
  220. logger.debug("cls num : {}, elapsed : {}".format(
  221. len(img_crop_list), elapse))
  222. rec_res, elapse = self.text_recognizer(img_crop_list)
  223. time_dict['rec'] = elapse
  224. logger.debug("rec_res num : {}, elapsed : {}".format(
  225. len(rec_res), elapse))
  226. if self.args.save_crop_res:
  227. self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list,
  228. rec_res)
  229. filter_boxes, filter_rec_res = [], []
  230. for box, rec_result in zip(dt_boxes, rec_res):
  231. text, score = rec_result
  232. if score >= self.drop_score:
  233. filter_boxes.append(box)
  234. filter_rec_res.append(rec_result)
  235. end = time.time()
  236. time_dict['all'] = end - start
  237. return filter_boxes, filter_rec_res, time_dict