block_sort.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. import copy
  3. import os
  4. import statistics
  5. import warnings
  6. from typing import List
  7. import torch
  8. from loguru import logger
  9. from mineru.backend.pipeline.config_reader import get_device, get_local_layoutreader_model_dir
  10. from mineru.utils.enum_class import BlockType
  11. def sort_blocks_by_bbox(blocks, page_w, page_h, footnote_blocks):
  12. """获取所有line并计算正文line的高度"""
  13. line_height = get_line_height(blocks)
  14. """获取所有line并对line排序"""
  15. sorted_bboxes = sort_lines_by_model(blocks, page_w, page_h, line_height, footnote_blocks)
  16. """根据line的中位数算block的序列关系"""
  17. blocks = cal_block_index(blocks, sorted_bboxes)
  18. """将image和table的block还原回group形式参与后续流程"""
  19. blocks = revert_group_blocks(blocks)
  20. """重排block"""
  21. sorted_blocks = sorted(blocks, key=lambda b: b['index'])
  22. """block内重排(img和table的block内多个caption或footnote的排序)"""
  23. for block in sorted_blocks:
  24. if block['type'] in [BlockType.IMAGE, BlockType.TABLE]:
  25. block['blocks'] = sorted(block['blocks'], key=lambda b: b['index'])
  26. return sorted_blocks
  27. def get_line_height(blocks):
  28. page_line_height_list = []
  29. for block in blocks:
  30. if block['type'] in [
  31. BlockType.TEXT, BlockType.TITLE,
  32. BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE,
  33. BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE
  34. ]:
  35. for line in block['lines']:
  36. bbox = line['bbox']
  37. page_line_height_list.append(int(bbox[3] - bbox[1]))
  38. if len(page_line_height_list) > 0:
  39. return statistics.median(page_line_height_list)
  40. else:
  41. return 10
  42. def sort_lines_by_model(fix_blocks, page_w, page_h, line_height, footnote_blocks):
  43. page_line_list = []
  44. def add_lines_to_block(b):
  45. line_bboxes = insert_lines_into_block(b['bbox'], line_height, page_w, page_h)
  46. b['lines'] = []
  47. for line_bbox in line_bboxes:
  48. b['lines'].append({'bbox': line_bbox, 'spans': []})
  49. page_line_list.extend(line_bboxes)
  50. for block in fix_blocks:
  51. if block['type'] in [
  52. BlockType.TEXT, BlockType.TITLE,
  53. BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE,
  54. BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE
  55. ]:
  56. if len(block['lines']) == 0:
  57. add_lines_to_block(block)
  58. elif block['type'] in [BlockType.TITLE] and len(block['lines']) == 1 and (block['bbox'][3] - block['bbox'][1]) > line_height * 2:
  59. block['real_lines'] = copy.deepcopy(block['lines'])
  60. add_lines_to_block(block)
  61. else:
  62. for line in block['lines']:
  63. bbox = line['bbox']
  64. page_line_list.append(bbox)
  65. elif block['type'] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.INTERLINE_EQUATION]:
  66. block['real_lines'] = copy.deepcopy(block['lines'])
  67. add_lines_to_block(block)
  68. for block in footnote_blocks:
  69. footnote_block = {'bbox': block[:4]}
  70. add_lines_to_block(footnote_block)
  71. if len(page_line_list) > 200: # layoutreader最高支持512line
  72. return None
  73. # 使用layoutreader排序
  74. x_scale = 1000.0 / page_w
  75. y_scale = 1000.0 / page_h
  76. boxes = []
  77. # logger.info(f"Scale: {x_scale}, {y_scale}, Boxes len: {len(page_line_list)}")
  78. for left, top, right, bottom in page_line_list:
  79. if left < 0:
  80. logger.warning(
  81. f'left < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
  82. ) # noqa: E501
  83. left = 0
  84. if right > page_w:
  85. logger.warning(
  86. f'right > page_w, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
  87. ) # noqa: E501
  88. right = page_w
  89. if top < 0:
  90. logger.warning(
  91. f'top < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
  92. ) # noqa: E501
  93. top = 0
  94. if bottom > page_h:
  95. logger.warning(
  96. f'bottom > page_h, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
  97. ) # noqa: E501
  98. bottom = page_h
  99. left = round(left * x_scale)
  100. top = round(top * y_scale)
  101. right = round(right * x_scale)
  102. bottom = round(bottom * y_scale)
  103. assert (
  104. 1000 >= right >= left >= 0 and 1000 >= bottom >= top >= 0
  105. ), f'Invalid box. right: {right}, left: {left}, bottom: {bottom}, top: {top}' # noqa: E126, E121
  106. boxes.append([left, top, right, bottom])
  107. model_manager = ModelSingleton()
  108. model = model_manager.get_model('layoutreader')
  109. with torch.no_grad():
  110. orders = do_predict(boxes, model)
  111. sorted_bboxes = [page_line_list[i] for i in orders]
  112. return sorted_bboxes
  113. def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
  114. # block_bbox是一个元组(x0, y0, x1, y1),其中(x0, y0)是左下角坐标,(x1, y1)是右上角坐标
  115. x0, y0, x1, y1 = block_bbox
  116. block_height = y1 - y0
  117. block_weight = x1 - x0
  118. # 如果block高度小于n行正文,则直接返回block的bbox
  119. if line_height * 2 < block_height:
  120. if (
  121. block_height > page_h * 0.25 and page_w * 0.5 > block_weight > page_w * 0.25
  122. ): # 可能是双列结构,可以切细点
  123. lines = int(block_height / line_height)
  124. else:
  125. # 如果block的宽度超过0.4页面宽度,则将block分成3行(是一种复杂布局,图不能切的太细)
  126. if block_weight > page_w * 0.4:
  127. lines = 3
  128. elif block_weight > page_w * 0.25: # (可能是三列结构,也切细点)
  129. lines = int(block_height / line_height)
  130. else: # 判断长宽比
  131. if block_height / block_weight > 1.2: # 细长的不分
  132. return [[x0, y0, x1, y1]]
  133. else: # 不细长的还是分成两行
  134. lines = 2
  135. line_height = (y1 - y0) / lines
  136. # 确定从哪个y位置开始绘制线条
  137. current_y = y0
  138. # 用于存储线条的位置信息[(x0, y), ...]
  139. lines_positions = []
  140. for i in range(lines):
  141. lines_positions.append([x0, current_y, x1, current_y + line_height])
  142. current_y += line_height
  143. return lines_positions
  144. else:
  145. return [[x0, y0, x1, y1]]
  146. def model_init(model_name: str):
  147. from transformers import LayoutLMv3ForTokenClassification
  148. device_name = get_device()
  149. bf_16_support = False
  150. if device_name.startswith("cuda"):
  151. bf_16_support = torch.cuda.is_bf16_supported()
  152. elif device_name.startswith("mps"):
  153. bf_16_support = True
  154. device = torch.device(device_name)
  155. if model_name == 'layoutreader':
  156. # 检测modelscope的缓存目录是否存在
  157. layoutreader_model_dir = get_local_layoutreader_model_dir()
  158. if os.path.exists(layoutreader_model_dir):
  159. model = LayoutLMv3ForTokenClassification.from_pretrained(
  160. layoutreader_model_dir
  161. )
  162. else:
  163. logger.warning(
  164. 'local layoutreader model not exists, use online model from huggingface'
  165. )
  166. model = LayoutLMv3ForTokenClassification.from_pretrained(
  167. 'hantian/layoutreader'
  168. )
  169. if bf_16_support:
  170. model.to(device).eval().bfloat16()
  171. else:
  172. model.to(device).eval()
  173. else:
  174. logger.error('model name not allow')
  175. exit(1)
  176. return model
  177. class ModelSingleton:
  178. _instance = None
  179. _models = {}
  180. def __new__(cls, *args, **kwargs):
  181. if cls._instance is None:
  182. cls._instance = super().__new__(cls)
  183. return cls._instance
  184. def get_model(self, model_name: str):
  185. if model_name not in self._models:
  186. self._models[model_name] = model_init(model_name=model_name)
  187. return self._models[model_name]
  188. def do_predict(boxes: List[List[int]], model) -> List[int]:
  189. from mineru.model.reading_order.layout_reader import (
  190. boxes2inputs, parse_logits, prepare_inputs)
  191. with warnings.catch_warnings():
  192. warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")
  193. inputs = boxes2inputs(boxes)
  194. inputs = prepare_inputs(inputs, model)
  195. logits = model(**inputs).logits.cpu().squeeze(0)
  196. return parse_logits(logits, len(boxes))
  197. def cal_block_index(fix_blocks, sorted_bboxes):
  198. if sorted_bboxes is not None:
  199. # 使用layoutreader排序
  200. for block in fix_blocks:
  201. line_index_list = []
  202. if len(block['lines']) == 0:
  203. block['index'] = sorted_bboxes.index(block['bbox'])
  204. else:
  205. for line in block['lines']:
  206. line['index'] = sorted_bboxes.index(line['bbox'])
  207. line_index_list.append(line['index'])
  208. median_value = statistics.median(line_index_list)
  209. block['index'] = median_value
  210. # 删除图表body block中的虚拟line信息, 并用real_lines信息回填
  211. if block['type'] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.TITLE, BlockType.INTERLINE_EQUATION]:
  212. if 'real_lines' in block:
  213. block['virtual_lines'] = copy.deepcopy(block['lines'])
  214. block['lines'] = copy.deepcopy(block['real_lines'])
  215. del block['real_lines']
  216. else:
  217. # 使用xycut排序
  218. block_bboxes = []
  219. for block in fix_blocks:
  220. # 如果block['bbox']任意值小于0,将其置为0
  221. block['bbox'] = [max(0, x) for x in block['bbox']]
  222. block_bboxes.append(block['bbox'])
  223. # 删除图表body block中的虚拟line信息, 并用real_lines信息回填
  224. if block['type'] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.TITLE, BlockType.INTERLINE_EQUATION]:
  225. if 'real_lines' in block:
  226. block['virtual_lines'] = copy.deepcopy(block['lines'])
  227. block['lines'] = copy.deepcopy(block['real_lines'])
  228. del block['real_lines']
  229. import numpy as np
  230. from mineru.model.reading_order.xycut import recursive_xy_cut
  231. random_boxes = np.array(block_bboxes)
  232. np.random.shuffle(random_boxes)
  233. res = []
  234. recursive_xy_cut(np.asarray(random_boxes).astype(int), np.arange(len(block_bboxes)), res)
  235. assert len(res) == len(block_bboxes)
  236. sorted_boxes = random_boxes[np.array(res)].tolist()
  237. for i, block in enumerate(fix_blocks):
  238. block['index'] = sorted_boxes.index(block['bbox'])
  239. # 生成line index
  240. sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])
  241. line_inedx = 1
  242. for block in sorted_blocks:
  243. for line in block['lines']:
  244. line['index'] = line_inedx
  245. line_inedx += 1
  246. return fix_blocks
  247. def revert_group_blocks(blocks):
  248. image_groups = {}
  249. table_groups = {}
  250. new_blocks = []
  251. for block in blocks:
  252. if block['type'] in [BlockType.IMAGE_BODY, BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE]:
  253. group_id = block['group_id']
  254. if group_id not in image_groups:
  255. image_groups[group_id] = []
  256. image_groups[group_id].append(block)
  257. elif block['type'] in [BlockType.TABLE_BODY, BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE]:
  258. group_id = block['group_id']
  259. if group_id not in table_groups:
  260. table_groups[group_id] = []
  261. table_groups[group_id].append(block)
  262. else:
  263. new_blocks.append(block)
  264. for group_id, blocks in image_groups.items():
  265. new_blocks.append(process_block_list(blocks, BlockType.IMAGE_BODY, BlockType.IMAGE))
  266. for group_id, blocks in table_groups.items():
  267. new_blocks.append(process_block_list(blocks, BlockType.TABLE_BODY, BlockType.TABLE))
  268. return new_blocks
  269. def process_block_list(blocks, body_type, block_type):
  270. indices = [block['index'] for block in blocks]
  271. median_index = statistics.median(indices)
  272. body_bbox = next((block['bbox'] for block in blocks if block.get('type') == body_type), [])
  273. return {
  274. 'type': block_type,
  275. 'bbox': body_bbox,
  276. 'blocks': blocks,
  277. 'index': median_index,
  278. }