block_sort.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. import copy
  3. import os
  4. import statistics
  5. import warnings
  6. from typing import List
  7. import torch
  8. from loguru import logger
  9. from mineru.utils.config_reader import get_device
  10. from mineru.utils.enum_class import BlockType, ModelPath
  11. from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
  12. def sort_blocks_by_bbox(blocks, page_w, page_h, footnote_blocks):
  13. """获取所有line并计算正文line的高度"""
  14. line_height = get_line_height(blocks)
  15. """获取所有line并对line排序"""
  16. sorted_bboxes = sort_lines_by_model(blocks, page_w, page_h, line_height, footnote_blocks)
  17. """根据line的中位数算block的序列关系"""
  18. blocks = cal_block_index(blocks, sorted_bboxes)
  19. """将image和table的block还原回group形式参与后续流程"""
  20. blocks = revert_group_blocks(blocks)
  21. """重排block"""
  22. sorted_blocks = sorted(blocks, key=lambda b: b['index'])
  23. """block内重排(img和table的block内多个caption或footnote的排序)"""
  24. for block in sorted_blocks:
  25. if block['type'] in [BlockType.IMAGE, BlockType.TABLE]:
  26. block['blocks'] = sorted(block['blocks'], key=lambda b: b['index'])
  27. return sorted_blocks
  28. def get_line_height(blocks):
  29. page_line_height_list = []
  30. for block in blocks:
  31. if block['type'] in [
  32. BlockType.TEXT, BlockType.TITLE,
  33. BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE,
  34. BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE
  35. ]:
  36. for line in block['lines']:
  37. bbox = line['bbox']
  38. page_line_height_list.append(int(bbox[3] - bbox[1]))
  39. if len(page_line_height_list) > 0:
  40. return statistics.median(page_line_height_list)
  41. else:
  42. return 10
  43. def sort_lines_by_model(fix_blocks, page_w, page_h, line_height, footnote_blocks):
  44. page_line_list = []
  45. def add_lines_to_block(b):
  46. line_bboxes = insert_lines_into_block(b['bbox'], line_height, page_w, page_h)
  47. b['lines'] = []
  48. for line_bbox in line_bboxes:
  49. b['lines'].append({'bbox': line_bbox, 'spans': []})
  50. page_line_list.extend(line_bboxes)
  51. for block in fix_blocks:
  52. if block['type'] in [
  53. BlockType.TEXT, BlockType.TITLE,
  54. BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE,
  55. BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE
  56. ]:
  57. if len(block['lines']) == 0:
  58. add_lines_to_block(block)
  59. elif block['type'] in [BlockType.TITLE] and len(block['lines']) == 1 and (block['bbox'][3] - block['bbox'][1]) > line_height * 2:
  60. block['real_lines'] = copy.deepcopy(block['lines'])
  61. add_lines_to_block(block)
  62. else:
  63. for line in block['lines']:
  64. bbox = line['bbox']
  65. page_line_list.append(bbox)
  66. elif block['type'] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.INTERLINE_EQUATION]:
  67. block['real_lines'] = copy.deepcopy(block['lines'])
  68. add_lines_to_block(block)
  69. for block in footnote_blocks:
  70. footnote_block = {'bbox': block[:4]}
  71. add_lines_to_block(footnote_block)
  72. if len(page_line_list) > 200: # layoutreader最高支持512line
  73. return None
  74. # 使用layoutreader排序
  75. x_scale = 1000.0 / page_w
  76. y_scale = 1000.0 / page_h
  77. boxes = []
  78. # logger.info(f"Scale: {x_scale}, {y_scale}, Boxes len: {len(page_line_list)}")
  79. for left, top, right, bottom in page_line_list:
  80. if left < 0:
  81. logger.warning(
  82. f'left < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
  83. ) # noqa: E501
  84. left = 0
  85. if right > page_w:
  86. logger.warning(
  87. f'right > page_w, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
  88. ) # noqa: E501
  89. right = page_w
  90. if top < 0:
  91. logger.warning(
  92. f'top < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
  93. ) # noqa: E501
  94. top = 0
  95. if bottom > page_h:
  96. logger.warning(
  97. f'bottom > page_h, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
  98. ) # noqa: E501
  99. bottom = page_h
  100. left = round(left * x_scale)
  101. top = round(top * y_scale)
  102. right = round(right * x_scale)
  103. bottom = round(bottom * y_scale)
  104. assert (
  105. 1000 >= right >= left >= 0 and 1000 >= bottom >= top >= 0
  106. ), f'Invalid box. right: {right}, left: {left}, bottom: {bottom}, top: {top}' # noqa: E126, E121
  107. boxes.append([left, top, right, bottom])
  108. model_manager = ModelSingleton()
  109. model = model_manager.get_model('layoutreader')
  110. with torch.no_grad():
  111. orders = do_predict(boxes, model)
  112. sorted_bboxes = [page_line_list[i] for i in orders]
  113. return sorted_bboxes
  114. def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
  115. # block_bbox是一个元组(x0, y0, x1, y1),其中(x0, y0)是左下角坐标,(x1, y1)是右上角坐标
  116. x0, y0, x1, y1 = block_bbox
  117. block_height = y1 - y0
  118. block_weight = x1 - x0
  119. # 如果block高度小于n行正文,则直接返回block的bbox
  120. if line_height * 2 < block_height:
  121. if (
  122. block_height > page_h * 0.25 and page_w * 0.5 > block_weight > page_w * 0.25
  123. ): # 可能是双列结构,可以切细点
  124. lines = int(block_height / line_height)
  125. else:
  126. # 如果block的宽度超过0.4页面宽度,则将block分成3行(是一种复杂布局,图不能切的太细)
  127. if block_weight > page_w * 0.4:
  128. lines = 3
  129. elif block_weight > page_w * 0.25: # (可能是三列结构,也切细点)
  130. lines = int(block_height / line_height)
  131. else: # 判断长宽比
  132. if block_height / block_weight > 1.2: # 细长的不分
  133. return [[x0, y0, x1, y1]]
  134. else: # 不细长的还是分成两行
  135. lines = 2
  136. line_height = (y1 - y0) / lines
  137. # 确定从哪个y位置开始绘制线条
  138. current_y = y0
  139. # 用于存储线条的位置信息[(x0, y), ...]
  140. lines_positions = []
  141. for i in range(lines):
  142. lines_positions.append([x0, current_y, x1, current_y + line_height])
  143. current_y += line_height
  144. return lines_positions
  145. else:
  146. return [[x0, y0, x1, y1]]
  147. def model_init(model_name: str):
  148. from transformers import LayoutLMv3ForTokenClassification
  149. device_name = get_device()
  150. device = torch.device(device_name)
  151. bf_16_support = False
  152. if device_name.startswith("cuda"):
  153. if torch.cuda.get_device_properties(device).major >= 8:
  154. bf_16_support = True
  155. elif device_name.startswith("mps"):
  156. bf_16_support = True
  157. if model_name == 'layoutreader':
  158. # 检测modelscope的缓存目录是否存在
  159. layoutreader_model_dir = os.path.join(auto_download_and_get_model_root_path(ModelPath.layout_reader), ModelPath.layout_reader)
  160. if os.path.exists(layoutreader_model_dir):
  161. model = LayoutLMv3ForTokenClassification.from_pretrained(
  162. layoutreader_model_dir
  163. )
  164. else:
  165. logger.warning(
  166. 'local layoutreader model not exists, use online model from huggingface'
  167. )
  168. model = LayoutLMv3ForTokenClassification.from_pretrained(
  169. 'hantian/layoutreader'
  170. )
  171. if bf_16_support:
  172. model.to(device).eval().bfloat16()
  173. else:
  174. model.to(device).eval()
  175. else:
  176. logger.error('model name not allow')
  177. exit(1)
  178. return model
  179. class ModelSingleton:
  180. _instance = None
  181. _models = {}
  182. def __new__(cls, *args, **kwargs):
  183. if cls._instance is None:
  184. cls._instance = super().__new__(cls)
  185. return cls._instance
  186. def get_model(self, model_name: str):
  187. if model_name not in self._models:
  188. self._models[model_name] = model_init(model_name=model_name)
  189. return self._models[model_name]
  190. def do_predict(boxes: List[List[int]], model) -> List[int]:
  191. from mineru.model.reading_order.layout_reader import (
  192. boxes2inputs, parse_logits, prepare_inputs)
  193. with warnings.catch_warnings():
  194. warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")
  195. inputs = boxes2inputs(boxes)
  196. inputs = prepare_inputs(inputs, model)
  197. logits = model(**inputs).logits.cpu().squeeze(0)
  198. return parse_logits(logits, len(boxes))
  199. def cal_block_index(fix_blocks, sorted_bboxes):
  200. if sorted_bboxes is not None:
  201. # 使用layoutreader排序
  202. for block in fix_blocks:
  203. line_index_list = []
  204. if len(block['lines']) == 0:
  205. block['index'] = sorted_bboxes.index(block['bbox'])
  206. else:
  207. for line in block['lines']:
  208. line['index'] = sorted_bboxes.index(line['bbox'])
  209. line_index_list.append(line['index'])
  210. median_value = statistics.median(line_index_list)
  211. block['index'] = median_value
  212. # 删除图表body block中的虚拟line信息, 并用real_lines信息回填
  213. if block['type'] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.TITLE, BlockType.INTERLINE_EQUATION]:
  214. if 'real_lines' in block:
  215. block['virtual_lines'] = copy.deepcopy(block['lines'])
  216. block['lines'] = copy.deepcopy(block['real_lines'])
  217. del block['real_lines']
  218. else:
  219. # 使用xycut排序
  220. block_bboxes = []
  221. for block in fix_blocks:
  222. # 如果block['bbox']任意值小于0,将其置为0
  223. block['bbox'] = [max(0, x) for x in block['bbox']]
  224. block_bboxes.append(block['bbox'])
  225. # 删除图表body block中的虚拟line信息, 并用real_lines信息回填
  226. if block['type'] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.TITLE, BlockType.INTERLINE_EQUATION]:
  227. if 'real_lines' in block:
  228. block['virtual_lines'] = copy.deepcopy(block['lines'])
  229. block['lines'] = copy.deepcopy(block['real_lines'])
  230. del block['real_lines']
  231. import numpy as np
  232. from mineru.model.reading_order.xycut import recursive_xy_cut
  233. random_boxes = np.array(block_bboxes)
  234. np.random.shuffle(random_boxes)
  235. res = []
  236. recursive_xy_cut(np.asarray(random_boxes).astype(int), np.arange(len(block_bboxes)), res)
  237. assert len(res) == len(block_bboxes)
  238. sorted_boxes = random_boxes[np.array(res)].tolist()
  239. for i, block in enumerate(fix_blocks):
  240. block['index'] = sorted_boxes.index(block['bbox'])
  241. # 生成line index
  242. sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])
  243. line_inedx = 1
  244. for block in sorted_blocks:
  245. for line in block['lines']:
  246. line['index'] = line_inedx
  247. line_inedx += 1
  248. return fix_blocks
  249. def revert_group_blocks(blocks):
  250. image_groups = {}
  251. table_groups = {}
  252. new_blocks = []
  253. for block in blocks:
  254. if block['type'] in [BlockType.IMAGE_BODY, BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE]:
  255. group_id = block['group_id']
  256. if group_id not in image_groups:
  257. image_groups[group_id] = []
  258. image_groups[group_id].append(block)
  259. elif block['type'] in [BlockType.TABLE_BODY, BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE]:
  260. group_id = block['group_id']
  261. if group_id not in table_groups:
  262. table_groups[group_id] = []
  263. table_groups[group_id].append(block)
  264. else:
  265. new_blocks.append(block)
  266. for group_id, blocks in image_groups.items():
  267. new_blocks.append(process_block_list(blocks, BlockType.IMAGE_BODY, BlockType.IMAGE))
  268. for group_id, blocks in table_groups.items():
  269. new_blocks.append(process_block_list(blocks, BlockType.TABLE_BODY, BlockType.TABLE))
  270. return new_blocks
  271. def process_block_list(blocks, body_type, block_type):
  272. indices = [block['index'] for block in blocks]
  273. median_index = statistics.median(indices)
  274. body_bbox = next((block['bbox'] for block in blocks if block.get('type') == body_type), [])
  275. return {
  276. 'type': block_type,
  277. 'bbox': body_bbox,
  278. 'blocks': blocks,
  279. 'index': median_index,
  280. }