pipeline_analyze.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. import os
  2. import time
  3. from typing import List, Tuple
  4. import PIL.Image
  5. from loguru import logger
  6. from .model_init import MineruPipelineModel
  7. from mineru.utils.config_reader import get_device
  8. from ...utils.pdf_classify import classify
  9. from ...utils.pdf_image_tools import load_images_from_pdf
  10. from ...utils.model_utils import get_vram, clean_memory
  11. os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 让mps可以fallback
  12. os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
  13. class ModelSingleton:
  14. _instance = None
  15. _models = {}
  16. def __new__(cls, *args, **kwargs):
  17. if cls._instance is None:
  18. cls._instance = super().__new__(cls)
  19. return cls._instance
  20. def get_model(
  21. self,
  22. lang=None,
  23. formula_enable=None,
  24. table_enable=None,
  25. ):
  26. key = (lang, formula_enable, table_enable)
  27. if key not in self._models:
  28. self._models[key] = custom_model_init(
  29. lang=lang,
  30. formula_enable=formula_enable,
  31. table_enable=table_enable,
  32. )
  33. return self._models[key]
  34. def custom_model_init(
  35. lang=None,
  36. formula_enable=True,
  37. table_enable=True,
  38. ):
  39. model_init_start = time.time()
  40. # 从配置文件读取model-dir和device
  41. device = get_device()
  42. formula_config = {"enable": formula_enable}
  43. table_config = {"enable": table_enable}
  44. model_input = {
  45. 'device': device,
  46. 'table_config': table_config,
  47. 'formula_config': formula_config,
  48. 'lang': lang,
  49. }
  50. custom_model = MineruPipelineModel(**model_input)
  51. model_init_cost = time.time() - model_init_start
  52. logger.info(f'model init cost: {model_init_cost}')
  53. return custom_model
  54. def doc_analyze(
  55. pdf_bytes_list,
  56. lang_list,
  57. parse_method: str = 'auto',
  58. formula_enable=True,
  59. table_enable=True,
  60. ):
  61. """
  62. 适当调大MIN_BATCH_INFERENCE_SIZE可以提高性能,可能会增加显存使用量,
  63. 可通过环境变量MINERU_MIN_BATCH_INFERENCE_SIZE设置,默认值为128。
  64. """
  65. min_batch_inference_size = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 128))
  66. # 收集所有页面信息
  67. all_pages_info = [] # 存储(dataset_index, page_index, img, ocr, lang, width, height)
  68. all_image_lists = []
  69. all_pdf_docs = []
  70. ocr_enabled_list = []
  71. for pdf_idx, pdf_bytes in enumerate(pdf_bytes_list):
  72. # 确定OCR设置
  73. _ocr_enable = False
  74. if parse_method == 'auto':
  75. if classify(pdf_bytes) == 'ocr':
  76. _ocr_enable = True
  77. elif parse_method == 'ocr':
  78. _ocr_enable = True
  79. ocr_enabled_list.append(_ocr_enable)
  80. _lang = lang_list[pdf_idx]
  81. # 收集每个数据集中的页面
  82. images_list, pdf_doc = load_images_from_pdf(pdf_bytes)
  83. all_image_lists.append(images_list)
  84. all_pdf_docs.append(pdf_doc)
  85. for page_idx in range(len(images_list)):
  86. img_dict = images_list[page_idx]
  87. all_pages_info.append((
  88. pdf_idx, page_idx,
  89. img_dict['img_pil'], _ocr_enable, _lang,
  90. ))
  91. # 准备批处理
  92. images_with_extra_info = [(info[2], info[3], info[4]) for info in all_pages_info]
  93. batch_size = min_batch_inference_size
  94. batch_images = [
  95. images_with_extra_info[i:i + batch_size]
  96. for i in range(0, len(images_with_extra_info), batch_size)
  97. ]
  98. # 执行批处理
  99. results = []
  100. processed_images_count = 0
  101. for index, batch_image in enumerate(batch_images):
  102. processed_images_count += len(batch_image)
  103. logger.info(
  104. f'Batch {index + 1}/{len(batch_images)}: '
  105. f'{processed_images_count} pages/{len(images_with_extra_info)} pages'
  106. )
  107. batch_results = batch_image_analyze(batch_image, formula_enable, table_enable)
  108. results.extend(batch_results)
  109. # 构建返回结果
  110. infer_results = []
  111. for _ in range(len(pdf_bytes_list)):
  112. infer_results.append([])
  113. for i, page_info in enumerate(all_pages_info):
  114. pdf_idx, page_idx, pil_img, _, _ = page_info
  115. result = results[i]
  116. page_info_dict = {'page_no': page_idx, 'width': pil_img.width, 'height': pil_img.height}
  117. page_dict = {'layout_dets': result, 'page_info': page_info_dict}
  118. infer_results[pdf_idx].append(page_dict)
  119. return infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list
  120. def batch_image_analyze(
  121. images_with_extra_info: List[Tuple[PIL.Image.Image, bool, str]],
  122. formula_enable=True,
  123. table_enable=True):
  124. # os.environ['CUDA_VISIBLE_DEVICES'] = str(idx)
  125. from .batch_analyze import BatchAnalyze
  126. model_manager = ModelSingleton()
  127. batch_ratio = 1
  128. device = get_device()
  129. if str(device).startswith('npu'):
  130. try:
  131. import torch_npu
  132. if torch_npu.npu.is_available():
  133. torch_npu.npu.set_compile_mode(jit_compile=False)
  134. except Exception as e:
  135. raise RuntimeError(
  136. "NPU is selected as device, but torch_npu is not available. "
  137. "Please ensure that the torch_npu package is installed correctly."
  138. ) from e
  139. if str(device).startswith('npu') or str(device).startswith('cuda'):
  140. vram = get_vram(device)
  141. if vram is not None:
  142. gpu_memory = int(os.getenv('MINERU_VIRTUAL_VRAM_SIZE', round(vram)))
  143. if gpu_memory >= 16:
  144. batch_ratio = 16
  145. elif gpu_memory >= 12:
  146. batch_ratio = 8
  147. elif gpu_memory >= 8:
  148. batch_ratio = 4
  149. elif gpu_memory >= 6:
  150. batch_ratio = 2
  151. else:
  152. batch_ratio = 1
  153. logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}')
  154. else:
  155. # Default batch_ratio when VRAM can't be determined
  156. batch_ratio = 1
  157. logger.info(f'Could not determine GPU memory, using default batch_ratio: {batch_ratio}')
  158. batch_model = BatchAnalyze(model_manager, batch_ratio, formula_enable, table_enable)
  159. results = batch_model(images_with_extra_info)
  160. clean_memory(get_device())
  161. return results