image_processor.py 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212
  1. import ast
  2. import asyncio
  3. import re
  4. from typing import List, Optional, Union
  5. import numpy as np
  6. from sglang.version import __version__ as sglang_version
  7. if sglang_version >= "0.4.9":
  8. # sglang >= 0.4.9
  9. from sglang.srt.multimodal.processors.base_processor import (
  10. BaseMultimodalProcessor as BaseProcessor,
  11. )
  12. from sglang.srt.multimodal.mm_utils import divide_to_patches, expand2square, select_best_resolution
  13. else:
  14. # 0.4.7 <= sglang < 0.4.9
  15. from sglang.srt.managers.multimodal_processors.base_processor import (
  16. BaseMultimodalProcessor as BaseProcessor,
  17. )
  18. from sglang.srt.mm_utils import divide_to_patches, expand2square, select_best_resolution
  19. get_global_processor = None
  20. from sglang.srt.utils import load_image, logger
  21. from sglang.utils import get_exception_traceback
  22. from .model import Mineru2QwenForCausalLM
  23. # image_best_res is only resized (not padded).
  24. def process_anyres_image(image, processor, grid_pinpoints):
  25. if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
  26. patch_size = processor.crop_size["height"]
  27. assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
  28. matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
  29. range_start = tuple(map(int, matches[0]))
  30. range_end = tuple(map(int, matches[-1]))
  31. grid_pinpoints = [
  32. (i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)
  33. ]
  34. grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
  35. if type(grid_pinpoints) is list:
  36. possible_resolutions = grid_pinpoints
  37. else:
  38. possible_resolutions = ast.literal_eval(grid_pinpoints)
  39. best_resolution = select_best_resolution(image.size, possible_resolutions)
  40. image_best_res = image.resize(best_resolution) # <<<<<<< Here changed
  41. patches = divide_to_patches(image_best_res, processor.crop_size["height"])
  42. image_original_resize = image.resize((processor.crop_size["height"], processor.crop_size["height"]))
  43. image_patches = [image_original_resize] + patches
  44. image_patches = [processor.preprocess(image_patch)["pixel_values"][0] for image_patch in image_patches]
  45. return np.stack(image_patches, axis=0)
  46. class Mineru2ImageProcessor(BaseProcessor):
  47. def __init__(self, hf_config, server_args, _processor):
  48. super().__init__(hf_config, server_args, _processor)
  49. @staticmethod
  50. def _process_single_image_task(
  51. image_data: Union[str, bytes],
  52. image_aspect_ratio: Optional[str] = None,
  53. image_grid_pinpoints: Optional[str] = None,
  54. image_processor=None,
  55. ):
  56. if image_processor is None:
  57. assert get_global_processor is not None
  58. image_processor = get_global_processor().image_processor
  59. try:
  60. image, image_size = load_image(image_data)
  61. if image_size is not None:
  62. # It is a video with multiple images
  63. image_hash = hash(image_data)
  64. pixel_values = image_processor(image)["pixel_values"]
  65. pixel_values = np.stack(pixel_values, axis=0)
  66. return pixel_values, image_hash, image_size
  67. else:
  68. # It is an image
  69. image_hash = hash(image_data)
  70. if image_aspect_ratio == "pad":
  71. image = expand2square(
  72. image,
  73. tuple(int(x * 255) for x in image_processor.image_mean),
  74. )
  75. pixel_values = image_processor(image.convert("RGB"))["pixel_values"][0]
  76. elif image_aspect_ratio == "anyres" or (image_aspect_ratio is not None and "anyres_max" in image_aspect_ratio):
  77. pixel_values = process_anyres_image(image, image_processor, image_grid_pinpoints)
  78. else:
  79. pixel_values = image_processor(image)["pixel_values"][0]
  80. return pixel_values, image_hash, image.size
  81. except Exception:
  82. logger.error("Exception in TokenizerManager:\n" + get_exception_traceback())
  83. async def _process_single_image(self, image_data: Union[bytes, str], aspect_ratio: str, grid_pinpoints: str):
  84. if hasattr(self, "cpu_executor"):
  85. executor = self.cpu_executor
  86. else:
  87. executor = self.executor
  88. if get_global_processor is not None:
  89. image_processor = None # save ipc cost
  90. else:
  91. image_processor = self._processor.image_processor
  92. if executor is not None:
  93. loop = asyncio.get_running_loop()
  94. return await loop.run_in_executor(
  95. executor,
  96. Mineru2ImageProcessor._process_single_image_task,
  97. image_data,
  98. aspect_ratio,
  99. grid_pinpoints,
  100. image_processor,
  101. )
  102. else:
  103. return self._process_single_image_task(
  104. image_data,
  105. aspect_ratio,
  106. grid_pinpoints,
  107. image_processor,
  108. )
  109. async def process_mm_data_async(
  110. self,
  111. image_data: List[Union[str, bytes]],
  112. input_text,
  113. request_obj,
  114. *args,
  115. **kwargs,
  116. ):
  117. from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
  118. if not image_data:
  119. return None
  120. modalities = request_obj.modalities or ["image"]
  121. aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", None)
  122. grid_pinpoints = (
  123. self.hf_config.image_grid_pinpoints
  124. if hasattr(self.hf_config, "image_grid_pinpoints")
  125. and "anyres" in aspect_ratio
  126. else None
  127. )
  128. if isinstance(image_data, str):
  129. image_data = [image_data]
  130. if isinstance(image_data, list) and len(image_data) > 0:
  131. if "multi-images" in modalities or "video" in modalities:
  132. # Multiple images
  133. aspect_ratio = "pad" # LLaVA OneVision Handling: more than one image --> interleaved image mode or video mode. We do not use anyres
  134. pixel_values, data_hashes, image_sizes = [], [], []
  135. res = []
  136. for img_data in image_data:
  137. res.append(
  138. self._process_single_image(
  139. img_data, aspect_ratio, grid_pinpoints
  140. )
  141. )
  142. res = await asyncio.gather(*res)
  143. for pixel_v, image_h, image_s in res:
  144. pixel_values.append(pixel_v)
  145. data_hashes.append(image_h)
  146. image_sizes.append(image_s)
  147. if isinstance(pixel_values[0], np.ndarray):
  148. pixel_values = np.stack(pixel_values, axis=0)
  149. else:
  150. # A single image
  151. pixel_values, image_hash, image_size = await self._process_single_image(
  152. image_data[0], aspect_ratio, grid_pinpoints
  153. )
  154. image_sizes = [image_size]
  155. else:
  156. raise ValueError(f"Invalid image data: {image_data}")
  157. modality = Modality.IMAGE
  158. if isinstance(request_obj.modalities, list):
  159. if request_obj.modalities[0] == "multi-images":
  160. modality = Modality.MULTI_IMAGES
  161. elif request_obj.modalities[0] == "video":
  162. modality = Modality.VIDEO
  163. if sglang_version >= "0.4.9.post3":
  164. # sglang >= 0.4.9.post3
  165. return {
  166. "mm_items": [
  167. MultimodalDataItem(
  168. feature=pixel_values,
  169. model_specific_data={
  170. "image_sizes": image_sizes,
  171. },
  172. modality=modality,
  173. )
  174. ],
  175. }
  176. else:
  177. # 0.4.7 <= sglang <= 0.4.9.post2
  178. return {
  179. "mm_items": [
  180. MultimodalDataItem(
  181. pixel_values=pixel_values,
  182. image_sizes=image_sizes,
  183. modality=modality,
  184. )
  185. ],
  186. }
  187. ImageProcessorMapping = {Mineru2QwenForCausalLM: Mineru2ImageProcessor}