
fix: include attention mask in model input for improved inference

myhloli 2 months ago
commit 7d3a76f80f
1 changed file with 4 additions and 2 deletions

+ 4 - 2
mineru/backend/vlm/hf_predictor.py

@@ -137,12 +137,14 @@ class HuggingfacePredictor(BasePredictor):
         image_tensor = image_tensor.to(device=self.model.device, dtype=self.model.dtype)
         image_sizes = [[*image_obj.size]]

-        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
-        input_ids = input_ids.to(device=self.model.device)
+        encoded_inputs = self.tokenizer(prompt, return_tensors="pt")
+        input_ids = encoded_inputs.input_ids.to(device=self.model.device)
+        attention_mask = encoded_inputs.attention_mask.to(device=self.model.device)

         with torch.inference_mode():
             output_ids = self.model.generate(
                 input_ids,
+                attention_mask=attention_mask,
                 images=image_tensor,
                 image_sizes=image_sizes,
                 use_cache=True,
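
The change keeps the full tokenizer output and forwards its attention_mask to model.generate() instead of passing input_ids alone. Without an explicit mask, transformers has to infer one from the pad token, which is ambiguous when pad_token_id equals eos_token_id and typically triggers transformers' warning about the attention mask not being set. A minimal sketch of the same pattern against the stock transformers API (the model name, prompt, and max_new_tokens value are illustrative, not taken from this repo; hf_predictor.py wraps its own multimodal model and additionally passes images and image_sizes):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical model choice, purely for illustration.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Keep the whole BatchEncoding so both tensors are available.
encoded = tokenizer("Describe this page.", return_tensors="pt")
input_ids = encoded.input_ids.to(model.device)
attention_mask = encoded.attention_mask.to(model.device)

with torch.inference_mode():
    output_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,  # explicit mask, nothing inferred from pad tokens
        use_cache=True,
        max_new_tokens=32,
    )

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))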