Can't get predictions after converting the model to torch.float16

#6 by taher30 - opened

I used the code below to convert the model to float16 and ran a loop to cast the inputs to float16 as well, but I still get an error when making predictions.

Code:

import time

import requests
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

device = 'cuda'
image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
image_2 = Image.open(requests.get(image_url, stream=True).raw)

model_id = "IDEA-Research/grounding-dino-base"
processor = AutoProcessor.from_pretrained(model_id)
text = 'a cat. a remote control.'
labels = [text, text]
input = processor(images=[image, image_2], text=labels, return_tensors="pt").to(device)

model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id, torch_dtype=torch.float16).to(device)

# Cast every input tensor except input_ids to float16
for key in input.keys():
    input[key] = input[key].to(device)
    if key != 'input_ids':
        input[key] = input[key].to(torch.float16)
    print(input[key].dtype, input[key].device)

with torch.no_grad():
    start = time.time()
    outputs = model(**input)
    end = time.time()

Error:

"RuntimeError Traceback (most recent call last)
Cell In[111], line 13
11 with torch.no_grad():
12 start = time.time()
---> 13 outputs = model(**input)
14 end = time.time()
15 print("Time Taken:", end - start)

File c:\Users\patra\miniconda3\envs\new_test\lib\site-packages\torch\nn\modules\module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)

File c:\Users\patra\miniconda3\envs\new_test\lib\site-packages\torch\nn\modules\module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None

File c:\Users\patra\miniconda3\envs\new_test\lib\site-packages\transformers\models\grounding_dino\modeling_grounding_dino.py:3016, in GroundingDinoForObjectDetection.forward(self, pixel_values, input_ids, token_type_ids, attention_mask, pixel_mask, encoder_outputs, output_attentions, output_hidden_states, return_dict, labels)
3013 attention_mask = torch.ones_like(input_ids)
3015 # First, sent images through Grounding DINO base model to obtain encoder + decoder outputs
-> 3016 outputs = self.model(
3017 pixel_values=pixel_values,
3018 input_ids=input_ids,
3019 token_type_ids=token_type_ids,
3020 attention_mask=attention_mask,
3021 pixel_mask=pixel_mask,
3022 encoder_outputs=encoder_outputs,
3023 output_attentions=output_attentions,
3024 output_hidden_states=output_hidden_states,
3025 return_dict=return_dict,
3026 )
3028 idx = 5 + (1 if output_attentions else 0) + (1 if output_hidden_states else 0)
3029 enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[idx]

File c:\Users\patra\miniconda3\envs\new_test\lib\site-packages\torch\nn\modules\module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)

File c:\Users\patra\miniconda3\envs\new_test\lib\site-packages\torch\nn\modules\module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None

File c:\Users\patra\miniconda3\envs\new_test\lib\site-packages\transformers\models\grounding_dino\modeling_grounding_dino.py:2283, in GroundingDinoModel.forward(self, pixel_values, input_ids, token_type_ids, attention_mask, pixel_mask, encoder_outputs, output_attentions, output_hidden_states, return_dict)
2280 text_token_mask = text_token_mask[:, :max_text_len]
2282 # Extract text features from text backbone
-> 2283 text_outputs = self.text_backbone(
2284 input_ids, text_self_attention_masks, token_type_ids, position_ids, return_dict=return_dict
2285 )
2286 text_features = text_outputs.last_hidden_state if return_dict else text_outputs[0]
2287 text_features = self.text_projection(text_features)

File c:\Users\patra\miniconda3\envs\new_test\lib\site-packages\torch\nn\modules\module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)

File c:\Users\patra\miniconda3\envs\new_test\lib\site-packages\torch\nn\modules\module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None

File c:\Users\patra\miniconda3\envs\new_test\lib\site-packages\transformers\models\bert\modeling_bert.py:1073, in BertModel.forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
1070 else:
1071 token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
-> 1073 embedding_output = self.embeddings(
1074 input_ids=input_ids,
1075 position_ids=position_ids,
1076 token_type_ids=token_type_ids,
1077 inputs_embeds=inputs_embeds,
1078 past_key_values_length=past_key_values_length,
1079 )
1081 if attention_mask is None:
1082 attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)

File c:\Users\patra\miniconda3\envs\new_test\lib\site-packages\torch\nn\modules\module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)

File c:\Users\patra\miniconda3\envs\new_test\lib\site-packages\torch\nn\modules\module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None

File c:\Users\patra\miniconda3\envs\new_test\lib\site-packages\transformers\models\bert\modeling_bert.py:211, in BertEmbeddings.forward(self, input_ids, token_type_ids, position_ids, inputs_embeds, past_key_values_length)
209 if inputs_embeds is None:
210 inputs_embeds = self.word_embeddings(input_ids)
--> 211 token_type_embeddings = self.token_type_embeddings(token_type_ids)
213 embeddings = inputs_embeds + token_type_embeddings
214 if self.position_embedding_type == "absolute":

File c:\Users\patra\miniconda3\envs\new_test\lib\site-packages\torch\nn\modules\module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)

File c:\Users\patra\miniconda3\envs\new_test\lib\site-packages\torch\nn\modules\module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None

File c:\Users\patra\miniconda3\envs\new_test\lib\site-packages\torch\nn\modules\sparse.py:163, in Embedding.forward(self, input)
162 def forward(self, input: Tensor) -> Tensor:
--> 163 return F.embedding(
164 input, self.weight, self.padding_idx, self.max_norm,
165 self.norm_type, self.scale_grad_by_freq, self.sparse)

File c:\Users\patra\miniconda3\envs\new_test\lib\site-packages\torch\nn\functional.py:2264, in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2258 # Note [embedding_renorm set_grad_enabled]
2259 # XXX: equivalent to
2260 # with torch.no_grad():
2261 # torch.embedding_renorm_
2262 # remove once script supports set_grad_enabled
2263 no_grad_embedding_renorm(weight, input, max_norm, norm_type)
-> 2264 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.HalfTensor instead (while checking arguments for embedding)

token_type_ids shouldn't be float16: embedding lookups need integer indices, and the cast loop above converts every tensor except input_ids, so token_type_ids ends up as a half tensor. That said, I did find some other things that might be worth taking a look at, thanks!
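
If you do want to cast the inputs by hand, the safe rule is to cast only tensors that are already floating point and leave the integer tensors (input_ids, token_type_ids, attention_mask) untouched. A minimal sketch, assuming the processor output behaves like a dict of tensors (the to_half name is just for illustration):

import torch

def to_half(batch, device='cuda'):
    # Embedding lookups require Long/Int indices, so only cast
    # tensors that are already floating point (e.g. pixel_values).
    for key, tensor in batch.items():
        tensor = tensor.to(device)
        if tensor.is_floating_point():
            tensor = tensor.to(torch.float16)
        batch[key] = tensor
    return batch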

I would recommend changing your code to:

import requests
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

device = 'cuda'
image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
image_2 = Image.open(requests.get(image_url, stream=True).raw)

model_id = "IDEA-Research/grounding-dino-base"
processor = AutoProcessor.from_pretrained(model_id)
text = 'a cat. a remote control.'
labels = [text, text]
inputs = processor(images=[image, image_2], text=labels, return_tensors="pt").to(device)

model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id, torch_dtype=torch.float16).to(device)

# CUDA events time the GPU work itself; time.time() would only measure
# the asynchronous kernel launches on the host side.
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

# autocast keeps integer tensors as-is and runs the float ops in float16
with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):
    start.record()
    outputs = model(**inputs)
    end.record()

torch.cuda.synchronize()
elapsed = start.elapsed_time(end)  # milliseconds
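
And since the thread is about getting predictions: once the forward pass runs, the processor's post-processing helper decodes the raw outputs into boxes, scores, and labels. A minimal sketch, assuming a recent transformers version; the thresholds below are illustrative, not tuned:

# target_sizes is the original (height, width) of each image;
# PIL's image.size is (width, height), hence the [::-1]
results = processor.post_process_grounded_object_detection(
    outputs,
    inputs.input_ids,
    box_threshold=0.4,
    text_threshold=0.3,
    target_sizes=[image.size[::-1], image_2.size[::-1]],
)
for result in results:
    print(result["scores"], result["labels"], result["boxes"])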
