I am trying to use a ResNet50 model that I trained with this repo, but I can't encode text:
with torch.no_grad():
    tmp = clip.tokenize("test")
    tmp = tmp.to(device)
    print(tmp)
    print(tmp.shape)
    text_encoded = model.model.encode_text(tmp)
tensor([[49406, 1628, 49407, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0]], device='cuda:0')
torch.Size([1, 77])
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-18-68003eb3bebb> in <module>()
9 print(tmp)
10 print(tmp.shape)
---> 11 text_encoded = model.model.encode_text(tmp)
12
2 frames
/content/train-CLIP/models/model.py in encode_text(self, text)
343 x = x + self.positional_embedding.type(self.dtype)
344 x = x.permute(1, 0, 2) # NLD -> LND
--> 345 x = self.transformer(x)
346 x = x.permute(1, 0, 2) # LND -> NLD
347 x = self.ln_final(x).type(self.dtype)
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/transformers/models/bert/modeling_bert.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
937 elif input_ids is not None:
938 input_shape = input_ids.size()
--> 939 batch_size, seq_length = input_shape
940 elif inputs_embeds is not None:
941 input_shape = inputs_embeds.size()[:-1]
ValueError: too many values to unpack (expected 2)
Printing the shape of x right before self.transformer(x) gives torch.Size([77, 1, 512]).
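The traceback ends inside transformers' modeling_bert.py, so the repo's self.transformer seems to be a Hugging Face BERT module rather than CLIP's own transformer stack. Assuming that is what is happening (I have not confirmed it in the repo's code), the same ValueError can be reproduced by handing a BertModel the permuted [77, 1, 512] tensor instead of 2-D token ids, since its forward unpacks input_ids.size() into exactly (batch_size, seq_length):

import torch
from transformers import BertConfig, BertModel

# Randomly initialised BERT, used only to demonstrate the expected input shape;
# this is a stand-in, not the actual module from the repo.
bert = BertModel(BertConfig())
bert.eval()

token_ids = torch.zeros(1, 77, dtype=torch.long)  # [batch, seq_len]: accepted
with torch.no_grad():
    out = bert(token_ids)                         # runs fine

embeddings = torch.randn(77, 1, 512)              # [seq_len, batch, width], like x above
bert(embeddings)                                  # ValueError: too many values to unpack (expected 2)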
The input shape torch.Size([1, 77]) matches the original CLIP code, and a model loaded with the original clip package seems to work without major problems:
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)

image = preprocess(Image.open("/test.png")).unsqueeze(0).to(device)
text = clip.tokenize(["test"]).to(device)
print(text)
print(text.shape)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()
tensor([[49406, 1628, 49407, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0]], device='cuda:0')
torch.Size([1, 77])
I'm not sure what I am doing wrong, since encoding images does seem to work fine with this repo:
with torch.no_grad():
    photos_features = model.model.encode_image(image)
    photos_features /= photos_features.norm(dim=-1, keepdim=True)
    print(photos_features.shape)
torch.Size([1, 768])
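If it helps narrow this down, a hypothetical diagnostic I can run (the clip.load model is bound to a separate name, clip_model, here to avoid shadowing) is to print which class each model actually uses as its text transformer; based on the traceback I would expect the train-CLIP checkpoint to report a BERT module while the stock model reports clip.model.Transformer:

# Hypothetical check, not code from the repo: compare the text-transformer classes.
import clip
clip_model, _ = clip.load("ViT-B/32", device=device, jit=False)

print(type(model.model.transformer))  # train-CLIP checkpoint; the traceback suggests a BERT module
print(type(clip_model.transformer))   # stock OpenAI CLIP: clip.model.Transformer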