Hello, thank you for your contribution! I am trying to fine-tune all-MiniLM-L6-v2 (https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) on my own data, but after the first batch I get a ValueError and a loss of inf.
ValueError: Expected value argument (Tensor of shape (4, 192, 1, 1)) to be within the support (Real()) of the distribution Normal(loc: torch.Size([4, 192, 1, 1]), scale: torch.Size([4, 192, 1, 1])), but found invalid values:
tensor([[[[nan]],
[[nan]],
.....
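From what I understand, the message itself just means that NaNs reach a torch.distributions.Normal.log_prob call inside the Glow prior; a tiny standalone snippet like this (my own illustration, not taken from the repo) reproduces the same ValueError:

import torch
from torch.distributions import Normal

# Same shapes as in my error message; log_prob() validates its input against the
# Real() support (assuming argument validation is on, which is the default),
# and NaN values fail that check.
loc = torch.zeros(4, 192, 1, 1)
scale = torch.ones(4, 192, 1, 1)
z = torch.full((4, 192, 1, 1), float('nan'))
Normal(loc, scale).log_prob(z)  # raises the "within the support (Real())" ValueError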
Here is my very simple script (I just replaced the data and put the training in a loop):
import pandas as pd
import numpy as np
from tflow_utils import TransformerGlow, AdamWeightDecayOptimizer
from transformers import AutoTokenizer, AutoModel
model_name_or_path = '/tmp/all-MiniLM-L6-v2'
bertflow = TransformerGlow(model_name_or_path, pooling='mean') # pooling could be 'mean', 'max', 'cls' or 'first-last-avg' (mean pooling over the first and the last layers)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in bertflow.glow.named_parameters()
                   if not any(nd in n for nd in no_decay)],  # Note: only the parameters within bertflow.glow will be updated; the Transformer will be frozen during training.
        "weight_decay": 0.01,
    },
    {
        "params": [p for n, p in bertflow.glow.named_parameters()
                   if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamWeightDecayOptimizer(
params=optimizer_grouped_parameters,
lr=1e-5,
eps=1e-6,
)
# Important: Remember to shuffle your training data!!! This makes a huge difference!!!
np.random.seed(0)
df = pd.read_csv("data/classification/data_small.csv")
data = df.text.to_list().copy()
np.random.shuffle(data)
bertflow.train()
batch_size = 4
nb_batch = int(np.ceil(len(data) / batch_size))
print(nb_batch)
for batch_id in range(nb_batch):
    batch = data[batch_id*batch_size:(batch_id+1)*batch_size]
    model_inputs = tokenizer(
        batch,
        add_special_tokens=True,
        return_tensors='pt',
        max_length=256,
        padding='longest',
        truncation=True
    )
    z, loss = bertflow(model_inputs['input_ids'], model_inputs['attention_mask'], return_loss=True)  # Here z is the sentence embedding
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(batch_id, loss)
Do you have any idea where this could come from? I have tried different learning rates, but that does not solve the problem.
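In case it helps, this is the kind of check I was planning to add inside the loop to see whether the NaNs show up in the embeddings z or only in the loss, together with gradient clipping before optimizer.step() (just a sketch on my side; the max_norm value is an arbitrary guess):

    # Same names as in the script above.
    z, loss = bertflow(model_inputs['input_ids'], model_inputs['attention_mask'], return_loss=True)
    if torch.isnan(z).any() or not torch.isfinite(loss):
        print(f"batch {batch_id}: NaN in z = {torch.isnan(z).any().item()}, loss = {loss.item()}")
    optimizer.zero_grad()
    loss.backward()
    # Clip only the flow parameters, since the Transformer is frozen.
    torch.nn.utils.clip_grad_norm_(bertflow.glow.parameters(), max_norm=1.0)
    optimizer.step()

Would something like that make sense here, or is the problem more likely in the data / pooling setup?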