Details
Hello! I want to add the adapter approach to my pre-trained BERT text-classification model, but I could not find a good explanation in the documentation of how to do that.
My model class is the following:
class BertClassifier(nn.Module):
    """BERT model for classification tasks."""

    def __init__(self, freeze_bert=True):
        """
        @param bert: a BertModel object
        @param classifier: a torch.nn.Module classifier
        @param freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(BertClassifier, self).__init__()
        # Instantiate the BERT model
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        self.bert = BertAdapterModel.from_pretrained(PRETRAINED_MODEL)
        self.D_in = 1024
        self.H = 512
        self.D_out = 2
        # Add a new adapter and activate it
        self.bert.add_adapter("thermo_cl", set_active=True)
        self.bert.train_adapter(["thermo_cl"])
        # Instantiate the classifier head as a small feed-forward network
        self.classifier = nn.Sequential(
            nn.Linear(self.D_in, 512),
            nn.Tanh(),
            nn.Linear(512, self.D_out),
            nn.Tanh()
        )
        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
    def forward(self, input_ids, attention_mask):
        """Feed input to BERT and the classifier to compute logits.
        @param input_ids (torch.Tensor): an input tensor with shape (batch_size, max_length)
        @param attention_mask (torch.Tensor): a tensor that holds attention mask
            information with shape (batch_size, max_length)
        @return logits (torch.Tensor): an output tensor with shape (batch_size, num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        # Extract the last hidden state of the `[CLS]` token for the classification task
        last_hidden_state_cls = outputs[0][:, 0, :]
        # Feed the [CLS] representation to the classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)
        return logits
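For context, this is roughly how I instantiate the model and run a quick shape check (the checkpoint name and the dummy sentence below are just placeholders, not my real setup):

import torch
from transformers import BertTokenizer

# Placeholder checkpoint; in my real code PRETRAINED_MODEL points to the model I actually use
PRETRAINED_MODEL = "bert-large-uncased"

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)
model = BertClassifier(freeze_bert=False)

# Dummy batch just to verify the output shape
encoded = tokenizer(["an example sentence"], padding="max_length",
                    truncation=True, max_length=32, return_tensors="pt")
with torch.no_grad():
    logits = model(encoded["input_ids"], encoded["attention_mask"])
print(logits.shape)  # expected: torch.Size([1, 2]) -> (batch_size, num_labels)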
The training loop is the following:
def initialize_model(epochs):
    """Initialize the BertClassifier, the optimizer and the learning rate scheduler."""
    # Instantiate the Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False)  # False = do not freeze the BERT weights
    # Tell PyTorch to run the model on the GPU
    bert_classifier = bert_classifier.to(device)
    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=lr,    # Default learning rate
                      eps=1e-8  # Default epsilon value
                      )
    # Total number of training steps
    total_steps = len(train_dataloader) * epochs
    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,  # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler
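After initialize_model, I also check which parameters are actually left trainable after add_adapter / train_adapter. This is just a diagnostic sketch (n_epochs is defined elsewhere in my script):

# Diagnostic sketch: count trainable vs. total parameters
def count_trainable_params(model):
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    return trainable, total

bert_classifier, optimizer, scheduler = initialize_model(epochs=n_epochs)
trainable, total = count_trainable_params(bert_classifier)
print(f"trainable parameters: {trainable:,} / {total:,}")
# My expectation is that only the adapter layers and the classifier head
# show up as trainable here, but I'm not sure this is what actually happens.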
def train(model, train_dataloader, val_dataloader, valid_loss_min_input, checkpoint_path, best_model_path, start_epochs, epochs, evaluation=True):
    """Train the BertClassifier model."""
    # Start training loop
    logging.info("--Start training...\n")
    # Initialize tracker for minimum validation loss
    valid_loss_min = valid_loss_min_input
    for epoch_i in range(start_epochs, epochs):
        ..............................
        if evaluation == True:
            # After the completion of each training epoch, measure the model's
            # performance on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)
            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            logging.info(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^10.6f} | {time_elapsed:^9.2f}")
            logging.info("-" * 70)
            logging.info("\n")
            # Create checkpoint variable and add important data
            checkpoint = {
                'epoch': epoch_i + 1,
                'valid_loss_min': val_loss,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            # Save checkpoint
            save_ckp(checkpoint, False, checkpoint_path, best_model_path)
            ## TODO: save the model if validation loss has decreased
            if val_loss <= valid_loss_min:
                print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(valid_loss_min, val_loss))
                # Save checkpoint as best model
                save_ckp(checkpoint, True, checkpoint_path, best_model_path)
                valid_loss_min = val_loss
    # At the end of training, save the trained adapter
    model.save_adapter("./final_adapter", "thermo_cl")
    logging.info("-----------------Training complete--------------------------")

bert_classifier, optimizer, scheduler = initialize_model(epochs=n_epochs)
train(model=bert_classifier, ...)
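For completeness, save_ckp is just a small torch.save-based helper, roughly like this sketch (the actual paths are set elsewhere in my script):

import shutil
import torch

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    # Save the latest checkpoint dictionary (epoch, state_dict, optimizer, ...)
    torch.save(state, checkpoint_path)
    # When the validation loss improves, also copy it to the best-model path
    if is_best:
        shutil.copyfile(checkpoint_path, best_model_path)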
As you can see, I have my own custom classification head, so I don't want to use the .add_classification_head() method. Is it correct to train and activate the adapter in this way?
I would like to know whether I'm using the adapter properly, and also how to save the checkpoint and my model weights, because at the end of training (where I'm supposed to save the adapter) I get this error:
AttributeError: 'BertClassifier' object has no attribute 'save_adapter'
Thanks for the help!