@sooftware Could you kindly explain why the output lengths and the target lengths are so different? :/ (Also, the outputs contain negative floats.) An example is shown below.
The outputs have shape [32, 490, 16121] (where 16121 is the size of my vocabulary). What does the dimension of size 490 represent?
Also, the outputs are probabilities, right?
(outputs)
tensor([[[-9.7001, -9.6490, -9.6463, ..., -9.6936, -9.6430, -9.7431],
[-9.6997, -9.6487, -9.6470, ..., -9.6903, -9.6450, -9.7416],
[-9.6999, -9.6477, -9.6479, ..., -9.6898, -9.6453, -9.7417],
...,
[-9.7006, -9.6449, -9.6513, ..., -9.6889, -9.6477, -9.7405],
[-9.7003, -9.6448, -9.6512, ..., -9.6893, -9.6477, -9.7410],
[-9.7007, -9.6453, -9.6513, ..., -9.6892, -9.6466, -9.7403]],
[[-9.6844, -9.6316, -9.6387, ..., -9.6880, -9.6269, -9.7657],
[-9.6834, -9.6299, -9.6404, ..., -9.6872, -9.6283, -9.7642],
[-9.6834, -9.6334, -9.6387, ..., -9.6864, -9.6290, -9.7616],
...,
[-9.6840, -9.6299, -9.6431, ..., -9.6830, -9.6304, -9.7608],
[-9.6838, -9.6297, -9.6428, ..., -9.6834, -9.6303, -9.7609],
[-9.6842, -9.6300, -9.6428, ..., -9.6837, -9.6292, -9.7599]],
[[-9.6966, -9.6386, -9.6458, ..., -9.6896, -9.6375, -9.7521],
[-9.6974, -9.6374, -9.6462, ..., -9.6890, -9.6369, -9.7516],
[-9.6974, -9.6405, -9.6456, ..., -9.6876, -9.6378, -9.7491],
...,
[-9.6978, -9.6336, -9.6493, ..., -9.6851, -9.6419, -9.7490],
[-9.6971, -9.6334, -9.6487, ..., -9.6863, -9.6411, -9.7501],
[-9.6972, -9.6338, -9.6489, ..., -9.6867, -9.6396, -9.7497]],
...,
[[-9.7005, -9.6249, -9.6588, ..., -9.6762, -9.6557, -9.7555],
[-9.7028, -9.6266, -9.6597, ..., -9.6765, -9.6574, -9.7542],
[-9.7016, -9.6240, -9.6605, ..., -9.6761, -9.6576, -9.7553],
...,
[-9.7036, -9.6237, -9.6624, ..., -9.6728, -9.6590, -9.7524],
[-9.7034, -9.6235, -9.6620, ..., -9.6735, -9.6589, -9.7530],
[-9.7038, -9.6240, -9.6622, ..., -9.6738, -9.6582, -9.7524]],
[[-9.7058, -9.6305, -9.6566, ..., -9.6739, -9.6557, -9.7466],
[-9.7061, -9.6273, -9.6569, ..., -9.6774, -9.6564, -9.7499],
[-9.7046, -9.6280, -9.6576, ..., -9.6772, -9.6575, -9.7498],
...,
[-9.7060, -9.6263, -9.6609, ..., -9.6714, -9.6561, -9.7461],
[-9.7055, -9.6262, -9.6605, ..., -9.6723, -9.6558, -9.7469],
[-9.7058, -9.6270, -9.6606, ..., -9.6725, -9.6552, -9.7460]],
[[-9.7101, -9.6312, -9.6570, ..., -9.6736, -9.6551, -9.7420],
[-9.7102, -9.6307, -9.6579, ..., -9.6733, -9.6576, -9.7418],
[-9.7078, -9.6281, -9.6598, ..., -9.6704, -9.6596, -9.7418],
...,
[-9.7084, -9.6288, -9.6605, ..., -9.6706, -9.6588, -9.7399],
[-9.7081, -9.6286, -9.6600, ..., -9.6714, -9.6584, -9.7406],
[-9.7085, -9.6291, -9.6601, ..., -9.6717, -9.6577, -9.7398]]],
device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
(output_lengths)
tensor([312, 260, 315, 320, 317, 275, 308, 291, 272, 300, 262, 227, 303, 252,
298, 256, 303, 251, 284, 259, 263, 286, 209, 262, 166, 194, 149, 212,
121, 114, 110, 57], device='cuda:0', dtype=torch.int32)
(target_lengths)
tensor([57, 55, 54, 50, 49, 49, 49, 48, 48, 47, 43, 42, 41, 40, 40, 39, 37, 37,
36, 36, 36, 35, 34, 33, 29, 27, 26, 24, 20, 19, 17, 9])
I am using the following code for training and evaluation:
import torch
import time
import sys
from google.colab import output
import torch.nn as nn
from conformer import Conformer
import torchmetrics
import random
# Pick the compute device once at import time: CUDA when PyTorch reports it
# as available, otherwise the CPU. The functions below read `device`.
cuda = torch.cuda.is_available()
device = torch.device('cuda') if cuda else torch.device('cpu')
print('Device:', device)
################################################################################
def train_model(model, optimizer, criterion, loader, metric):
    """Run one training pass over ``loader`` and return the mean batch loss.

    Each batch is reordered so that target lengths are descending, moved to
    the module-level ``device``, pushed through the model, and used for one
    optimizer step.

    Args:
        model: Called as ``model(inputs, input_lengths)``; must return
            ``(outputs, output_lengths)``.
        optimizer: Zeroed before and stepped after every batch.
        criterion: Called as ``criterion(outputs.transpose(0, 1), targets,
            output_lengths, target_lengths)``.
        loader: Iterable of ``(audio, audio_len, translations,
            translation_len)`` tensor batches.
        metric: Unused here; accepted so the signature mirrors ``eval_model``.

    Returns:
        The sum of per-batch ``loss.item()`` values divided by the number of
        batches.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    running_loss = 0.0
    num_batches = 0
    for audio, audio_len, translations, translation_len in loader:
        # Sort the batch so targets are in descending order of length.
        # Indexing with the sort permutation keeps whatever batch size and
        # padded widths the loader produced, so a short final batch no longer
        # leaves zero-filled rows behind or mismatches the length vector.
        target_lengths, order = torch.sort(translation_len, descending=True)

        # Transpose inputs from (batch, dim, seq_len) to (batch, seq_len, dim).
        inputs = audio[order].to(device=device, dtype=torch.float)
        inputs = torch.transpose(inputs, 1, 2)
        # Length vectors are not moved to `device`, as before.
        input_lengths = audio_len[order].to(torch.int)
        targets = translations[order].to(device=device, dtype=torch.int)

        optimizer.zero_grad()

        # Forward propagate.
        outputs, output_lengths = model(inputs, input_lengths)

        # Calculate the loss; the criterion receives the outputs with the
        # first two axes swapped relative to what the model returns.
        loss = criterion(outputs.transpose(0, 1), targets, output_lengths, target_lengths)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1
        output.clear(output_tags='some_outputs')

    if num_batches == 0:
        raise ValueError("loader yielded no batches")
    return running_loss / num_batches
################################################################################
def eval_model(model, optimizer, criterion, loader, metric):
    """Evaluate ``model`` on ``loader`` and return ``(mean_loss, mean_wer)``.

    Batches are reordered so that target lengths are descending, moved to the
    module-level ``device``, and scored with ``criterion`` under
    ``torch.no_grad()``. The output and target length tensors are printed for
    inspection.

    NOTE(review): evaluation still stops after the first batch (see the
    ``break`` below), as in the original debugging version.

    Args:
        model: Called as ``model(inputs, input_lengths)``; must return
            ``(outputs, output_lengths)``.
        optimizer: Unused; accepted so the signature mirrors ``train_model``.
        criterion: Called as ``criterion(outputs.transpose(0, 1), targets,
            output_lengths, target_lengths)``.
        loader: Iterable of ``(audio, audio_len, translations,
            translation_len)`` tensor batches.
        metric: Unused while the WER computation is disabled.

    Returns:
        A ``(loss_per_epoch, wer_per_epoch)`` tuple, each averaged over the
        batches actually evaluated. ``wer_per_epoch`` is always ``0.0`` for
        now because no WER is accumulated.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    running_loss = 0.0
    wer_calc = 0.0
    num_batches = 0

    # Evaluation never calls backward(), so skip autograd bookkeeping.
    with torch.no_grad():
        for audio, audio_len, translations, translation_len in loader:
            # Sort the batch so targets are in descending order of length.
            # Indexing with the sort permutation avoids fixed-size buffers,
            # so any batch size / padded width from the loader works.
            target_lengths, order = torch.sort(translation_len, descending=True)

            # Transpose inputs from (batch, dim, seq_len) to (batch, seq_len, dim).
            inputs = audio[order].to(device=device, dtype=torch.float)
            inputs = torch.transpose(inputs, 1, 2)
            # Length vectors are not moved to `device`, as before.
            input_lengths = audio_len[order].to(torch.int)
            targets = translations[order].to(device=device, dtype=torch.int)

            # Forward propagate.
            outputs, output_lengths = model(inputs, input_lengths)

            # Calculate the loss on the outputs with the first two axes swapped.
            loss = criterion(outputs.transpose(0, 1), targets, output_lengths, target_lengths)
            print(output_lengths)
            print(target_lengths)

            # Accumulate before the early exit; previously this statement sat
            # after the `break`, so the reported loss was always 0.0.
            running_loss += loss.item()
            num_batches += 1

            # TODO: restore WER tracking once prediction decoding is back:
            #   outputs_in_words = words_vocab.convert_pred_to_words(outputs.transpose(0, 1))
            #   targets_in_words = words_vocab.convert_pred_to_words(targets)
            #   wer_calc += metrics_calculation(metric, outputs_in_words, targets_in_words)
            # The random per-epoch sample print depended on those names and
            # is disabled with them.

            # NOTE(review): debugging short-circuit kept from the original;
            # remove it to evaluate the whole loader.
            break

    if num_batches == 0:
        raise ValueError("loader yielded no batches")
    loss_per_epoch = running_loss / num_batches
    wer_per_epoch = wer_calc / num_batches
    return loss_per_epoch, wer_per_epoch
################################################################################
def train_eval_model(epochs):
    """Build the model, optimizer, loss and metric, then loop over epochs.

    For each epoch the model is switched to training mode (the training call
    itself is currently commented out), then to evaluation mode, and
    ``eval_model`` is run on the module-level ``test_loader``. The validation
    loss and WER are printed.

    Args:
        epochs: Number of epochs to run.
    """
    # Conformer model init.
    model = nn.DataParallel(
        Conformer(num_classes=16121, input_dim=201, encoder_dim=32, num_encoder_layers=1)
    ).to(device)
    # Optimizers specified in the torch.optim package.
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
    # Loss function.
    criterion = nn.CTCLoss().to(device)
    # Metrics init.
    metric = torchmetrics.WordErrorRate()

    for epoch in range(epochs):
        print("Epoch", epoch+1)

        # TRAINING
        model.train()
        print("Training")
        # NOTE(review): the training step is disabled in this version:
        # epoch_loss=train_model(model=model,optimizer=optimizer, criterion=criterion, loader=train_loader, metric=metric)
        # print(f'Loss: {epoch_loss:.3f}')
        # print(f'WER: {epoch_wer:.3f}')

        # EVALUATION
        model.eval()  # same effect as model.train(False)
        print("Validation")
        epoch_val_loss, epoch_val_wer = eval_model(
            model=model,
            optimizer=optimizer,
            criterion=criterion,
            loader=test_loader,
            metric=metric,
        )
        print(f'Loss: {epoch_val_loss:.3f}')
        print(f'WER: {epoch_val_wer:.3f}')
################################################################################
def metrics_calculation(metric, predictions, targets):
    """Print the predictions and targets, then score them with ``metric``.

    Args:
        metric: Callable invoked as ``metric(predictions, targets)``.
        predictions: Passed through unchanged as the first metric argument.
        targets: Passed through unchanged as the second metric argument.

    Returns:
        Whatever ``metric(predictions, targets)`` returns.
    """
    print(predictions)
    print(targets)
    return metric(predictions, targets)
# Script entry point: run the train/eval driver for a single epoch.
train_eval_model(1)