Dear all,
It seems that torchbearer does not want to work for me. I am simply trying to classify images using a ResNet.
You can find my code here (https://github.com/FrancescoSaverioZuppichini/PyTorch-Deep-Learning-Template/tree/feature/cuda-error); the main training logic is:
import time
from comet_ml import Experiment
import torchbearer
import torch.optim as optim
import torch.nn as nn
from torchsummary import summary
from Project import Project
from data import get_dataloaders
from data.transformation import train_transform, val_transform
from models import MyCNN, resnet18
from utils import device, show_dl
from torchbearer import Trial
from torchbearer.callbacks import CSVLogger, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from callbacks import CometCallback
from logger import logging
if __name__ == '__main__':
    # Entry point: fine-tune a ResNet-18 classifier with torchbearer, track the
    # run on Comet, and evaluate the trained model on the test split.
    project = Project()
    # our hyperparameters
    params = {
        'lr': 0.001,
        'batch_size': 64,
        'epochs': 1,
        'model': 'resnet18-finetune',
        # wall-clock timestamp, used to give this run's checkpoint a unique name
        'id': time.time(),
    }
    logging.info(f'Using device={device} 🚀')
    # everything starts with the data
    train_dl, val_dl, test_dl = get_dataloaders(
        project.data_dir,
        val_transform=val_transform,
        train_transform=train_transform,
        batch_size=params['batch_size'],
        num_workers=4,
    )
    # it is always good practice to visualise some of the train and val images
    # to be sure data augmentation is applied properly
    # show_dl(train_dl)
    # show_dl(test_dl)
    # define our comet experiment
    # NOTE(review): the API key is hard-coded in source control; consider
    # revoking it and supplying it through Comet's configuration instead.
    experiment = Experiment(api_key='8THqoAxomFyzBgzkStlY95MOf',
                            project_name="dl-pytorch-template",
                            workspace="francescosaveriozuppichini")
    experiment.log_parameters(params)
    # create our special resnet18
    # n_classes must cover every label the dataloaders emit: CrossEntropyLoss
    # requires 0 <= target < n_classes and trips a device-side assertion on
    # CUDA otherwise.
    cnn = resnet18(n_classes=2).to(device)
    loss = nn.CrossEntropyLoss()
    # print the model summary to show useful information.
    # `summary` prints the table itself and returns None, so its result is not
    # passed to the logger (doing so only logs the literal "None").
    # The input size is (channels, height, width); 224x224 replaces the
    # original (3, 224, 244), which looked like a typo.
    summary(cnn, (3, 224, 224))
    # define a custom optimizer for the model parameters
    optimizer = optim.Adam(cnn.parameters(), lr=params['lr'])
    # create our Trial object to train and evaluate the model
    trial = Trial(
        cnn,
        optimizer,
        loss,
        metrics=['acc', 'loss'],
        callbacks=[
            CometCallback(experiment),
            ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5),
            EarlyStopping(monitor='val_acc', patience=5, mode='max'),
            CSVLogger(str(project.checkpoint_dir / 'history.csv')),
            ModelCheckpoint(
                str(project.checkpoint_dir / f'{params["id"]}-best.pt'),
                monitor='val_acc',
                mode='max',
            ),
        ],
    ).to(device)
    trial.with_generators(train_generator=train_dl,
                          val_generator=val_dl,
                          test_generator=test_dl)
    history = trial.run(epochs=params['epochs'], verbose=1)
    logging.info(history)
    # NOTE(review): torchbearer documents `evaluate` as returning the final
    # metric values rather than raw predictions - confirm; the log message
    # text is kept unchanged.
    test_metrics = trial.evaluate(data_key=torchbearer.TEST_DATA)
    logging.info(f'test preds=({test_metrics})')
    # experiment.log_metric('test_acc', test_acc)
I am running the same logic (same model) with poutyne and I have no problems. I would really like to switch to torchbearer.
The error is:
2020-02-03 13:32:03,386 - [INFO] - None
0%| | 0/1 [00:00<?, ?it/s]C:/w/1/s/tmp_conda_3.7_100118/conda/conda-bld/pytorch_1579082551706/work/aten/src/THCUNN/ClassNLLCriterion.cu:106: block: [0,0,0], thread: [2,0,0] Assertion `t >= 0 && t < n_classes` failed.
C:/w/1/s/tmp_conda_3.7_100118/conda/conda-bld/pytorch_1579082551706/work/aten/src/THCUNN/ClassNLLCriterion.cu:106: block: [0,0,0], thread: [13,0,0] Assertion `t >= 0 && t < n_classes` failed.
C:/w/1/s/tmp_conda_3.7_100118/conda/conda-bld/pytorch_1579082551706/work/aten/src/THCUNN/ClassNLLCriterion.cu:106: block: [0,0,0], thread: [17,0,0] Assertion `t >= 0 && t < n_classes` failed.
C:/w/1/s/tmp_conda_3.7_100118/conda/conda-bld/pytorch_1579082551706/work/aten/src/THCUNN/ClassNLLCriterion.cu:106: block: [0,0,0], thread: [20,0,0] Assertion `t >= 0 && t < n_classes` failed.
C:/w/1/s/tmp_conda_3.7_100118/conda/conda-bld/pytorch_1579082551706/work/aten/src/THCUNN/ClassNLLCriterion.cu:106: block: [0,0,0], thread: [21,0,0] Assertion `t >= 0 && t < n_classes` failed.
C:/w/1/s/tmp_conda_3.7_100118/conda/conda-bld/pytorch_1579082551706/work/aten/src/THCUNN/ClassNLLCriterion.cu:106: block: [0,0,0], thread: [22,0,0] Assertion `t >= 0 && t < n_classes` failed.
C:/w/1/s/tmp_conda_3.7_100118/conda/conda-bld/pytorch_1579082551706/work/aten/src/THCUNN/ClassNLLCriterion.cu:106: block: [0,0,0], thread: [23,0,0] Assertion `t >= 0 && t < n_classes` failed.
C:/w/1/s/tmp_conda_3.7_100118/conda/conda-bld/pytorch_1579082551706/work/aten/src/THCUNN/ClassNLLCriterion.cu:106: block: [0,0,0], thread: [25,0,0] Assertion `t >= 0 && t < n_classes` failed.
C:/w/1/s/tmp_conda_3.7_100118/conda/conda-bld/pytorch_1579082551706/work/aten/src/THCUNN/ClassNLLCriterion.cu:106: block: [0,0,0], thread: [29,0,0] Assertion `t >= 0 && t < n_classes` failed.
C:/w/1/s/tmp_conda_3.7_100118/conda/conda-bld/pytorch_1579082551706/work/aten/src/THCUNN/ClassNLLCriterion.cu:106: block: [0,0,0], thread: [30,0,0] Assertion `t >= 0 && t < n_classes` failed.
Traceback (most recent call last):
File "c:/Users/Francesco/Documents/PyTorch-Deep-Learning-Template/main.py", line 64, in <module>
history = trial.run(epochs=params['epochs'], verbose=1)
File "C:\Users\Francesco\Anaconda3\envs\dl\lib\site-packages\torchbearer\trial.py", line 133, in wrapper
res = func(self, *args, **kwargs)
File "C:\Users\Francesco\Anaconda3\envs\dl\lib\site-packages\torchbearer\trial.py", line 988, in run
final_metrics = self._fit_pass(state)[torchbearer.METRICS]
File "C:\Users\Francesco\Anaconda3\envs\dl\lib\site-packages\torchbearer\trial.py", line 298, in wrapper
res = func(self, *args, **kwargs)
File "C:\Users\Francesco\Anaconda3\envs\dl\lib\site-packages\torchbearer\trial.py", line 1033, in _fit_pass
state[torchbearer.OPTIMIZER].step(lambda: self.closure(state))
File "C:\Users\Francesco\Anaconda3\envs\dl\lib\site-packages\torch\optim\adam.py", line 58, in step
loss = closure()
File "C:\Users\Francesco\Anaconda3\envs\dl\lib\site-packages\torchbearer\trial.py", line 1033, in <lambda>
state[torchbearer.OPTIMIZER].step(lambda: self.closure(state))
File "C:\Users\Francesco\Anaconda3\envs\dl\lib\site-packages\torchbearer\bases.py", line 382, in closure
state[loss].backward(**state[torchbearer.BACKWARD_ARGS])
File "C:\Users\Francesco\Anaconda3\envs\dl\lib\site-packages\comet_ml\monkey_patching.py", line 246, in wrapper
return_value = original(*args, **kwargs)
File "C:\Users\Francesco\Anaconda3\envs\dl\lib\site-packages\torch\tensor.py", line 195, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "C:\Users\Francesco\Anaconda3\envs\dl\lib\site-packages\torch\autograd\__init__.py", line 99, in backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`
Does your library work for you? Do you use it in your daily workflow?
Thank you.
Cheers,
Francesco Saverio