Train command
%cd /home/ec2-user/SageMaker/SDR
!python sdr_main.py --dataset_name wines
Stacktrace:
Traceback (most recent call last):
File "sdr_main.py", line 80, in <module>
main()
File "sdr_main.py", line 28, in main
main_train(model_class_pointer, hyperparams,parser)
File "sdr_main.py", line 72, in main_train
trainer.fit(model)
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 510, in fit
results = self.accelerator_backend.train()
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pytorch_lightning/accelerators/accelerator.py", line 57, in train
return self.train_or_test()
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pytorch_lightning/accelerators/accelerator.py", line 74, in train_or_test
results = self.trainer.train()
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in train
self.train_loop.run_training_epoch()
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pytorch_lightning/trainer/training_loop.py", line 550, in run_training_epoch
batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pytorch_lightning/trainer/training_loop.py", line 692, in run_training_batch
self.trainer.hiddens)
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pytorch_lightning/trainer/training_loop.py", line 806, in training_step_and_backward
result = self.training_step(split_batch, batch_idx, opt_idx, hiddens)
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pytorch_lightning/trainer/training_loop.py", line 319, in training_step
training_step_output = self.trainer.accelerator_backend.training_step(args)
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pytorch_lightning/accelerators/dp_accelerator.py", line 117, in training_step
return self._step(args)
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pytorch_lightning/accelerators/dp_accelerator.py", line 113, in _step
output = self.trainer.model(*args)
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pytorch_lightning/overrides/data_parallel.py", line 93, in forward
return self.module.training_step(*inputs[0], **kwargs[0])
File "/home/ec2-user/SageMaker/SDR/models/doc_similarity_pl_template.py", line 49, in training_step
batch = self(batch)
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ec2-user/SageMaker/SDR/models/SDR/SDR.py", line 78, in forward
eval(f"self.forward_{self.hparams.mode}")(batch)
File "/home/ec2-user/SageMaker/SDR/models/SDR/SDR.py", line 48, in forward_train
run_mlm=True,
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ec2-user/SageMaker/SDR/models/SDR/similarity_modeling.py", line 129, in forward
return_dict=return_dict,
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/transformers/modeling_bert.py", line 835, in forward
return_dict=return_dict,
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/transformers/modeling_bert.py", line 490, in forward
output_attentions,
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/transformers/modeling_bert.py", line 433, in forward
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/transformers/modeling_utils.py", line 1597, in apply_chunking_to_forward
return forward_fn(*input_tensors)
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/transformers/modeling_bert.py", line 439, in feed_forward_chunk
intermediate_output = self.intermediate(attention_output)
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/transformers/modeling_bert.py", line 367, in forward
hidden_states = self.intermediate_act_fn(hidden_states)
File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/functional.py", line 1556, in gelu
return torch._C._nn.gelu(input)
RuntimeError: CUDA out of memory. Tried to allocate 32.00 MiB (GPU 0; 14.76 GiB total capacity; 11.17 GiB already allocated; 14.75 MiB free; 11.40 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF