Hi,
Thanks for your excellent work, but I encountered an issue when running
./scripts/bindiff.sh
Training environment:
torch 1.10.0+cu111
torchaudio 0.10.0+cu111
torchvision 0.11.0+cu111
GPU: RTX 2080ti
RAM: 11GB
1. no argument "use_pooling" in @register_criterion('poj_similarity')
@register_criterion('poj_similarity')
class PojSimilarityLoss(FairseqCriterion):

    def __init__(self, args, task):
        super().__init__(args, task)
        self.inst_padding_idx = task.instruction_dictionary.pad()
        self.state_padding_idx = task.state_dictionary.pad()
        self.task = task
        self.args = args

    def forward(self, model, sample, reduce=True, train=True):
        no_state = self.args.no_state
        no_pce = self.args.no_pce
        pooling = self.args.use_pooling
        output = model(**sample['net_input'], masked_tokens=None, features_only=True, moco_head=False,
                       moco_head_only_proj=False, lm_head=False, classification_head_name=None,
                       has_state=not no_state, has_pce=not no_pce, pooling_instruction=pooling)
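My guess is that the flag is simply never registered for this criterion. If so, I suppose it could be exposed through the usual fairseq add_args hook, roughly like the sketch below; the flag name and help text are just my assumption, I am not sure this is the fix the authors intended:

    # hypothetical addition inside PojSimilarityLoss, mirroring other fairseq criteria
    @staticmethod
    def add_args(parser):
        # fairseq maps --use-pooling onto args.use_pooling
        parser.add_argument('--use-pooling', action='store_true', default=False,
                            help='enable pooling of instruction representations')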
For my run I worked around it by changing the use_pooling line to
#pooling = self.args.use_pooling
pooling = self.args.no_pooling
and then got another error:
2. got multiple values for keyword argument 'has_pce'
File "/mnt/g/Projects/OSCAR/model/fairseq/models/irbert/model.py", line 92, in forward
x, extra = self.decoder(src, features_only, return_all_hiddens, moco_head=moco_head, has_state=has_state,
TypeError: IRBertEncoder object got multiple values for keyword argument 'has_pce'
After removing this keyword, I got another error:
3. got multiple values for keyword argument 'pooling_instruction'
File "/mnt/g/Projects/OSCAR/model/fairseq/models/irbert/model.py", line 92, in forward
x, extra = self.decoder(src, features_only, return_all_hiddens, moco_head=moco_head, has_state=has_state,
TypeError: IRBertEncoder object got multiple values for keyword argument 'pooling_instruction'
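If I understand these two TypeErrors correctly, the same keyword reaches IRBertEncoder.forward twice: once explicitly and once again through an unpacked kwargs dict. A minimal standalone example of that error class (not OSCAR code, just to illustrate what I think is happening):

    def decoder(src, has_pce=True, pooling_instruction=False):
        return src

    extra = {'has_pce': True, 'pooling_instruction': True}
    # has_pce arrives both explicitly and via **extra:
    decoder('x', has_pce=False, **extra)
    # TypeError: decoder() got multiple values for keyword argument 'has_pce'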
After removing this one as well, I got yet another error (the log starts with a PyTorch deprecation warning about add_, followed by the traceback):
add_(Tensor other, *, Number alpha) (Triggered internally at ../torch/csrc/utils/python_arg_parser.cpp:1050.)
exp_avg.mul_(beta1).add_(1 - beta1, grad)
Traceback (most recent call last):
File "train.py", line 356, in <module>
cli_main()
File "train.py", line 321, in cli_main
main(args)
File "train.py", line 95, in main
train(args, trainer, task, epoch_itr)
File "train.py", line 139, in train
log_output = trainer.train_step(samples)
File "/OSCAR/model/fairseq/trainer.py", line 346, in train_step
raise e
File "/Projects/OSCAR/model/fairseq/trainer.py", line 309, in train_step
loss, sample_size, logging_output = self.task.train_step(
File "/OSCAR/model/fairseq/tasks/fairseq_task.py", line 248, in train_step
optimizer.backward(loss)
File "/OSCAR/model/fairseq/optim/fp16_optimizer.py", line 103, in backward
loss.backward()
File "/lib/python3.8/site-packages/torch/_tensor.py", line 307, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/lib/python3.8/site-packages/torch/autograd/__init__.py", line 154, in backward
Variable._execution_engine.run_backward(
RuntimeError: CUDA error: unknown error
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
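As a side note, the add_ line at the top of that last log looks like the positional overload that newer PyTorch versions deprecate; as far as I understand the API change, the keyword form is the replacement, although I doubt it is related to the CUDA error itself. Sketch of what I mean:

    import torch

    beta1 = 0.9
    exp_avg = torch.zeros(4)
    grad = torch.ones(4)

    # old, deprecated positional form (as in the optimizer line above):
    # exp_avg.mul_(beta1).add_(1 - beta1, grad)

    # non-deprecated keyword form:
    exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)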
I don't know how to fix this. Can you tell me what I missed? Thanks in advance for your time.