Congratulations on the great work. I am getting the following error while trying to train with 4 GPUs. Could you please help me out?
File "/data/SoftTeacher/tools/train.py", line 198, in <module>
main()
File "/data/SoftTeacher/tools/train.py", line 186, in main
train_detector(
File "/data/SoftTeacher/ssod/apis/train.py", line 206, in train_detector
runner.run(data_loaders, cfg.workflow)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/runner/iter_based_runner.py", line 133, in run
iter_runner(iter_loaders[i], **kwargs)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/runner/iter_based_runner.py", line 60, in train
outputs = self.model.train_step(data_batch, self.optimizer, **kwargs)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/parallel/distributed.py", line 52, in train_step
output = self.module.train_step(*inputs[0], **kwargs[0])
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/detectors/base.py", line 238, in train_step
losses = self(**data)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/runner/fp16_utils.py", line 128, in new_func
output = old_func(*new_args, **new_kwargs)
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/detectors/base.py", line 172, in forward
return self.forward_train(img, img_metas, **kwargs)
File "/data/SoftTeacher/ssod/models/soft_teacher.py", line 44, in forward_train
sup_loss = self.student.forward_train(**data_groups["sup"])
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/detectors/two_stage.py", line 135, in forward_train
rpn_losses, proposal_list = self.rpn_head.forward_train(
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/dense_heads/base_dense_head.py", line 59, in forward_train
proposal_list = self.get_bboxes(*outs, img_metas, cfg=proposal_cfg)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/runner/fp16_utils.py", line 214, in new_func
output = old_func(*new_args, **new_kwargs)
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/dense_heads/rpn_head.py", line 152, in get_bboxes
proposals = self._get_bboxes_single(cls_score_list, bbox_pred_list,
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/dense_heads/rpn_head.py", line 244, in _get_bboxes_single
dets, keep = batched_nms(proposals, scores, ids, cfg.nms)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/ops/nms.py", line 307, in batched_nms
dets, keep = nms_op(boxes_for_nms, scores, **nms_cfg_)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/utils/misc.py", line 330, in new_func
output = old_func(*args, **kwargs)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/ops/nms.py", line 171, in nms
inds = NMSop.apply(boxes, scores, iou_threshold, offset,
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/ops/nms.py", line 26, in forward
inds = ext_module.nms(
RuntimeError: CUDA error: invalid device function
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
/data/SoftTeacher/thirdparty/mmdetection/mmdet/core/anchor/anchor_generator.py:324: UserWarning: ``grid_anchors`` would be deprecated soon. Please use ``grid_priors``
warnings.warn('``grid_anchors`` would be deprecated soon. '
/data/SoftTeacher/thirdparty/mmdetection/mmdet/core/anchor/anchor_generator.py:360: UserWarning: ``single_level_grid_anchors`` would be deprecated soon. Please use ``single_level_grid_priors``
warnings.warn(
Traceback (most recent call last):
File "/data/SoftTeacher/tools/train.py", line 198, in <module>
main()
File "/data/SoftTeacher/tools/train.py", line 186, in main
train_detector(
File "/data/SoftTeacher/ssod/apis/train.py", line 206, in train_detector
runner.run(data_loaders, cfg.workflow)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/runner/iter_based_runner.py", line 133, in run
iter_runner(iter_loaders[i], **kwargs)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/runner/iter_based_runner.py", line 60, in train
outputs = self.model.train_step(data_batch, self.optimizer, **kwargs)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/parallel/distributed.py", line 52, in train_step
output = self.module.train_step(*inputs[0], **kwargs[0])
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/detectors/base.py", line 238, in train_step
losses = self(**data)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/runner/fp16_utils.py", line 128, in new_func
output = old_func(*new_args, **new_kwargs)
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/detectors/base.py", line 172, in forward
return self.forward_train(img, img_metas, **kwargs)
File "/data/SoftTeacher/ssod/models/soft_teacher.py", line 44, in forward_train
sup_loss = self.student.forward_train(**data_groups["sup"])
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/detectors/two_stage.py", line 135, in forward_train
rpn_losses, proposal_list = self.rpn_head.forward_train(
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/dense_heads/base_dense_head.py", line 59, in forward_train
proposal_list = self.get_bboxes(*outs, img_metas, cfg=proposal_cfg)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/runner/fp16_utils.py", line 214, in new_func
output = old_func(*new_args, **new_kwargs)
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/dense_heads/rpn_head.py", line 152, in get_bboxes
proposals = self._get_bboxes_single(cls_score_list, bbox_pred_list,
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/dense_heads/rpn_head.py", line 244, in _get_bboxes_single
dets, keep = batched_nms(proposals, scores, ids, cfg.nms)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/ops/nms.py", line 307, in batched_nms
dets, keep = nms_op(boxes_for_nms, scores, **nms_cfg_)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/utils/misc.py", line 330, in new_func
output = old_func(*args, **kwargs)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/ops/nms.py", line 171, in nms
inds = NMSop.apply(boxes, scores, iou_threshold, offset,
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/ops/nms.py", line 26, in forward
inds = ext_module.nms(
RuntimeError: CUDA error: invalid device function
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
/data/SoftTeacher/thirdparty/mmdetection/mmdet/core/anchor/anchor_generator.py:324: UserWarning: ``grid_anchors`` would be deprecated soon. Please use ``grid_priors``
warnings.warn('``grid_anchors`` would be deprecated soon. '
/data/SoftTeacher/thirdparty/mmdetection/mmdet/core/anchor/anchor_generator.py:360: UserWarning: ``single_level_grid_anchors`` would be deprecated soon. Please use ``single_level_grid_priors``
warnings.warn(
Traceback (most recent call last):
File "/data/SoftTeacher/tools/train.py", line 198, in <module>
main()
File "/data/SoftTeacher/tools/train.py", line 186, in main
train_detector(
File "/data/SoftTeacher/ssod/apis/train.py", line 206, in train_detector
runner.run(data_loaders, cfg.workflow)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/runner/iter_based_runner.py", line 133, in run
iter_runner(iter_loaders[i], **kwargs)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/runner/iter_based_runner.py", line 60, in train
outputs = self.model.train_step(data_batch, self.optimizer, **kwargs)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/parallel/distributed.py", line 52, in train_step
output = self.module.train_step(*inputs[0], **kwargs[0])
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/detectors/base.py", line 238, in train_step
losses = self(**data)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/runner/fp16_utils.py", line 128, in new_func
output = old_func(*new_args, **new_kwargs)
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/detectors/base.py", line 172, in forward
return self.forward_train(img, img_metas, **kwargs)
File "/data/SoftTeacher/ssod/models/soft_teacher.py", line 44, in forward_train
sup_loss = self.student.forward_train(**data_groups["sup"])
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/detectors/two_stage.py", line 135, in forward_train
rpn_losses, proposal_list = self.rpn_head.forward_train(
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/dense_heads/base_dense_head.py", line 59, in forward_train
proposal_list = self.get_bboxes(*outs, img_metas, cfg=proposal_cfg)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/runner/fp16_utils.py", line 214, in new_func
output = old_func(*new_args, **new_kwargs)
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/dense_heads/rpn_head.py", line 152, in get_bboxes
proposals = self._get_bboxes_single(cls_score_list, bbox_pred_list,
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/dense_heads/rpn_head.py", line 244, in _get_bboxes_single
dets, keep = batched_nms(proposals, scores, ids, cfg.nms)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/ops/nms.py", line 307, in batched_nms
dets, keep = nms_op(boxes_for_nms, scores, **nms_cfg_)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/utils/misc.py", line 330, in new_func
output = old_func(*args, **kwargs)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/ops/nms.py", line 171, in nms
inds = NMSop.apply(boxes, scores, iou_threshold, offset,
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/ops/nms.py", line 26, in forward
inds = ext_module.nms(
RuntimeError: CUDA error: invalid device function
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
/data/SoftTeacher/thirdparty/mmdetection/mmdet/core/anchor/anchor_generator.py:324: UserWarning: ``grid_anchors`` would be deprecated soon. Please use ``grid_priors``
warnings.warn('``grid_anchors`` would be deprecated soon. '
/data/SoftTeacher/thirdparty/mmdetection/mmdet/core/anchor/anchor_generator.py:360: UserWarning: ``single_level_grid_anchors`` would be deprecated soon. Please use ``single_level_grid_priors``
warnings.warn(
Traceback (most recent call last):
File "/data/SoftTeacher/tools/train.py", line 198, in <module>
main()
File "/data/SoftTeacher/tools/train.py", line 186, in main
train_detector(
File "/data/SoftTeacher/ssod/apis/train.py", line 206, in train_detector
runner.run(data_loaders, cfg.workflow)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/runner/iter_based_runner.py", line 133, in run
iter_runner(iter_loaders[i], **kwargs)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/runner/iter_based_runner.py", line 60, in train
outputs = self.model.train_step(data_batch, self.optimizer, **kwargs)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/parallel/distributed.py", line 52, in train_step
output = self.module.train_step(*inputs[0], **kwargs[0])
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/detectors/base.py", line 238, in train_step
losses = self(**data)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/runner/fp16_utils.py", line 128, in new_func
output = old_func(*new_args, **new_kwargs)
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/detectors/base.py", line 172, in forward
return self.forward_train(img, img_metas, **kwargs)
File "/data/SoftTeacher/ssod/models/soft_teacher.py", line 44, in forward_train
sup_loss = self.student.forward_train(**data_groups["sup"])
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/detectors/two_stage.py", line 135, in forward_train
rpn_losses, proposal_list = self.rpn_head.forward_train(
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/dense_heads/base_dense_head.py", line 59, in forward_train
proposal_list = self.get_bboxes(*outs, img_metas, cfg=proposal_cfg)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/runner/fp16_utils.py", line 214, in new_func
output = old_func(*new_args, **new_kwargs)
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/dense_heads/rpn_head.py", line 152, in get_bboxes
proposals = self._get_bboxes_single(cls_score_list, bbox_pred_list,
File "/data/SoftTeacher/thirdparty/mmdetection/mmdet/models/dense_heads/rpn_head.py", line 244, in _get_bboxes_single
dets, keep = batched_nms(proposals, scores, ids, cfg.nms)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/ops/nms.py", line 307, in batched_nms
dets, keep = nms_op(boxes_for_nms, scores, **nms_cfg_)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/utils/misc.py", line 330, in new_func
output = old_func(*args, **kwargs)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/ops/nms.py", line 171, in nms
inds = NMSop.apply(boxes, scores, iou_threshold, offset,
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/mmcv/ops/nms.py", line 26, in forward
inds = ext_module.nms(
RuntimeError: CUDA error: invalid device function
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
wandb: Waiting for W&B process to finish, PID 38162
wandb: Program failed with code 1.
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -11) local_rank: 1 (pid: 37922) of binary: /home/ubuntu/anaconda3/envs/py39/bin/python
Traceback (most recent call last):
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/torch/distributed/launch.py", line 193, in <module>
main()
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/torch/distributed/launch.py", line 189, in main
launch(args)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/torch/distributed/launch.py", line 174, in launch
run(args)
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/torch/distributed/run.py", line 689, in run
elastic_launch(
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 116, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/ubuntu/anaconda3/envs/py39/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 244, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
**************************************************
tools/train.py FAILED
==================================================
Root Cause:
[0]:
time: 2021-09-29_15:26:15
rank: 1 (local_rank: 1)
exitcode: -11 (pid: 37922)
error_file: <N/A>
msg: "Signal 11 (SIGSEGV) received by PID 37922"
==================================================
Other Failures:
[1]:
time: 2021-09-29_15:26:15
rank: 3 (local_rank: 3)
exitcode: -11 (pid: 37924)
error_file: <N/A>
msg: "Signal 11 (SIGSEGV) received by PID 37924"
**************************************************