Hello, author. I ran into the following error while training on my own COCO-format dataset with configs/bpr/hrnet18s_128.py using multi-GPU distributed training. Training proceeds normally for the first 150 iterations, but the job crashes as soon as the first evaluation starts after the checkpoint is saved at iteration 160. What could be the cause, and how can I solve it? Thank you very much. The full log is below:
2022-06-06 22:11:33,374 - mmseg - INFO - Iter [50/160000] lr: 9.997e-03, eta: 11:50:52, time: 0.267, data_time: 0.007, memory: 828, decode.loss_seg: 0.1570, decode.acc_seg: 94.3870, loss: 0.1570
2022-06-06 22:11:44,482 - mmseg - INFO - Iter [100/160000] lr: 9.994e-03, eta: 10:51:19, time: 0.222, data_time: 0.003, memory: 828, decode.loss_seg: 0.1489, decode.acc_seg: 94.5770, loss: 0.1489
2022-06-06 22:11:55,525 - mmseg - INFO - Iter [150/160000] lr: 9.992e-03, eta: 10:30:11, time: 0.221, data_time: 0.003, memory: 828, decode.loss_seg: 0.1607, decode.acc_seg: 94.2994, loss: 0.1607
2022-06-06 22:11:57,747 - mmseg - INFO - Saving checkpoint at 160 iterations
[ ] 0/94768, elapsed: 0s, ETA:
Traceback (most recent call last):
File "tools/train.py", line 161, in
main()
File "tools/train.py", line 157, in main
meta=meta)
File "/data/home/scv4589/run/BPR-main/mmseg/apis/train.py", line 116, in train_segmentor
runner.run(data_loaders, cfg.workflow)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 130, in run
iter_runner(iter_loaders[i], **kwargs)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 66, in train
self.call_hook('after_train_iter')
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/base_runner.py", line 307, in call_hook
getattr(hook, fn_name)(self)
File "/data/home/scv4589/run/BPR-main/mmseg/core/evaluation/eval_hooks.py", line 89, in after_train_iter
gpu_collect=self.gpu_collect)
File "/data/home/scv4589/run/BPR-main/mmseg/apis/test.py", line 99, in multi_gpu_test
result = model(return_loss=False, rescale=True, **data)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in call_impl
result = self.forward(*input, **kwargs)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 606, in forward
if self.reducer.rebuild_buckets():
RuntimeError: replicas[0].size() == rebuilt_param_indices.size() INTERNAL ASSERT FAILED at "/pytorch/torch/csrc/distributed/c10d/reducer.cpp":1326, please report a bug to PyTorch. rebuilt parameter indices size is not same as original model parameters size.438 versus 70080
(The same traceback is printed three more times, once by each of the other distributed worker processes.)
Traceback (most recent call last):
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"main", mod_spec)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/distributed/launch.py", line 260, in
main()
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/distributed/launch.py", line 256, in main
cmd=cmd)
subprocess.CalledProcessError: Command '['/data/home/scv4589/.conda/envs/bpr/bin/python', '-u', 'tools/train.py', '--local_rank=3', 'configs/bpr/hrnet18s_128.py', '--launcher', 'pytorch']' returned non-zero exit status 1.
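For context, here is roughly how the job was launched, reconstructed from the CalledProcessError above. The exact wrapper and GPU count are my assumption (the failing process has --local_rank=3, so at least four processes were started); if the repo's tools/dist_train.sh is used, it should expand to something like this:

# assumed launch command: 4 GPUs via torch.distributed.launch
python -m torch.distributed.launch --nproc_per_node=4 \
    tools/train.py configs/bpr/hrnet18s_128.py --launcher pytorch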