PyTorch 1.8
torchvision 0.9.0
CUDA 10.1
When training on YouTube-VOS (YTB) it reports the error below.
Here is the full traceback:
(torch18) cwc@imc-Z9PE-D8-WS:~/aot-benchmark-main/tools$ python train.py
Exp _AOTT:
{
"DATASETS": [
"youtubevos"
],
"DATA_DAVIS_REPEAT": 5,
"DATA_DYNAMIC_MERGE_PROB": 0.3,
"DATA_MAX_CROP_STEPS": 10,
"DATA_MAX_SCALE_FACTOR": 1.3,
"DATA_MIN_SCALE_FACTOR": 0.7,
"DATA_RANDOMCROP": [
465,
465
],
"DATA_RANDOMFLIP": 0.5,
"DATA_RANDOM_GAP_DAVIS": 12,
"DATA_RANDOM_GAP_YTB": 3,
"DATA_RANDOM_REVERSE_SEQ": true,
"DATA_SEQ_LEN": 5,
"DATA_SHORT_EDGE_LEN": 480,
"DATA_WORKERS": 8,
"DIR_CKPT": "./results/result/_AOTT/YTB/ckpt",
"DIR_DAVIS": "/DATACENTER/1/ysl/Datasets/DAVIS/2017",
"DIR_EMA_CKPT": "./results/result/_AOTT/YTB/ema_ckpt",
"DIR_EVALUATION": "./results/result/_AOTT/YTB/eval",
"DIR_IMG_LOG": "./results/result/_AOTT/YTB/log/img",
"DIR_LOG": "./results/result/_AOTT/YTB/log",
"DIR_RESULT": "./results/result/_AOTT/YTB",
"DIR_ROOT": "./results",
"DIR_STATIC": "/DATACENTER/1/Datasets/static",
"DIR_TB_LOG": "./results/result/_AOTT/YTB/log/tensorboard",
"DIR_YTB": "/DATACENTER/1/ysl/Datasets/YoutubeVOS",
"DIST_BACKEND": "nccl",
"DIST_ENABLE": true,
"DIST_START_GPU": 1,
"DIST_URL": "tcp://127.0.0.1:12311",
"EXP_NAME": "_AOTT",
"MODEL_ALIGN_CORNERS": true,
"MODEL_ATT_HEADS": 8,
"MODEL_DECODER_INTERMEDIATE_LSTT": true,
"MODEL_ENCODER": "mobilenetv2",
"MODEL_ENCODER_DIM": [
24,
32,
96,
1280
],
"MODEL_ENCODER_EMBEDDING_DIM": 256,
"MODEL_ENCODER_PRETRAIN": "/home/cwc/aot-benchmark-main/pretrain_models/mobilenet_v2-b0353104.pth",
"MODEL_ENGINE": "aotengine",
"MODEL_EPSILON": 1e-05,
"MODEL_FREEZE_BACKBONE": false,
"MODEL_FREEZE_BN": true,
"MODEL_LSTT_NUM": 1,
"MODEL_MAX_OBJ_NUM": 10,
"MODEL_NAME": "AOTT",
"MODEL_SELF_HEADS": 8,
"MODEL_USE_PREV_PROB": false,
"MODEL_VOS": "aot",
"PRETRAIN": true,
"PRETRAIN_FULL": false,
"PRETRAIN_MODEL": "",
"STAGE_NAME": "YTB",
"TEST_CKPT_PATH": null,
"TEST_CKPT_STEP": null,
"TEST_DATASET": "youtubevos",
"TEST_DATASET_FULL_RESOLUTION": false,
"TEST_DATASET_SPLIT": "val",
"TEST_FLIP": false,
"TEST_FRAME_LOG": false,
"TEST_GPU_ID": 1,
"TEST_GPU_NUM": 1,
"TEST_LONG_TERM_MEM_GAP": 9999,
"TEST_MAX_SIZE": 1040.0,
"TEST_MIN_SIZE": null,
"TEST_MULTISCALE": [
1
],
"TEST_WORKERS": 4,
"TRAIN_AUTO_RESUME": true,
"TRAIN_AUX_LOSS_RATIO": 1.0,
"TRAIN_AUX_LOSS_WEIGHT": 1.0,
"TRAIN_BATCH_SIZE": 4,
"TRAIN_CLIP_GRAD_NORM": 5.0,
"TRAIN_DATASET_FULL_RESOLUTION": false,
"TRAIN_EMA_RATIO": 0.1,
"TRAIN_ENABLE_PREV_FRAME": false,
"TRAIN_ENCODER_FREEZE_AT": 2,
"TRAIN_GPUS": 2,
"TRAIN_HARD_MINING_RATIO": 0.5,
"TRAIN_IMG_LOG": true,
"TRAIN_LOG_STEP": 20,
"TRAIN_LONG_TERM_MEM_GAP": 9999,
"TRAIN_LR": 0.0002,
"TRAIN_LR_COSINE_DECAY": false,
"TRAIN_LR_ENCODER_RATIO": 0.1,
"TRAIN_LR_MIN": 2e-05,
"TRAIN_LR_POWER": 0.9,
"TRAIN_LR_RESTART": 1,
"TRAIN_LR_UPDATE_STEP": 1,
"TRAIN_LR_WARM_UP_RATIO": 0.05,
"TRAIN_LSTT_DROPPATH": 0.1,
"TRAIN_LSTT_DROPPATH_LST": false,
"TRAIN_LSTT_DROPPATH_SCALING": false,
"TRAIN_LSTT_EMB_DROPOUT": 0.0,
"TRAIN_LSTT_ID_DROPOUT": 0.0,
"TRAIN_LSTT_LT_DROPOUT": 0.0,
"TRAIN_LSTT_ST_DROPOUT": 0.0,
"TRAIN_MAX_KEEP_CKPT": 8,
"TRAIN_OPT": "adamw",
"TRAIN_RESUME": false,
"TRAIN_RESUME_CKPT": null,
"TRAIN_RESUME_STEP": 0,
"TRAIN_SAVE_STEP": 1000,
"TRAIN_SEQ_TRAINING_FREEZE_PARAMS": [
"patch_wise_id_bank"
],
"TRAIN_SEQ_TRAINING_START_RATIO": 0.5,
"TRAIN_SGD_MOMENTUM": 0.9,
"TRAIN_START_STEP": 0,
"TRAIN_TBLOG": false,
"TRAIN_TBLOG_STEP": 50,
"TRAIN_TOP_K_PERCENT_PIXELS": 0.15,
"TRAIN_TOTAL_STEPS": 100000,
"TRAIN_WEIGHT_DECAY": 0.07,
"TRAIN_WEIGHT_DECAY_EXCLUSIVE": {},
"TRAIN_WEIGHT_DECAY_EXEMPTION": [
"absolute_pos_embed",
"relative_position_bias_table",
"relative_emb_v",
"conv_out"
]
}
Use GPU 1 for training VOS.
Build VOS model.
Use GPU 2 for training VOS.
Use Frozen BN in Encoder!
Total Param: 5.73M
Build optimizer.
Total Param: 5.73M
Process dataset...
Short object: 721bb6f2cb-3
Short object: 721bb6f2cb-3
Short object: d177e9878a-2
Short object: d177e9878a-3
Short object: d177e9878a-2
Short object: d177e9878a-3
Short object: f36483c824-2
Short object: f9bd1fabf5-4
Short object: f36483c824-2
Video Num: 3471 X 1
Done!
Short object: f9bd1fabf5-4
Video Num: 3471 X 1
Remove ['features.0.1.num_batches_tracked', 'features.1.conv.0.1.num_batches_tracked', 'features.1.conv.2.num_batches_tracked', 'features.2.conv.0.1.num_batches_tracked', 'features.2.conv.1.1.num_batches_tracked', 'features.2.conv.3.num_batches_tracked', 'features.3.conv.0.1.num_batches_tracked', 'features.3.conv.1.1.num_batches_tracked', 'features.3.conv.3.num_batches_tracked', 'features.4.conv.0.1.num_batches_tracked', 'features.4.conv.1.1.num_batches_tracked', 'features.4.conv.3.num_batches_tracked', 'features.5.conv.0.1.num_batches_tracked', 'features.5.conv.1.1.num_batches_tracked', 'features.5.conv.3.num_batches_tracked', 'features.6.conv.0.1.num_batches_tracked', 'features.6.conv.1.1.num_batches_tracked', 'features.6.conv.3.num_batches_tracked', 'features.7.conv.0.1.num_batches_tracked', 'features.7.conv.1.1.num_batches_tracked', 'features.7.conv.3.num_batches_tracked', 'features.8.conv.0.1.num_batches_tracked', 'features.8.conv.1.1.num_batches_tracked', 'features.8.conv.3.num_batches_tracked', 'features.9.conv.0.1.num_batches_tracked', 'features.9.conv.1.1.num_batches_tracked', 'features.9.conv.3.num_batches_tracked', 'features.10.conv.0.1.num_batches_tracked', 'features.10.conv.1.1.num_batches_tracked', 'features.10.conv.3.num_batches_tracked', 'features.11.conv.0.1.num_batches_tracked', 'features.11.conv.1.1.num_batches_tracked', 'features.11.conv.3.num_batches_tracked', 'features.12.conv.0.1.num_batches_tracked', 'features.12.conv.1.1.num_batches_tracked', 'features.12.conv.3.num_batches_tracked', 'features.13.conv.0.1.num_batches_tracked', 'features.13.conv.1.1.num_batches_tracked', 'features.13.conv.3.num_batches_tracked', 'features.14.conv.0.1.num_batches_tracked', 'features.14.conv.1.1.num_batches_tracked', 'features.14.conv.3.num_batches_tracked', 'features.15.conv.0.1.num_batches_tracked', 'features.15.conv.1.1.num_batches_tracked', 'features.15.conv.3.num_batches_tracked', 'features.16.conv.0.1.num_batches_tracked', 
'features.16.conv.1.1.num_batches_tracked', 'features.16.conv.3.num_batches_tracked', 'features.17.conv.0.1.num_batches_tracked', 'features.17.conv.1.1.num_batches_tracked', 'features.17.conv.3.num_batches_tracked', 'features.18.1.num_batches_tracked', 'classifier.1.weight', 'classifier.1.bias'] from pretrained model.
Load pretrained backbone model from .
Start training:
step------------------------------ 0
step------------------------------ 0
[W reducer.cpp:1050] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
[W reducer.cpp:1050] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
Traceback (most recent call last):
File "train.py", line 80, in <module>
main()
File "train.py", line 76, in main
mp.spawn(main_worker, nprocs=cfg.TRAIN_GPUS, args=(cfg, args.amp))
File "/home/cwc/anaconda3/envs/torch18/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 230, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/cwc/anaconda3/envs/torch18/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 188, in start_processes
while not context.join():
File "/home/cwc/anaconda3/envs/torch18/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 150, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/home/cwc/anaconda3/envs/torch18/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
fn(i, *args)
File "/home/cwc/aot-benchmark-main/tools/train.py", line 18, in main_worker
trainer.sequential_training()
File "/home/cwc/aot-benchmark-main/tools/../networks/managers/trainer.py", line 456, in sequential_training
loss.backward()
File "/home/cwc/anaconda3/envs/torch18/lib/python3.8/site-packages/torch/tensor.py", line 245, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/home/cwc/anaconda3/envs/torch18/lib/python3.8/site-packages/torch/autograd/__init__.py", line 145, in backward
Variable._execution_engine.run_backward(
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [900, 2, 256]], which is output 0 of AddBackward0, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).