I am trying to train Video Swin Transformer on the HMDB51 dataset. I wrote a configuration file based on https://github.com/SwinTransformer/Video-Swin-Transformer/blob/master/configs/recognition/tsn/tsn_r50_1x1x8_50e_hmdb51_imagenet_rgb.py and the README files, but training fails with an error.
Here is my configuration file.
```python
_base_ = [
    '../../_base_/models/swin/swin_base.py', '../../_base_/default_runtime.py'
]
model = dict(
    backbone=dict(patch_size=(2, 4, 4), drop_path_rate=0.3),
    test_cfg=dict(max_testing_views=4),
    cls_head=dict(num_classes=174))

# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/hmdb51/rawframes'
data_root_val = 'data/hmdb51/rawframes'
ann_file_train = 'data/hmdb51/hmdb51_train_split_1_rawframes.txt'
ann_file_val = 'data/hmdb51/hmdb51_val_split_1_rawframes.txt'
ann_file_test = 'data/hmdb51/hmdb51_val_split_1_rawframes.txt'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='RandomResizedCrop'),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Flip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=256),
    dict(type='Flip', flip_ratio=0),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=256),
    dict(type='Flip', flip_ratio=0),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
data = dict(
    videos_per_gpu=2,
    workers_per_gpu=1,
    val_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1),
    test_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1),
    train=dict(
        type=dataset_type,
        ann_file=ann_file_train,
        data_prefix=data_root,
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=data_root_val,
        pipeline=val_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=data_root_val,
        pipeline=test_pipeline))
evaluation = dict(
    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
# optimizer
optimizer = dict(
    type='AdamW',
    lr=1e-3,
    betas=(0.9, 0.999),
    weight_decay=0.05,
    paramwise_cfg=dict(
        custom_keys={
            'absolute_pos_embed': dict(decay_mult=0.),
            'relative_position_bias_table': dict(decay_mult=0.),
            'norm': dict(decay_mult=0.),
            'backbone': dict(lr_mult=0.1)
        }))
# learning policy
lr_config = dict(
    policy='CosineAnnealing',
    min_lr=0,
    warmup='linear',
    warmup_by_epoch=True,
    warmup_iters=2.5)
total_epochs = 30
# runtime settings
checkpoint_config = dict(interval=1)
work_dir = './work_dirs/hmdb51_swin_base_patch244_window877.py'
find_unused_parameters = False
# do not use mmdet version fp16
fp16 = None
optimizer_config = dict(
    type='DistOptimizerHook',
    update_interval=8,
    grad_clip=None,
    coalesce=True,
    bucket_size_mb=-1,
    use_fp16=True,
)
```
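For reference, the merged config is echoed in the training log below, and it can also be inspected directly with MMCV's `Config` API. A minimal sketch of how I checked what the runner actually receives (MMCV 1.3.3, same environment as the log; the config path is the one I train with below):

```python
# Minimal sketch: load the merged config and print the model type
# plus the steps of the final train pipeline.
from mmcv import Config

cfg = Config.fromfile(
    'configs/recognition/swin/swin_base_patch244_window877_hmdb51.py')
print(cfg.model.type)  # -> 'Recognizer3D' (matches the dumped config in the log)
for step in cfg.data.train.pipeline:
    print(step['type'])  # ends with FormatShape / Collect / ToTensor
```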
I tried to start training with `python tools/train.py configs/recognition/swin/swin_base_patch244_window877_hmdb51.py`, but it failed with the following error log:

```
root@c265ec69239e:/home/Video-Swin-Transformer# python tools/train.py configs/recognition/swin/swin_base_patch244_window877_hmdb51.py
2022-07-07 05:46:41,569 - mmaction - INFO - Environment info:
sys.platform: linux
Python: 3.7.7 (default, Mar 26 2020, 15:48:22) [GCC 7.3.0]
CUDA available: True
GPU 0: Tesla V100-PCIE-32GB
CUDA_HOME: /usr/local/cuda
NVCC: Build cuda_11.0_bu.TC445_37.28845127_0
GCC: gcc (Ubuntu 5.4.0-6ubuntu1~16.04.12) 5.4.0 20160609
PyTorch: 1.7.1
PyTorch compiling details: PyTorch built with:
- GCC 7.3
- C++ Version: 201402
- Intel(R) oneAPI Math Kernel Library Version 2021.2-Product Build 20210312 for Intel(R) 64 architecture applications
- Intel(R) MKL-DNN v1.6.0 (Git Hash 5ef631a030a6f73131c77892041042805a06064f)
- OpenMP 201511 (a.k.a. OpenMP 4.5)
- NNPACK is enabled
- CPU capability usage: AVX2
- CUDA Runtime 11.0
- NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_37,code=compute_37
- CuDNN 8.0.5
- Magma 2.5.2
- Build settings: BLAS=MKL, BUILD_TYPE=Release, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -fopenmp -DNDEBUG -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DUSE_VULKAN_WRAPPER -O2 -fPIC -Wno-narrowing -Wall -Wextra -Werror=return-type -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-unused-local-typedefs -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, USE_CUDA=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON,
TorchVision: 0.8.2
OpenCV: 3.4.2
MMCV: 1.3.3
MMCV Compiler: GCC 5.4
MMCV CUDA Compiler: not available
MMAction2: 0.15.0+db018fb
2022-07-07 05:46:41,570 - mmaction - INFO - Distributed training: False
2022-07-07 05:46:42,085 - mmaction - INFO - Config: model = dict(
type='Recognizer3D',
backbone=dict(
type='SwinTransformer3D',
patch_size=(2, 4, 4),
embed_dim=128,
depths=[2, 2, 18, 2],
num_heads=[4, 8, 16, 32],
window_size=(8, 7, 7),
mlp_ratio=4.0,
qkv_bias=True,
qk_scale=None,
drop_rate=0.0,
attn_drop_rate=0.0,
drop_path_rate=0.3,
patch_norm=True),
cls_head=dict(
type='I3DHead',
in_channels=1024,
num_classes=174,
spatial_type='avg',
dropout_ratio=0.5),
test_cfg=dict(average_clips='prob', max_testing_views=4))
checkpoint_config = dict(interval=1)
log_config = dict(interval=20, hooks=[dict(type='TextLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
dataset_type = 'RawframeDataset'
data_root = 'data/hmdb51/rawframes'
data_root_val = 'data/hmdb51/rawframes'
ann_file_train = 'data/hmdb51/hmdb51_train_split_1_rawframes.txt'
ann_file_val = 'data/hmdb51/hmdb51_val_split_1_rawframes.txt'
ann_file_test = 'data/hmdb51/hmdb51_val_split_1_rawframes.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_bgr=False),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Flip', flip_ratio=0),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_bgr=False),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Flip', flip_ratio=0),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_bgr=False),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=2,
workers_per_gpu=1,
val_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1),
test_dataloader=dict(videos_per_gpu=1, workers_per_gpu=1),
train=dict(
type='RawframeDataset',
ann_file='data/hmdb51/hmdb51_train_split_1_rawframes.txt',
data_prefix='data/hmdb51/rawframes',
pipeline=[
dict(
type='SampleFrames', clip_len=1, frame_interval=1,
num_clips=8),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_bgr=False),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]),
val=dict(
type='RawframeDataset',
ann_file='data/hmdb51/hmdb51_val_split_1_rawframes.txt',
data_prefix='data/hmdb51/rawframes',
pipeline=[
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Flip', flip_ratio=0),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_bgr=False),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]),
test=dict(
type='RawframeDataset',
ann_file='data/hmdb51/hmdb51_val_split_1_rawframes.txt',
data_prefix='data/hmdb51/rawframes',
pipeline=[
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=256),
dict(type='Flip', flip_ratio=0),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_bgr=False),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]))
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
optimizer = dict(
type='AdamW',
lr=0.001,
betas=(0.9, 0.999),
weight_decay=0.05,
paramwise_cfg=dict(
custom_keys=dict(
absolute_pos_embed=dict(decay_mult=0.0),
relative_position_bias_table=dict(decay_mult=0.0),
norm=dict(decay_mult=0.0),
backbone=dict(lr_mult=0.1))))
lr_config = dict(
policy='CosineAnnealing',
min_lr=0,
warmup='linear',
warmup_by_epoch=True,
warmup_iters=2.5)
total_epochs = 30
work_dir = './work_dirs/hmdb51_swin_base_patch244_window877.py'
find_unused_parameters = False
gpu_ids = range(0, 1)
omnisource = False
module_hooks = []
2022-07-07 05:46:49,300 - mmaction - INFO - Start running, host: root@c265ec69239e, work_dir: /home/Video-Swin-Transformer/work_dirs/hmdb51_swin_base_patch244_window877.py
2022-07-07 05:46:49,300 - mmaction - INFO - workflow: [('train', 1)], max: 30 epochs
Traceback (most recent call last):
  File "tools/train.py", line 200, in <module>
    main()
  File "tools/train.py", line 196, in main
    meta=meta)
  File "/home/Video-Swin-Transformer/mmaction/apis/train.py", line 195, in train_model
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs, **runner_kwargs)
  File "/opt/conda/lib/python3.7/site-packages/mmcv/runner/epoch_based_runner.py", line 125, in run
    epoch_runner(data_loaders[i], **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/mmcv/runner/epoch_based_runner.py", line 50, in train
    self.run_iter(data_batch, train_mode=True, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/mmcv/runner/epoch_based_runner.py", line 30, in run_iter
    **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/mmcv/parallel/data_parallel.py", line 67, in train_step
    return self.module.train_step(*inputs[0], **kwargs[0])
  File "/home/Video-Swin-Transformer/mmaction/models/recognizers/base.py", line 294, in train_step
    losses = self(imgs, label, return_loss=True, **aux_info)
  File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/Video-Swin-Transformer/mmaction/models/recognizers/base.py", line 256, in forward
    return self.forward_train(imgs, label, **kwargs)
  File "/home/Video-Swin-Transformer/mmaction/models/recognizers/recognizer3d.py", line 19, in forward_train
    x = self.extract_feat(imgs)
  File "/opt/conda/lib/python3.7/site-packages/mmcv/runner/fp16_utils.py", line 95, in new_func
    return old_func(*args, **kwargs)
  File "/home/Video-Swin-Transformer/mmaction/models/recognizers/base.py", line 157, in extract_feat
    x = self.backbone(imgs)
  File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/Video-Swin-Transformer/mmaction/models/backbones/swin_transformer.py", line 652, in forward
    x = self.patch_embed(x)
  File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/Video-Swin-Transformer/mmaction/models/backbones/swin_transformer.py", line 441, in forward
    _, _, D, H, W = x.size()
ValueError: not enough values to unpack (expected 5, got 4)
```
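If I am reading the traceback right, `swin_transformer.py` line 441 unpacks five dimensions from the input (`_, _, D, H, W = x.size()`), i.e. the 3D patch embedding expects an `(N, C, D, H, W)` video tensor, while my pipeline ends with `FormatShape(input_format='NCHW')`, which would produce 4D image batches. A minimal sketch of what I think is happening (plain PyTorch, not the actual model code):

```python
import torch

# 5-D video batch (N, C, D, H, W): what a 3D backbone expects.
x_video = torch.randn(2, 3, 8, 224, 224)
_, _, D, H, W = x_video.size()  # unpacks fine

# 4-D image batch (N, C, H, W): what FormatShape(input_format='NCHW') yields.
x_image = torch.randn(16, 3, 224, 224)
_, _, D, H, W = x_image.size()
# ValueError: not enough values to unpack (expected 5, got 4)
```

My guess is that the TSN-style pipeline I copied is meant for a 2D recognizer, and that for `Recognizer3D` I should be sampling multi-frame clips and using `input_format='NCTHW'` as the Kinetics Swin configs appear to do, but I am not sure whether that is the right fix.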
Can somebody please help me figure out what is going wrong?