I have trained a simple model on IWSLT'14 German-to-English. When I run 'paragen-export --config configs/export.yaml' to export it, I get an error. Could you help me? Thank you.
/root/anaconda3/envs/python38/lib/python3.8/site-packages/horovod/common/util.py:258: UserWarning: Framework pytorch installed with version 1.11.0+cu102 but found version 1.11.0.
This can result in unexpected behavior including runtime errors.
Reinstall Horovod using pip install --no-cache-dir
to build with the new version.
warnings.warn(get_version_mismatch_message(name, version, installed_version))
2022-06-07,03:32:52 | INFO | paragen.utils.runtime | Create environment with
{
"debug": false,
"device": "cuda",
"distributed_world": 1,
"fp16": false,
"local_rank": 0,
"no_progress_bar": false,
"no_warning": false,
"pb_interval": 1,
"profiling_window": 0,
"rank": 0
}
2022-06-07,03:32:52 | INFO | paragen.utils.registry | Creating TranslationTask class with configs
{
"data": {
"test": {
"class": "ParallelTextDataset",
"path": {
"de": "data/test.de",
"en": "data/test.en"
},
"sort_samples": true
}
},
"dataloader": {
"test": {
"class": "InMemoryDataLoader",
"sampler": {
"class": "SequentialSampler",
"max_tokens": 8000
}
}
},
"generator": {
"class": "SequenceGenerator",
"search": {
"class": "GreedySearch",
"maxlen_coef": "(1.2, 10)"
}
},
"mode": "evaluate",
"model": {
"class": "Seq2Seq",
"d_model": 512,
"decoder": {
"activation": "relu",
"class": "TransformerDecoder",
"d_model": 512,
"dim_feedforward": 1024,
"dropout": 0.1,
"n_head": 4,
"num_layers": 6
},
"encoder": {
"activation": "relu",
"class": "TransformerEncoder",
"d_model": 512,
"dim_feedforward": 1024,
"dropout": 0.1,
"n_head": 4,
"num_layers": 6
},
"path": "checkpoints/best.pt",
"share_embedding": "decoder-input-output"
},
"src": "de",
"tgt": "en",
"tokenizer": {
"add_bos": true,
"add_eos": true,
"class": "FastBPE",
"vocab": "resources/vocabulary/vocab"
}
}
2022-06-07,03:32:52 | INFO | mosestokenizer.detokenizer.MosesDetokenizer | executing argv ['perl', '/root/anaconda3/envs/python38/lib/python3.8/site-packages/mosestokenizer/detokenizer.perl', '-q', '-b', '-l', 'de']
2022-06-07,03:32:52 | INFO | mosestokenizer.detokenizer.MosesDetokenizer | spawned process 2867
2022-06-07,03:32:52 | INFO | mosestokenizer.detokenizer.MosesDetokenizer | executing argv ['perl', '/root/anaconda3/envs/python38/lib/python3.8/site-packages/mosestokenizer/detokenizer.perl', '-q', '-b', '-l', 'en']
2022-06-07,03:32:52 | INFO | mosestokenizer.detokenizer.MosesDetokenizer | spawned process 2868
2022-06-07,03:32:52 | INFO | paragen.utils.registry | Creating FastBPE class with configs
{
"add_bos": true,
"add_eos": true,
"vocab": "resources/vocabulary/vocab"
}
2022-06-07,03:32:52 | INFO | paragen.tokenizers.vocabulary | build vocab from frequency file resources/vocabulary/vocab
2022-06-07,03:32:52 | INFO | paragen.utils.registry | Creating ParallelTextDataset class with configs
{
"path": {
"de": "data/test.de",
"en": "data/test.en"
},
"sort_samples": true
}
2022-06-07,03:32:59 | INFO | paragen.datasets.parallel_text_dataset | Totally accept 6750 samples, discard 0 samples
2022-06-07,03:32:59 | INFO | paragen.utils.registry | Creating Seq2Seq class with configs
{
"d_model": 512,
"decoder": {
"activation": "relu",
"class": "TransformerDecoder",
"d_model": 512,
"dim_feedforward": 1024,
"dropout": 0.1,
"n_head": 4,
"num_layers": 6
},
"encoder": {
"activation": "relu",
"class": "TransformerEncoder",
"d_model": 512,
"dim_feedforward": 1024,
"dropout": 0.1,
"n_head": 4,
"num_layers": 6
},
"path": "checkpoints/best.pt",
"share_embedding": "decoder-input-output"
}
2022-06-07,03:33:00 | INFO | paragen.utils.registry | Creating TransformerEncoder class with configs
{
"activation": "relu",
"d_model": 512,
"dim_feedforward": 1024,
"dropout": 0.1,
"n_head": 4,
"num_layers": 6
}
2022-06-07,03:33:00 | INFO | paragen.utils.registry | Creating TransformerDecoder class with configs
{
"activation": "relu",
"d_model": 512,
"dim_feedforward": 1024,
"dropout": 0.1,
"n_head": 4,
"num_layers": 6
}
2022-06-07,03:33:00 | INFO | paragen.models.abstract_model | neural network architecture
[TransformerEncoder(
(_embed): Embedding(10004, 512, padding_idx=3)
(_pos_embed): SinusoidalPositionalEmbedding(
(embedding): __SinusoidalPositionalEmbedding()
)
(_embed_dropout): Dropout(p=0.1, inplace=False)
(_layers): ModuleList(
(0): TransformerEncoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ffn): FFN(
(_fc1): Linear(in_features=512, out_features=1024, bias=True)
(_fc2): Linear(in_features=1024, out_features=512, bias=True)
)
(self_attn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(ffn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
)
(1): TransformerEncoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ffn): FFN(
(_fc1): Linear(in_features=512, out_features=1024, bias=True)
(_fc2): Linear(in_features=1024, out_features=512, bias=True)
)
(self_attn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(ffn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
)
(2): TransformerEncoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ffn): FFN(
(_fc1): Linear(in_features=512, out_features=1024, bias=True)
(_fc2): Linear(in_features=1024, out_features=512, bias=True)
)
(self_attn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(ffn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
)
(3): TransformerEncoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ffn): FFN(
(_fc1): Linear(in_features=512, out_features=1024, bias=True)
(_fc2): Linear(in_features=1024, out_features=512, bias=True)
)
(self_attn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(ffn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
)
(4): TransformerEncoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ffn): FFN(
(_fc1): Linear(in_features=512, out_features=1024, bias=True)
(_fc2): Linear(in_features=1024, out_features=512, bias=True)
)
(self_attn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(ffn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
)
(5): TransformerEncoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ffn): FFN(
(_fc1): Linear(in_features=512, out_features=1024, bias=True)
(_fc2): Linear(in_features=1024, out_features=512, bias=True)
)
(self_attn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(ffn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
)
)
), TransformerDecoder(
(_embed): Embedding(10004, 512, padding_idx=3)
(_pos_embed): SinusoidalPositionalEmbedding(
(embedding): __SinusoidalPositionalEmbedding()
)
(_embed_dropout): Dropout(p=0.1, inplace=False)
(_layers): ModuleList(
(0): TransformerDecoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(multihead_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ffn): FFN(
(_fc1): Linear(in_features=512, out_features=1024, bias=True)
(_fc2): Linear(in_features=1024, out_features=512, bias=True)
)
(self_attn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(multihead_attn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(ffn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
(dropout3): Dropout(p=0.1, inplace=False)
)
(1): TransformerDecoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(multihead_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ffn): FFN(
(_fc1): Linear(in_features=512, out_features=1024, bias=True)
(_fc2): Linear(in_features=1024, out_features=512, bias=True)
)
(self_attn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(multihead_attn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(ffn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
(dropout3): Dropout(p=0.1, inplace=False)
)
(2): TransformerDecoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(multihead_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ffn): FFN(
(_fc1): Linear(in_features=512, out_features=1024, bias=True)
(_fc2): Linear(in_features=1024, out_features=512, bias=True)
)
(self_attn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(multihead_attn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(ffn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
(dropout3): Dropout(p=0.1, inplace=False)
)
(3): TransformerDecoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(multihead_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ffn): FFN(
(_fc1): Linear(in_features=512, out_features=1024, bias=True)
(_fc2): Linear(in_features=1024, out_features=512, bias=True)
)
(self_attn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(multihead_attn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(ffn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
(dropout3): Dropout(p=0.1, inplace=False)
)
(4): TransformerDecoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(multihead_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ffn): FFN(
(_fc1): Linear(in_features=512, out_features=1024, bias=True)
(_fc2): Linear(in_features=1024, out_features=512, bias=True)
)
(self_attn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(multihead_attn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(ffn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
(dropout3): Dropout(p=0.1, inplace=False)
)
(5): TransformerDecoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(multihead_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ffn): FFN(
(_fc1): Linear(in_features=512, out_features=1024, bias=True)
(_fc2): Linear(in_features=1024, out_features=512, bias=True)
)
(self_attn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(multihead_attn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(ffn_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
(dropout3): Dropout(p=0.1, inplace=False)
)
)
(_out_proj): Linear(in_features=512, out_features=10004, bias=False)
)]
2022-06-07,03:33:00 | INFO | paragen.models.abstract_model | parameter size: 41787392
2022-06-07,03:33:00 | INFO | paragen.models.abstract_model | load model from checkpoints/best.pt
2022-06-07,03:33:14 | INFO | paragen.models.abstract_model | keys IN this model but NOT IN loaded model >>>
2022-06-07,03:33:14 | INFO | paragen.models.abstract_model | - None
2022-06-07,03:33:14 | INFO | paragen.models.abstract_model | keys NOT IN this model but IN loaded model >>>
2022-06-07,03:33:14 | INFO | paragen.models.abstract_model | - None
2022-06-07,03:33:14 | INFO | paragen.models.abstract_model | move model to cuda
2022-06-07,03:33:14 | INFO | paragen.utils.registry | Creating SequenceGenerator class with configs
{
"search": {
"class": "GreedySearch",
"maxlen_coef": "(1.2, 10)"
}
}
2022-06-07,03:33:14 | INFO | paragen.utils.registry | Creating GreedySearch class with configs
{
"maxlen_coef": "(1.2, 10)"
}
2022-06-07,03:33:14 | INFO | paragen.generators.abstract_generator | move model to cuda
2022-06-07,03:33:14 | INFO | paragen.utils.registry | Creating SequentialSampler class with configs
{
"max_tokens": 8000
}
2022-06-07,03:33:14 | INFO | paragen.utils.registry | Creating InMemoryDataLoader class with configs
{
"post_collate_fn": null
}
2022-06-07,03:33:14 | INFO | paragen.samplers.abstract_sampler | Deriving total 1 batches
Traceback (most recent call last):
File "/root/anaconda3/envs/python38/bin/paragen-export", line 33, in
sys.exit(load_entry_point('paragen', 'console_scripts', 'paragen-export')())
File "/root/ParaGen/paragen/entries/export.py", line 14, in main
task.export(path, **export_conf)
File "/root/ParaGen/paragen/tasks/translation_task.py", line 263, in export
_fetch_first_sample(),
File "/root/ParaGen/paragen/tasks/translation_task.py", line 259, in _fetch_first_sample
for sample in dataloader:
File "/root/ParaGen/paragen/dataloaders/in_memory_dataloader.py", line 116, in iter
for samples in super().iter():
File "/root/anaconda3/envs/python38/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 530, in next
data = self._next_data()
File "/root/anaconda3/envs/python38/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 570, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "/root/anaconda3/envs/python38/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
return self.collate_fn(data)
File "/root/ParaGen/paragen/tasks/translation_task.py", line 180, in _collate
src = convert_idx_to_tensor(src,
File "/root/ParaGen/paragen/utils/tensor.py", line 49, in convert_idx_to_tensor
idx = create_tensor(idx, tensor_type)
File "/root/ParaGen/paragen/utils/tensor.py", line 134, in create_tensor
T = torch.LongTensor(idx)
ValueError: expected sequence of length 4 at dim 2 (got 5)
2022-06-07,03:33:15 | INFO | mosestokenizer.detokenizer.MosesDetokenizer | killing process 2867
2022-06-07,03:33:15 | INFO | mosestokenizer.detokenizer.MosesDetokenizer | killing process 2868
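If it helps narrow things down: the final ValueError looks like the standard PyTorch error raised when torch.LongTensor is given ragged nested lists, i.e. the batch passed to convert_idx_to_tensor seems to contain inner sequences of different lengths at dim 2. A minimal sketch with made-up data (plain PyTorch, not ParaGen code) reproduces the same message:

```python
import torch

# Hypothetical ragged batch: the sequences at dim 2 have lengths 4 and 5.
idx = [
    [[1, 2, 3, 4]],     # length 4 at dim 2
    [[1, 2, 3, 4, 5]],  # length 5 at dim 2
]

try:
    torch.LongTensor(idx)
except ValueError as e:
    # Prints: expected sequence of length 4 at dim 2 (got 5)
    print(e)
```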