Describe the bug
In interactive mode, any prompt longer than one word causes a crash. When I type just one word, it generates text as expected.
text_generation.yml
```yml
{
  "text-gen-type": "interactive",
  "maximum_tokens": 500,
  "temperature": 0.9,
  "top_p": 0,
  "top_k": 0,
  "recompute": false,
  "num-samples": 10,
  "sample-input-file": "prompt.txt",
  "sample-output-file": "sample_output.txt",
}
```
```
Context prompt >>> Hello from
Traceback (most recent call last):
  File "generate.py", line 89, in <module>
    main()
  File "generate.py", line 72, in main
    generate_samples_interactive(
  File "/gpt-neox/megatron/text_generation_utils.py", line 760, in generate_samples_interactive
    for (
  File "/gpt-neox/megatron/text_generation_utils.py", line 316, in stream_tokens
    logits = forward_model(model, model_inputs, neox_args.is_pipe_parallel)
  File "/gpt-neox/megatron/text_generation_utils.py", line 156, in forward_model
    loss, logits = model.eval_batch(model_inputs, return_logits=True)
  File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/pipe/engine.py", line 394, in eval_batch
    self._exec_schedule(sched)
  File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/pipe/engine.py", line 1308, in _exec_schedule
    self._exec_instr(**cmd.kwargs)
  File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/pipe/engine.py", line 700, in _exec_forward_pass
    self.loss = self.loss_model(outputs, labels)
  File "/gpt-neox/megatron/model/gpt2_model.py", line 67, in cross_entropy
    losses = mpu.vocab_parallel_cross_entropy(output.float().contiguous(), labels)
  File "/gpt-neox/megatron/mpu/cross_entropy.py", line 117, in vocab_parallel_cross_entropy
    return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target)
  File "/gpt-neox/megatron/mpu/cross_entropy.py", line 63, in forward
    predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
IndexError: shape mismatch: indexing tensors could not be broadcast together with shapes [2], [3]
Killing subprocess 7479
Killing subprocess 7480
Killing subprocess 7481
Killing subprocess 7482
Traceback (most recent call last):
  File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.8/dist-packages/deepspeed/launcher/launch.py", line 179, in <module>
    main()
  File "/usr/local/lib/python3.8/dist-packages/deepspeed/launcher/launch.py", line 169, in main
    sigkill_handler(signal.SIGTERM, None)  # not coming back
  File "/usr/local/lib/python3.8/dist-packages/deepspeed/launcher/launch.py", line 147, in sigkill_handler
    raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd)
subprocess.CalledProcessError: Command '['/usr/bin/python', '-u', 'generate.py', '--local_rank=3', '--deepspeed_config', '{"train_batch_size": 128, "train_micro_batch_size_per_gpu": 4, "gradient_accumulation_steps": 32, "optimizer": {"type": "Adam", "params": {"lr": 9.7e-05, "betas": [0.9, 0.95], "eps": 1e-08}}, "fp16": {"fp16": true, "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 12, "hysteresis": 2, "min_loss_scale": 1}, "gradient_clipping": 1.0, "zero_optimization": {"stage": 1, "allgather_partitions": true, "allgather_bucket_size": 1260000000, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 1260000000, "contiguous_gradients": true}, "steps_per_print": 2}', '--megatron_config', '{"train_batch_size": 128, "train_micro_batch_size_per_gpu": 4, "gradient_accumulation_steps": 32, "optimizer": {"type": "Adam", "params": {"lr": 9.7e-05, "betas": [0.9, 0.95], "eps": 1e-08}}, "fp16": {"fp16": true, "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 12, "hysteresis": 2, "min_loss_scale": 1}, "gradient_clipping": 1.0, "zero_optimization": {"stage": 1, "allgather_partitions": true, "allgather_bucket_size": 1260000000, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 1260000000, "contiguous_gradients": true}, "steps_per_print": 2, "precision": "fp16", "num_layers": 44, "hidden_size": 6144, "num_attention_heads": 64, "seq_length": 2048, "max_position_embeddings": 2048, "pos_emb": "rotary", "no_weight_tying": true, "attention_config": ["global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global"], "sparsity_config": {}, "scaled_upper_triang_masked_softmax_fusion": true, "bias_gelu_fusion": true, "rotary_pct": 0.25, "init_method": "small_init", "output_layer_init_method": "wang_init", "gpt_j_residual": true, "gpt_j_tied": true, "output_layer_parallelism": "column", "lr_decay_style": "cosine", "lr_decay_iters": 150000, "min_lr": 9.7e-06, "optimizer_type": "Adam", "zero_stage": 1, "zero_reduce_scatter": true, "zero_contiguous_gradients": true, "zero_reduce_bucket_size": 1260000000, "zero_allgather_bucket_size": 1260000000, "lr": 9.7e-05, "tokenizer_type": "HFTokenizer", "data_path": "./data/pile_20B_tokenizer/pile_20B_tokenizer_text_document", "data_impl": "mmap", "save": "./20B_checkpoints", "config_files": {"20B.yml": "# DISCLAIMER: This is the configuration file for the GPT-NeoX-20B model as it was trained on 96x 40GB A100\n# GPUs. 
Depending on your system configuration, you may need to change some parameters in order to fit\n# the model in memory.\n\n{\n # Tokenizer / checkpoint settings - you will need to change these to the location you have them saved in\n \"vocab-file\": \"./20B_checkpoints/20B_tokenizer.json\",\n \"save\": \"./20B_checkpoints\",\n \"load\": \"./20B_checkpoints\",\n\n # If finetuning, edit the following to the location of your finetuning dataset:\n \"data-path\": \"./data/pile_20B_tokenizer/pile_20B_tokenizer_text_document\",\n\n # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages\n # across the node boundaries )\n \"pipe-parallel-size\": 2,\n \"model-parallel-size\": 2,\n\n # model settings\n \"num-layers\": 44,\n \"hidden-size\": 6144,\n \"num-attention-heads\": 64,\n \"seq-length\": 2048,\n \"max-position-embeddings\": 2048,\n \"norm\": \"layernorm\",\n \"pos-emb\": \"rotary\",\n \"rotary_pct\": 0.25,\n \"no-weight-tying\": true,\n \"gpt_j_residual\": true,\n \"gpt_j_tied\": true,\n \"output_layer_parallelism\": \"column\",\n \"scaled-upper-triang-masked-softmax-fusion\": true,\n \"bias-gelu-fusion\": true,\n\n # init methods\n \"init_method\": \"small_init\",\n \"output_layer_init_method\": \"wang_init\",\n\n # optimizer settings\n \"optimizer\": {\n \"type\": \"Adam\",\n \"params\": {\n \"lr\": 0.97e-4,\n \"betas\": [0.9, 0.95],\n \"eps\": 1.0e-8,\n }\n },\n\n \"min_lr\": 0.97e-5,\n\n # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training\n \"zero_optimization\": {\n \"stage\": 1,\n \"allgather_partitions\": True,\n \"allgather_bucket_size\": 1260000000,\n \"overlap_comm\": True,\n \"reduce_scatter\": True,\n \"reduce_bucket_size\": 1260000000,\n \"contiguous_gradients\": True,\n },\n\n # batch / data settings (assuming 96 GPUs)\n \"train_micro_batch_size_per_gpu\": 4,\n \"gradient_accumulation_steps\": 32,\n \"data-impl\": \"mmap\",\n \"split\": \"995,4,1\",\n\n # activation checkpointing\n \"checkpoint-activations\": true,\n \"checkpoint-num-layers\": 1,\n \"partition-activations\": false,\n \"synchronize-each-layer\": true,\n\n # regularization\n \"gradient_clipping\": 1.0,\n \"weight-decay\": 0.01,\n \"hidden-dropout\": 0,\n \"attention-dropout\": 0,\n\n # precision settings\n \"fp16\": {\n \"fp16\": true,\n \"enabled\": true,\n \"loss_scale\": 0,\n \"loss_scale_window\": 1000,\n \"initial_scale_power\": 12,\n \"hysteresis\": 2,\n \"min_loss_scale\": 1\n },\n\n # misc. training settings\n \"train-iters\": 150000,\n \"lr-decay-iters\": 150000,\n\n \"distributed-backend\": \"nccl\",\n \"lr-decay-style\": \"cosine\",\n \"warmup\": 0.01,\n \"checkpoint-factor\": 500,\n \"eval-interval\": 1000,\n \"eval-iters\": 10,\n\n # logging\n \"log-interval\": 2,\n \"steps_per_print\": 2,\n \"wall_clock_breakdown\": false,\n\n ### NEW DATA: ####\n \"tokenizer_type\": \"HFTokenizer\",\n \"tensorboard-dir\": \"./tensorboard\",\n \"log-dir\": \"./logs\",\n\n}\n", "text_generation_interactive.yml": "# Parameters used for text generation\n# Make sure `load` is specified somewhere else\n{\n # Text gen type: `input-file`, `unconditional` or `interactive`\n \"text-gen-type\": \"interactive\",\n\n # Params for all\n \"maximum_tokens\": 500,\n \"temperature\": 0.9,\n \"top_p\": 0,\n \"top_k\": 0,\n \"recompute\": false,\n\n # `unconditional`
: samples\n \"num-samples\": 10,\n\n # input/output file\n \"sample-input-file\": \"prompt.txt\",\n \"sample-output-file\": \"sample_output.txt\",\n}\n"}, "load": "./20B_checkpoints", "checkpoint_factor": 500, "batch_size": 4, "train_iters": 150000, "eval_iters": 10, "split": "995,4,1", "vocab_file": "./20B_checkpoints/20B_tokenizer.json", "attention_dropout": 0, "hidden_dropout": 0, "checkpoint_activations": true, "synchronize_each_layer": true, "gas": 32, "clip_grad": 1.0, "dynamic_loss_scale": true, "pipe_parallel_size": 2, "model_parallel_size": 2, "is_pipe_parallel": true, "wandb_group": "72fp5jTbC3iYzFUHnE9Fh2_35wkyadj", "log_dir": "./logs", "tensorboard_dir": "./tensorboard", "log_interval": 2, "text_gen_type": "interactive", "temperature": 0.9, "maximum_tokens": 500, "sample_input_file": "prompt.txt", "sample_output_file": "sample_output.txt", "num_samples": 10, "user_script": "generate.py", "save_iters": [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 10500, 11000, 11500, 12000, 12500, 13000, 13500, 14000, 14500, 15000, 15500, 16000, 16500, 17000, 17500, 18000, 18500, 19000, 19500, 20000, 20500, 21000, 21500, 22000, 22500, 23000, 23500, 24000, 24500, 25000, 25500, 26000, 26500, 27000, 27500, 28000, 28500, 29000, 29500, 30000, 30500, 31000, 31500, 32000, 32500, 33000, 33500, 34000, 34500, 35000, 35500, 36000, 36500, 37000, 37500, 38000, 38500, 39000, 39500, 40000, 40500, 41000, 41500, 42000, 42500, 43000, 43500, 44000, 44500, 45000, 45500, 46000, 46500, 47000, 47500, 48000, 48500, 49000, 49500, 50000, 50500, 51000, 51500, 52000, 52500, 53000, 53500, 54000, 54500, 55000, 55500, 56000, 56500, 57000, 57500, 58000, 58500, 59000, 59500, 60000, 60500, 61000, 61500, 62000, 62500, 63000, 63500, 64000, 64500, 65000, 65500, 66000, 66500, 67000, 67500, 68000, 68500, 69000, 69500, 70000, 70500, 71000, 71500, 72000, 72500, 73000, 73500, 74000, 74500, 75000, 75500, 76000, 76500, 77000, 77500, 78000, 78500, 79000, 79500, 80000, 80500, 81000, 81500, 82000, 82500, 83000, 83500, 84000, 84500, 85000, 85500, 86000, 86500, 87000, 87500, 88000, 88500, 89000, 89500, 90000, 90500, 91000, 91500, 92000, 92500, 93000, 93500, 94000, 94500, 95000, 95500, 96000, 96500, 97000, 97500, 98000, 98500, 99000, 99500, 100000, 100500, 101000, 101500, 102000, 102500, 103000, 103500, 104000, 104500, 105000, 105500, 106000, 106500, 107000, 107500, 108000, 108500, 109000, 109500, 110000, 110500, 111000, 111500, 112000, 112500, 113000, 113500, 114000, 114500, 115000, 115500, 116000, 116500, 117000, 117500, 118000, 118500, 119000, 119500, 120000, 120500, 121000, 121500, 122000, 122500, 123000, 123500, 124000, 124500, 125000, 125500, 126000, 126500, 127000, 127500, 128000, 128500, 129000, 129500, 130000, 130500, 131000, 131500, 132000, 132500, 133000, 133500, 134000, 134500, 135000, 135500, 136000, 136500, 137000, 137500, 138000, 138500, 139000, 139500, 140000, 140500, 141000, 141500, 142000, 142500, 143000, 143500, 144000, 144500, 145000, 145500, 146000, 146500, 147000, 147500, 148000, 148500, 149000, 149500], "global_num_gpus": 4}']' returned non-zero exit status 1.
mchorse@473650d01
```
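The crash surfaces in the last frame of the worker traceback: `logits_2d[arange_1d, masked_target_1d]` uses advanced indexing with two 1-D index tensors, and PyTorch requires them to broadcast together. Here `arange_1d` covers 2 positions while `masked_target_1d` carries 3 targets, which suggests an off-by-one between the logits returned by `eval_batch` and the labels built from the prompt. A standalone sketch of the same failure mode (the shapes are taken from the traceback; the tensor contents are made up for illustration):

```python
# Standalone reproduction of the IndexError from
# megatron/mpu/cross_entropy.py: advanced indexing with two 1-D index
# tensors of different lengths. Shapes [2] and [3] mirror the traceback;
# the values are illustrative only.
import torch

vocab_size = 8
logits_2d = torch.randn(2, vocab_size)       # logits for 2 positions
arange_1d = torch.arange(logits_2d.size(0))  # shape [2]
masked_target_1d = torch.tensor([1, 4, 2])   # shape [3]: one label too many

try:
    predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
except IndexError as err:
    # "shape mismatch: indexing tensors could not be broadcast together
    #  with shapes [2], [3]"
    print(err)
```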