I am trying to set up PolyCoder inference on my machine with 2x P100 GPUs, using the Docker command from the README:
nvidia-docker run --rm -it -e NVIDIA_VISIBLE_DEVICES=0,1 --shm-size=1g --ulimit memlock=-1 --mount type=bind,src=$PWD/Downloads/checkpoints/checkpoints-2-7B,dst=/gpt-neox/checkpoints vhellendoorn/code-lms-neox:base
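(Side note: I understand nvidia-docker to be the legacy launcher; on a newer Docker with the NVIDIA Container Toolkit I assume the equivalent invocation would be the one below, though the README command above is what I actually ran:)
docker run --rm -it --gpus '"device=0,1"' --shm-size=1g --ulimit memlock=-1 --mount type=bind,src=$PWD/Downloads/checkpoints/checkpoints-2-7B,dst=/gpt-neox/checkpoints vhellendoorn/code-lms-neox:base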
Then, inside the container, I run:
sudo ./deepy.py generate.py configs/text_generation.yml checkpoints/configs/local_setup.yml checkpoints/configs/2-7B.yml
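As a sanity check (my own diagnostic, not a step from the README), the devices can be inspected from inside the container; two P100s should report compute capability (6, 0), i.e. sm_60, which I suspect is relevant to the fp16 failure further down:
python3 -c "import torch; print(torch.cuda.device_count(), [torch.cuda.get_device_capability(i) for i in range(torch.cuda.device_count())])"
On a 2x P100 machine this should print: 2 [(6, 0), (6, 0)]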
The following is the output (stdout+stderr):
NeoXArgs.from_ymls() ['configs/text_generation.yml', 'checkpoints/configs/local_setup.yml', 'checkpoints/configs/2-7B.yml']
INFO:root:NeoXArgs.calculate_derived() Total number of GPUs determined to be: 2
-------------------- arguments --------------------
attention_config ................ ['global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global']updated
attention_dropout ............... 0...........................updated
batch_size ...................... 8...........................updated
bias_gelu_fusion ................ True........................updated
checkpoint_activations .......... True........................updated
clip_grad ....................... 1.0.........................updated
config_files .................... {'text_generation.yml': '# Parameters used for text generation\n# Make sure `load` is specified somewhere else\n{\n # Text gen type: `input-file`, `unconditional` or `interactive`\n "text-gen-type": "interactive",\n \n # Params for all\n "maximum_tokens": 256,\n "temperature": 0.5,\n "top_p": 0.0,\n "top_k": 0,\n "recompute": false,\n \n # `unconditional`: samples\n "num-samples": 10,\n\n # input/output file\n "sample-input-file": "sample_input.txt",\n "sample-output-file": "sample_output.txt",\n}', 'local_setup.yml': '# Suggested data paths when using GPT-NeoX locally\n{\n "data-path": "data/code/code_text_document",\n \n # or for weighted datasets: \n # "train-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],\n # "test-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],\n # "valid-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],\n # "train-data-weights": [1., 2.],\n # "test-data-weights": [2., 1.],\n # "valid-data-weights": [0.5, 0.4],\n\n # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. \n # WARNING: setting this to True will override any user provided weights\n # "weight_by_num_documents": false,\n # "weighted_sampler_alpha": 0.3,\n\n "vocab-file": "data/code-vocab.json",\n "merge-file": "data/code-merges.txt",\n\n "save": "checkpoints",\n "load": "checkpoints",\n "checkpoint_validation_with_forward_pass": False,\n \n "tensorboard-dir": "tensorboard",\n "log-dir": "logs",\n "use_wandb": True,\n "wandb_host": "https://api.wandb.ai",\n "wandb_project": "neox"\n}', '2-7B.yml': '# GPT-2 pretraining setup\n{\n # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages\n # across the node boundaries )\n "pipe-parallel-size": 1,\n "model-parallel-size": 1,\n\n # model settings\n "num-layers": 32,\n "hidden-size": 2560,\n "num-attention-heads": 32,\n "seq-length": 2048,\n "max-position-embeddings": 2048,\n "norm": "layernorm",\n "pos-emb": "rotary",\n "no-weight-tying": true,\n\n # these should provide some speedup but takes a while to build, set to true if desired\n "scaled-upper-triang-masked-softmax-fusion": true,\n "bias-gelu-fusion": true,\n\n # optimizer settings\n "zero_allow_untested_optimizer": true,\n "optimizer": {\n "type": "adam",\n "params": {\n "lr": 0.00016,\n "betas": [0.9, 0.999],\n "eps": 1.0e-8,\n }\n },\n "zero_optimization": {\n "stage": 1,\n "allgather_partitions": True,\n "allgather_bucket_size": 500000000,\n "overlap_comm": True,\n "reduce_scatter": True,\n "reduce_bucket_size": 500000000,\n "contiguous_gradients": True,\n "cpu_offload": False\n },\n\n # batch / data settings\n "train_micro_batch_size_per_gpu": 8,\n "gradient_accumulation_steps": 4,\n "data-impl": "mmap",\n "split": "989,10,1",\n\n # activation checkpointing\n "checkpoint-activations": true,\n "checkpoint-num-layers": 1,\n "partition-activations": true,\n "synchronize-each-layer": true,\n\n # regularization\n "gradient_clipping": 1.0,\n "weight-decay": 0,\n "hidden-dropout": 0,\n "attention-dropout": 0,\n\n # precision settings\n "fp16": { \n "fp16": true,\n "enabled": true,\n "loss_scale": 0,\n "initial_scale_power": 16,\n "loss_scale_window": 1000,\n "hysteresis": 2,\n "min_loss_scale": 1\n },\n\n # misc. training settings\n "train-iters": 160000,\n "lr-decay-iters": 160000,\n "distributed-backend": "nccl",\n "lr-decay-style": "cosine",\n "warmup": 0.01,\n "save-interval": 1000,\n "eval-interval": 1000,\n "eval-iters": 10,\n\n # logging\n "log-interval": 100,\n "steps_per_print": 10,\n "keep-last-n-checkpoints": 1,\n "wall_clock_breakdown": true,\n}\n'}updated
data_impl ....................... mmap........................updated
data_path ....................... data/code/code_text_documentupdated
dynamic_loss_scale .............. True........................updated
eval_iters ...................... 10..........................updated
fp16 ............................ {'fp16': True, 'enabled': True, 'loss_scale': 0, 'initial_scale_power': 16, 'loss_scale_window': 1000, 'hysteresis': 2, 'min_loss_scale': 1}updated
gas ............................. 4...........................updated
global_num_gpus ................. 2...........................updated
gradient_accumulation_steps ..... 4...........................updated
gradient_clipping ............... 1.0.........................updated
hidden_dropout .................. 0...........................updated
hidden_size ..................... 2560........................updated
is_pipe_parallel ................ True........................updated
keep_last_n_checkpoints ......... 1...........................updated
load ............................ checkpoints.................updated
log_dir ......................... logs........................updated
log_interval .................... 100.........................updated
lr .............................. 0.00016.....................updated
lr_decay_iters .................. 160000......................updated
lr_decay_style .................. cosine......................updated
max_position_embeddings ......... 2048........................updated
maximum_tokens .................. 256.........................updated
merge_file ...................... data/code-merges.txt........updated
no_weight_tying ................. True........................updated
num_attention_heads ............. 32..........................updated
num_layers ...................... 32..........................updated
num_samples ..................... 10..........................updated
optimizer ....................... {'type': 'adam', 'params': {'lr': 0.00016, 'betas': [0.9, 0.999], 'eps': 1e-08}}updated
partition_activations ........... True........................updated
pipe_parallel_size .............. 1...........................updated
pos_emb ......................... rotary......................updated
precision ....................... fp16........................updated
sample_input_file ............... sample_input.txt............updated
sample_output_file .............. sample_output.txt...........updated
save ............................ checkpoints.................updated
save_interval ................... 1000........................updated
scaled_upper_triang_masked_softmax_fusion True...............updated
seq_length ...................... 2048........................updated
sparsity_config ................. {}..........................updated
split ........................... 989,10,1....................updated
synchronize_each_layer .......... True........................updated
temperature ..................... 0.5.........................updated
tensorboard_dir ................. tensorboard.................updated
text_gen_type ................... interactive.................updated
train_batch_size ................ 64..........................updated
train_iters ..................... 160000......................updated
train_micro_batch_size_per_gpu .. 8...........................updated
use_wandb ....................... True........................updated
user_script ..................... generate.py.................updated
vocab_file ...................... data/code-vocab.json........updated
wall_clock_breakdown ............ True........................updated
wandb_group ..................... jtRPtjruy7PQkWHayfg7cH_6sweym4supdated
weight_decay .................... 0...........................updated
zero_allgather_bucket_size ...... 500000000...................updated
zero_allow_untested_optimizer ... True........................updated
zero_contiguous_gradients ....... True........................updated
zero_optimization ............... {'stage': 1, 'allgather_partitions': True, 'allgather_bucket_size': 500000000, 'overlap_comm': True, 'reduce_scatter': True, 'reduce_bucket_size': 500000000, 'contiguous_gradients': True, 'cpu_offload': False}updated
zero_reduce_bucket_size ......... 500000000...................updated
zero_reduce_scatter ............. True........................updated
zero_stage ...................... 1...........................updated
activation ...................... gelu........................default
adlr_autoresume ................. False.......................default
adlr_autoresume_interval ........ 1000........................default
amp ............................. None........................default
apply_query_key_layer_scaling ... False.......................default
attention_softmax_in_fp32 ....... False.......................default
bias_dropout_fusion ............. False.......................default
char_level_ppl .................. False.......................default
checkpoint_in_cpu ............... False.......................default
checkpoint_num_layers ........... 1...........................default
checkpoint_validation_with_forward_pass False................default
contiguous_checkpointing ........ False.......................default
deepscale ....................... False.......................default
deepscale_config ................ None........................default
deepspeed ....................... True........................default
deepspeed_activation_checkpointing True......................default
deepspeed_mpi ................... False.......................default
detect_nvlink_pairs ............. False.......................default
distributed_backend ............. nccl........................default
do_test ......................... None........................default
do_train ........................ None........................default
do_valid ........................ None........................default
dump_state ...................... False.......................default
eod_mask_loss ................... False.......................default
eval_interval ................... 1000........................default
eval_results_prefix ............. ............................default
eval_tasks ...................... None........................default
exclude ......................... None........................default
exit_interval ................... None........................default
finetune ........................ False.......................default
flops_profiler .................. None........................default
fp16_lm_cross_entropy ........... False.......................default
fp32_allreduce .................. False.......................default
git_hash ........................ 98683ae.....................default
gmlp_attn_dim ................... 64..........................default
gpt_j_residual .................. False.......................default
gradient_noise_scale_cpu_offload False.......................default
gradient_noise_scale_n_batches .. 5...........................default
gradient_predivide_factor ....... 1.0.........................default
hostfile ........................ None........................default
hysteresis ...................... 2...........................default
include ......................... None........................default
init_method ..................... normal......................default
init_method_std ................. 0.02........................default
iteration ....................... None........................default
launcher ........................ pdsh........................default
layernorm_epsilon ............... 1e-05.......................default
lazy_mpu_init ................... False.......................default
local_rank ...................... None........................default
log_grad_norm ................... False.......................default
log_gradient_noise_scale ........ False.......................default
log_optimizer_states ............ False.......................default
log_param_norm .................. False.......................default
loss_scale ...................... None........................default
loss_scale_window ............... 1000.0......................default
make_vocab_size_divisible_by .... 128.........................default
master_addr ..................... None........................default
master_port ..................... 29500.......................default
min_lr .......................... 0.0.........................default
min_scale ....................... 1.0.........................default
mmap_warmup ..................... False.......................default
model_parallel_size ............. 1...........................default
no_load_optim ................... False.......................default
no_load_rng ..................... False.......................default
no_save_optim ................... False.......................default
no_save_rng ..................... False.......................default
norm ............................ layernorm...................default
num_gpus ........................ None........................default
num_nodes ....................... -1..........................default
num_unique_layers ............... None........................default
num_workers ..................... 2...........................default
onnx_safe ....................... False.......................default
optimizer_type .................. adam........................default
output_layer_init_method ........ scaled_normal...............default
output_layer_parallelism ........ row.........................default
override_lr_scheduler ........... False.......................default
padded_vocab_size ............... None........................default
param_sharing_style ............. grouped.....................default
pipe_partition_method ........... type:transformer|mlp........default
prescale_gradients .............. False.......................default
profile_backward ................ False.......................default
rank ............................ None........................default
recompute ....................... False.......................default
rms_norm_epsilon ................ 1e-08.......................default
rotary_emb_base ................. 10000.......................default
rotary_pct ...................... 1.0.........................default
rpe_max_distance ................ 128.........................default
rpe_num_buckets ................. 32..........................default
scaled_masked_softmax_fusion .... False.......................default
scalenorm_epsilon ............... 1e-08.......................default
scheduler ....................... None........................default
seed ............................ 1234........................default
short_seq_prob .................. 0.1.........................default
soft_prompt_tuning .............. None........................default
sparse_gradients ................ False.......................default
steps_per_print ................. 10..........................default
test_data_paths ................. None........................default
test_data_weights ............... None........................default
tokenizer_type .................. GPT2BPETokenizer............default
top_k ........................... 0...........................default
top_p ........................... 0.0.........................default
train_data_paths ................ None........................default
train_data_weights .............. None........................default
use_bnb_optimizer ............... False.......................default
use_checkpoint_lr_scheduler ..... False.......................default
use_cpu_initialization .......... False.......................default
valid_data_paths ................ None........................default
valid_data_weights .............. None........................default
wandb_host ...................... https://api.wandb.ai........default
wandb_project ................... neox........................default
wandb_team ...................... None........................default
warmup .......................... 0.01........................default
weight_by_num_documents ......... False.......................default
weighted_sampler_alpha .......... 0.3.........................default
world_size ...................... None........................default
---------------- end of arguments ----------------
[2022-07-21 05:12:58,859] [WARNING] [runner.py:126:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
[2022-07-21 05:12:58,860] [INFO] [runner.py:366:main] cmd = /usr/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMV19 --master_addr=127.0.0.1 --master_port=29500 generate.py --deepspeed_config {"train_batch_size": 64, "train_micro_batch_size_per_gpu": 8, "gradient_accumulation_steps": 4, "optimizer": {"type": "adam", "params": {"lr": 0.00016, "betas": [0.9, 0.999], "eps": 1e-08}}, "fp16": {"fp16": true, "enabled": true, "loss_scale": 0, "initial_scale_power": 16, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1}, "gradient_clipping": 1.0, "zero_optimization": {"stage": 1, "allgather_partitions": true, "allgather_bucket_size": 500000000, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 500000000, "contiguous_gradients": true, "cpu_offload": false}, "wall_clock_breakdown": true, "zero_allow_untested_optimizer": true} --megatron_config {"train_batch_size": 64, "train_micro_batch_size_per_gpu": 8, "gradient_accumulation_steps": 4, "optimizer": {"type": "adam", "params": {"lr": 0.00016, "betas": [0.9, 0.999], "eps": 1e-08}}, "fp16": {"fp16": true, "enabled": true, "loss_scale": 0, "initial_scale_power": 16, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1}, "gradient_clipping": 1.0, "zero_optimization": {"stage": 1, "allgather_partitions": true, "allgather_bucket_size": 500000000, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 500000000, "contiguous_gradients": true, "cpu_offload": false}, "wall_clock_breakdown": true, "zero_allow_untested_optimizer": true, "precision": "fp16", "num_layers": 32, "hidden_size": 2560, "num_attention_heads": 32, "seq_length": 2048, "max_position_embeddings": 2048, "pos_emb": "rotary", "no_weight_tying": true, "attention_config": ["global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global"], "sparsity_config": {}, "scaled_upper_triang_masked_softmax_fusion": true, "bias_gelu_fusion": true, "lr_decay_style": "cosine", "lr_decay_iters": 160000, "zero_stage": 1, "zero_reduce_scatter": true, "zero_contiguous_gradients": true, "zero_reduce_bucket_size": 500000000, "zero_allgather_bucket_size": 500000000, "lr": 0.00016, "data_path": "data/code/code_text_document", "data_impl": "mmap", "save": "checkpoints", "config_files": {"text_generation.yml": "# Parameters used for text generation\n# Make sure `load` is specified somewhere else\n{\n # Text gen type: `input-file`, `unconditional` or `interactive`\n \"text-gen-type\": \"interactive\",\n \n # Params for all\n \"maximum_tokens\": 256,\n \"temperature\": 0.5,\n \"top_p\": 0.0,\n \"top_k\": 0,\n \"recompute\": false,\n \n # `unconditional`: samples\n \"num-samples\": 10,\n\n # input/output file\n \"sample-input-file\": \"sample_input.txt\",\n \"sample-output-file\": \"sample_output.txt\",\n}", "local_setup.yml": "# Suggested data paths when using GPT-NeoX locally\n{\n \"data-path\": \"data/code/code_text_document\",\n \n # or for weighted datasets: \n # \"train-data-paths\": [\"data/enron/enron_text_document\", \"data/enron/enron_text_document\"],\n # \"test-data-paths\": [\"data/enron/enron_text_document\", \"data/enron/enron_text_document\"],\n # \"valid-data-paths\": [\"data/enron/enron_text_document\", \"data/enron/enron_text_document\"],\n # \"train-data-weights\": [1., 2.],\n # \"test-data-weights\": [2., 1.],\n # \"valid-data-weights\": [0.5, 0.4],\n\n # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. \n # WARNING: setting this to True will override any user provided weights\n # \"weight_by_num_documents\": false,\n # \"weighted_sampler_alpha\": 0.3,\n\n \"vocab-file\": \"data/code-vocab.json\",\n \"merge-file\": \"data/code-merges.txt\",\n\n \"save\": \"checkpoints\",\n \"load\": \"checkpoints\",\n \"checkpoint_validation_with_forward_pass\": False,\n \n \"tensorboard-dir\": \"tensorboard\",\n \"log-dir\": \"logs\",\n \"use_wandb\": True,\n \"wandb_host\": \"https://api.wandb.ai\",\n \"wandb_project\": \"neox\"\n}", "2-7B.yml": "# GPT-2 pretraining setup\n{\n # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages\n # across the node boundaries )\n \"pipe-parallel-size\": 1,\n \"model-parallel-size\": 1,\n\n # model settings\n \"num-layers\": 32,\n \"hidden-size\": 2560,\n \"num-attention-heads\": 32,\n \"seq-length\": 2048,\n \"max-position-embeddings\": 2048,\n \"norm\": \"layernorm\",\n \"pos-emb\": \"rotary\",\n \"no-weight-tying\": true,\n\n # these should provide some speedup but takes a while to build, set to true if desired\n \"scaled-upper-triang-masked-softmax-fusion\": true,\n \"bias-gelu-fusion\": true,\n\n # optimizer settings\n \"zero_allow_untested_optimizer\": true,\n \"optimizer\": {\n \"type\": \"adam\",\n \"params\": {\n \"lr\": 0.00016,\n \"betas\": [0.9, 0.999],\n \"eps\": 1.0e-8,\n }\n },\n \"zero_optimization\": {\n \"stage\": 1,\n \"allgather_partitions\": True,\n \"allgather_bucket_size\": 500000000,\n \"overlap_comm\": True,\n \"reduce_scatter\": True,\n \"reduce_bucket_size\": 500000000,\n \"contiguous_gradients\": True,\n \"cpu_offload\": False\n },\n\n # batch / data settings\n \"train_micro_batch_size_per_gpu\": 8,\n \"gradient_accumulation_steps\": 4,\n \"data-impl\": \"mmap\",\n \"split\": \"989,10,1\",\n\n # activation checkpointing\n \"checkpoint-activations\": true,\n \"checkpoint-num-layers\": 1,\n \"partition-activations\": true,\n \"synchronize-each-layer\": true,\n\n # regularization\n \"gradient_clipping\": 1.0,\n \"weight-decay\": 0,\n \"hidden-dropout\": 0,\n \"attention-dropout\": 0,\n\n # precision settings\n \"fp16\": { \n \"fp16\": true,\n \"enabled\": true,\n \"loss_scale\": 0,\n \"initial_scale_power\": 16,\n \"loss_scale_window\": 1000,\n \"hysteresis\": 2,\n \"min_loss_scale\": 1\n },\n\n # misc. training settings\n \"train-iters\": 160000,\n \"lr-decay-iters\": 160000,\n \"distributed-backend\": \"nccl\",\n \"lr-decay-style\": \"cosine\",\n \"warmup\": 0.01,\n \"save-interval\": 1000,\n \"eval-interval\": 1000,\n \"eval-iters\": 10,\n\n # logging\n \"log-interval\": 100,\n \"steps_per_print\": 10,\n \"keep-last-n-checkpoints\": 1,\n \"wall_clock_breakdown\": true,\n}\n"}, "load": "checkpoints", "save_interval": 1000, "batch_size": 8, "train_iters": 160000, "eval_iters": 10, "keep_last_n_checkpoints": 1, "split": "989,10,1", "vocab_file": "data/code-vocab.json", "merge_file": "data/code-merges.txt", "attention_dropout": 0, "hidden_dropout": 0, "weight_decay": 0, "checkpoint_activations": true, "synchronize_each_layer": true, "partition_activations": true, "gas": 4, "clip_grad": 1.0, "dynamic_loss_scale": true, "pipe_parallel_size": 1, "is_pipe_parallel": true, "use_wandb": true, "wandb_group": "jtRPtjruy7PQkWHayfg7cH_6sweym4s", "log_dir": "logs", "tensorboard_dir": "tensorboard", "log_interval": 100, "text_gen_type": "interactive", "temperature": 0.5, "maximum_tokens": 256, "sample_input_file": "sample_input.txt", "sample_output_file": "sample_output.txt", "num_samples": 10, "user_script": "generate.py", "global_num_gpus": 2}
[2022-07-21 05:12:59,743] [INFO] [launch.py:82:main] WORLD INFO DICT: {'localhost': [0, 1]}
[2022-07-21 05:12:59,743] [INFO] [launch.py:88:main] nnodes=1, num_local_procs=2, node_rank=0
[2022-07-21 05:12:59,743] [INFO] [launch.py:103:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1]})
[2022-07-21 05:12:59,743] [INFO] [launch.py:104:main] dist_world_size=2
[2022-07-21 05:12:59,743] [INFO] [launch.py:112:main] Setting CUDA_VISIBLE_DEVICES=0,1
NeoXArgs.configure_distributed_args() using world size: 2 and model-parallel size: 1
> building GPT2BPETokenizer tokenizer ...
> padded vocab (size: 50257) with 47 dummy tokens (new size: 50304)
> initializing torch distributed ...
[2022-07-21 05:13:02,390] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl
[2022-07-21 05:13:02,482] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl
> initializing model parallel with size 1
MPU DP: [0, 1]
MPU PP: [0]
MPU PP: [1]
MPU MP: [0]
MPU MP: [1]
> setting random seeds to 1234 ...
[2022-07-21 05:13:02,518] [INFO] [checkpointing.py:223:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234
make: Entering directory '/gpt-neox/megatron/data'
make: Nothing to be done for 'default'.
make: Leaving directory '/gpt-neox/megatron/data'
building GPT2 model ...
SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None
Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1}
[2022-07-21 05:13:02,651] [INFO] [module.py:363:_partition_layers] Partitioning pipeline stages with method type:transformer|mlp
stage=0 layers=37
0: EmbeddingPipe
1: _pre_transformer_block
2: ParallelTransformerLayerPipe
3: ParallelTransformerLayerPipe
4: ParallelTransformerLayerPipe
5: ParallelTransformerLayerPipe
6: ParallelTransformerLayerPipe
7: ParallelTransformerLayerPipe
8: ParallelTransformerLayerPipe
9: ParallelTransformerLayerPipe
10: ParallelTransformerLayerPipe
11: ParallelTransformerLayerPipe
12: ParallelTransformerLayerPipe
13: ParallelTransformerLayerPipe
14: ParallelTransformerLayerPipe
15: ParallelTransformerLayerPipe
16: ParallelTransformerLayerPipe
17: ParallelTransformerLayerPipe
18: ParallelTransformerLayerPipe
19: ParallelTransformerLayerPipe
20: ParallelTransformerLayerPipe
21: ParallelTransformerLayerPipe
22: ParallelTransformerLayerPipe
23: ParallelTransformerLayerPipe
24: ParallelTransformerLayerPipe
25: ParallelTransformerLayerPipe
26: ParallelTransformerLayerPipe
27: ParallelTransformerLayerPipe
28: ParallelTransformerLayerPipe
29: ParallelTransformerLayerPipe
30: ParallelTransformerLayerPipe
31: ParallelTransformerLayerPipe
32: ParallelTransformerLayerPipe
33: ParallelTransformerLayerPipe
34: _post_transformer_block
35: NormPipe
36: ParallelLinearPipe
DeepSpeed is enabled.
[2022-07-21 05:13:05,069] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.3.15+eb7f5cf, git-hash=eb7f5cf, git-branch=main
[2022-07-21 05:13:05,070] [WARNING] [config.py:77:_sanity_check] DeepSpeedConfig: cpu_offload is deprecated. Please use offload_optimizer.
[2022-07-21 05:13:05,102] [WARNING] [config.py:77:_sanity_check] DeepSpeedConfig: cpu_offload is deprecated. Please use offload_optimizer.
[2022-07-21 05:13:05,172] [INFO] [config.py:759:print] DeepSpeedEngine configuration:
[2022-07-21 05:13:05,173] [INFO] [config.py:763:print] activation_checkpointing_config {
"partition_activations": false,
"contiguous_memory_optimization": false,
"cpu_checkpointing": false,
"number_checkpoints": null,
"synchronize_checkpoint_boundary": false,
"profile": false
}
[2022-07-21 05:13:05,173] [INFO] [config.py:763:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
[2022-07-21 05:13:05,173] [INFO] [config.py:763:print] allreduce_always_fp32 ........ False
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] amp_enabled .................. False
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] amp_params ................... False
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] checkpoint_tag_validation_enabled True
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] checkpoint_tag_validation_fail False
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] disable_allgather ............ False
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] dump_state ................... False
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 1000, 'delayed_shift': 2, 'min_scale': 1}
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] elasticity_enabled ........... False
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] flops_profiler_config ........ {
"enabled": false,
"profile_step": 1,
"module_depth": -1,
"top_modules": 3,
"detailed": true
}
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] fp16_enabled ................. True
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] fp16_type .................... fp16
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] global_rank .................. 0
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] gradient_accumulation_steps .. 4
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] gradient_clipping ............ 1.0
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] gradient_predivide_factor .... 1.0
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] initial_dynamic_scale ........ 65536
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] loss_scale ................... 0
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] memory_breakdown ............. False
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] optimizer_legacy_fusion ...... False
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] optimizer_name ............... adam
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] optimizer_params ............. {'lr': 0.00016, 'betas': [0.9, 0.999], 'eps': 1e-08}
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] pld_enabled .................. False
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] pld_params ................... False
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] precision .................... torch.float16
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] prescale_gradients ........... False
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] scheduler_name ............... None
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] scheduler_params ............. None
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] sparse_attention ............. None
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] sparse_gradients_enabled ..... False
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] steps_per_print .............. 10
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] tensorboard_enabled .......... False
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] tensorboard_job_name ......... DeepSpeedJobName
[2022-07-21 05:13:05,174] [INFO] [config.py:763:print] tensorboard_output_path ......
[2022-07-21 05:13:05,175] [INFO] [config.py:763:print] train_batch_size ............. 64
[2022-07-21 05:13:05,175] [INFO] [config.py:763:print] train_micro_batch_size_per_gpu 8
[2022-07-21 05:13:05,175] [INFO] [config.py:763:print] wall_clock_breakdown ......... True
[2022-07-21 05:13:05,175] [INFO] [config.py:763:print] world_size ................... 2
[2022-07-21 05:13:05,175] [INFO] [config.py:763:print] zero_allow_untested_optimizer True
[2022-07-21 05:13:05,175] [INFO] [config.py:763:print] zero_config .................. {
"stage": 0,
"contiguous_gradients": false,
"reduce_scatter": true,
"reduce_bucket_size": 5.000000e+08,
"allgather_partitions": true,
"allgather_bucket_size": 5.000000e+08,
"overlap_comm": false,
"load_from_fp32_weights": true,
"elastic_checkpoint": true,
"offload_param": null,
"offload_optimizer": null,
"sub_group_size": 1.000000e+12,
"prefetch_bucket_size": 5.000000e+07,
"param_persistence_threshold": 1.000000e+05,
"max_live_parameters": 1.000000e+09,
"max_reuse_distance": 1.000000e+09,
"gather_fp16_weights_on_model_save": false
}
[2022-07-21 05:13:05,175] [INFO] [config.py:763:print] zero_enabled ................. False
[2022-07-21 05:13:05,175] [INFO] [config.py:763:print] zero_optimization_stage ...... 0
[2022-07-21 05:13:05,175] [INFO] [config.py:765:print] json = {
"train_batch_size": 64,
"train_micro_batch_size_per_gpu": 8,
"gradient_accumulation_steps": 4,
"optimizer": {
"type": "adam",
"params": {
"lr": 0.00016,
"betas": [0.9, 0.999],
"eps": 1e-08
}
},
"fp16": {
"fp16": true,
"enabled": true,
"loss_scale": 0,
"initial_scale_power": 16,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": 0,
"allgather_partitions": true,
"reduce_scatter": true,
"allgather_bucket_size": 5.000000e+08,
"overlap_comm": false,
"reduce_bucket_size": 5.000000e+08,
"contiguous_gradients": false,
"cpu_offload": false
},
"wall_clock_breakdown": true,
"zero_allow_untested_optimizer": true
}
Using /root/.cache/torch_extensions as PyTorch extensions root...
Using /root/.cache/torch_extensions as PyTorch extensions root...
Emitting ninja build file /root/.cache/torch_extensions/utils/build.ninja...
Building extension module utils...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module utils...
Time to load utils op: 0.37787580490112305 seconds
[2022-07-21 05:13:05,558] [INFO] [engine.py:84:__init__] CONFIG: micro_batches=4 micro_batch_size=8
Loading extension module utils...
Time to load utils op: 0.40610766410827637 seconds
[2022-07-21 05:13:05,679] [INFO] [engine.py:141:__init__] RANK=0 STAGE=0 LAYERS=37 [0, 37) STAGE_PARAMS=2775208960 (2775.209M) TOTAL_PARAMS=2775208960 (2775.209M) UNIQUE_PARAMS=2775208960 (2775.209M)
> number of parameters on model parallel rank 0: 2775208960
> total params: 2,775,208,960
[2022-07-21 05:13:05,702] [INFO] [engine.py:1551:_load_checkpoint] rank: 0 loading checkpoint: checkpoints/global_step150000/mp_rank_00_model_states.pt
[2022-07-21 05:13:05,702] [INFO] [engine.py:1551:_load_checkpoint] rank: 1 loading checkpoint: checkpoints/global_step150000/mp_rank_00_model_states.pt
[2022-07-21 05:13:05,901] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=0 file=checkpoints/global_step150000/layer_00-model_00-model_states.pt
[2022-07-21 05:13:06,022] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=2 file=checkpoints/global_step150000/layer_02-model_00-model_states.pt
[2022-07-21 05:13:06,138] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=3 file=checkpoints/global_step150000/layer_03-model_00-model_states.pt
[2022-07-21 05:13:06,254] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=4 file=checkpoints/global_step150000/layer_04-model_00-model_states.pt
[2022-07-21 05:13:06,370] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=5 file=checkpoints/global_step150000/layer_05-model_00-model_states.pt
[2022-07-21 05:13:06,481] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=6 file=checkpoints/global_step150000/layer_06-model_00-model_states.pt
[2022-07-21 05:13:06,592] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=7 file=checkpoints/global_step150000/layer_07-model_00-model_states.pt
[2022-07-21 05:13:06,730] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=8 file=checkpoints/global_step150000/layer_08-model_00-model_states.pt
[2022-07-21 05:13:06,854] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=9 file=checkpoints/global_step150000/layer_09-model_00-model_states.pt
[2022-07-21 05:13:06,968] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=10 file=checkpoints/global_step150000/layer_10-model_00-model_states.pt
[2022-07-21 05:13:07,083] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=11 file=checkpoints/global_step150000/layer_11-model_00-model_states.pt
[2022-07-21 05:13:07,199] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=12 file=checkpoints/global_step150000/layer_12-model_00-model_states.pt
[2022-07-21 05:13:07,313] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=13 file=checkpoints/global_step150000/layer_13-model_00-model_states.pt
[2022-07-21 05:13:07,433] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=14 file=checkpoints/global_step150000/layer_14-model_00-model_states.pt
[2022-07-21 05:13:07,550] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=15 file=checkpoints/global_step150000/layer_15-model_00-model_states.pt
[2022-07-21 05:13:07,667] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=16 file=checkpoints/global_step150000/layer_16-model_00-model_states.pt
[2022-07-21 05:13:07,782] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=17 file=checkpoints/global_step150000/layer_17-model_00-model_states.pt
[2022-07-21 05:13:07,899] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=18 file=checkpoints/global_step150000/layer_18-model_00-model_states.pt
[2022-07-21 05:13:08,007] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=19 file=checkpoints/global_step150000/layer_19-model_00-model_states.pt
[2022-07-21 05:13:08,142] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=20 file=checkpoints/global_step150000/layer_20-model_00-model_states.pt
[2022-07-21 05:13:08,251] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=21 file=checkpoints/global_step150000/layer_21-model_00-model_states.pt
[2022-07-21 05:13:08,358] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=22 file=checkpoints/global_step150000/layer_22-model_00-model_states.pt
[2022-07-21 05:13:08,466] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=23 file=checkpoints/global_step150000/layer_23-model_00-model_states.pt
[2022-07-21 05:13:08,574] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=24 file=checkpoints/global_step150000/layer_24-model_00-model_states.pt
[2022-07-21 05:13:08,681] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=25 file=checkpoints/global_step150000/layer_25-model_00-model_states.pt
[2022-07-21 05:13:08,786] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=26 file=checkpoints/global_step150000/layer_26-model_00-model_states.pt
[2022-07-21 05:13:08,894] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=27 file=checkpoints/global_step150000/layer_27-model_00-model_states.pt
[2022-07-21 05:13:09,003] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=28 file=checkpoints/global_step150000/layer_28-model_00-model_states.pt
[2022-07-21 05:13:09,114] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=29 file=checkpoints/global_step150000/layer_29-model_00-model_states.pt
[2022-07-21 05:13:09,222] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=30 file=checkpoints/global_step150000/layer_30-model_00-model_states.pt
[2022-07-21 05:13:09,332] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=31 file=checkpoints/global_step150000/layer_31-model_00-model_states.pt
[2022-07-21 05:13:09,438] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=32 file=checkpoints/global_step150000/layer_32-model_00-model_states.pt
[2022-07-21 05:13:09,544] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=33 file=checkpoints/global_step150000/layer_33-model_00-model_states.pt
[2022-07-21 05:13:09,544] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=35 file=checkpoints/global_step150000/layer_35-model_00-model_states.pt
[2022-07-21 05:13:09,752] [INFO] [module.py:576:load_state_dir] RANK=0 Loaded layer=36 file=checkpoints/global_step150000/layer_36-model_00-model_states.pt
> validated currently set args with arguments in the checkpoint ...
successfully loaded checkpoints/global_step150000/mp_rank_00_model_states.pt
Loading checkpoint and starting from iteration 150000
Finished loading model
Context prompt >>> def return1():\n """Returns 1."""\n
Traceback (most recent call last):
File "generate.py", line 74, in <module>
main()
File "generate.py", line 59, in main
generate_samples_interactive(
File "/gpt-neox/megatron/text_generation_utils.py", line 751, in generate_samples_interactive
for (
File "/gpt-neox/megatron/text_generation_utils.py", line 317, in stream_tokens
logits, layer_past = forward_model(neox_args, model, model_inputs)
File "/gpt-neox/megatron/text_generation_utils.py", line 137, in forward_model
return model.module(model_inputs)
File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/pipe/module.py", line 335, in forward
x = func(forward_input)
File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/pipe/module.py", line 328, in exec_func
inputs = layer(inputs)
File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/gpt-neox/megatron/model/transformer.py", line 686, in forward
outputs = super().forward(hidden_states, attention_mask, layer_past=past)
File "/gpt-neox/megatron/model/transformer.py", line 639, in forward
attention_output, attention_bias = self.attention(
File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/gpt-neox/megatron/model/transformer.py", line 516, in forward
output, bias = self.dense(context_layer)
File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/gpt-neox/megatron/mpu/layers.py", line 446, in forward
output_parallel = F.linear(input_parallel, self.weight)
File "/usr/local/lib/python3.8/dist-packages/torch/nn/functional.py", line 1753, in linear
return torch._C._nn.linear(input, weight, bias)
RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasGemmEx( handle, opa, opb, m, n, k, &falpha, a, CUDA_R_16F, lda, b, CUDA_R_16F, ldb, &fbeta, c, CUDA_R_16F, ldc, CUDA_R_32F, CUBLAS_GEMM_DFALT_TENSOR_OP)`
Traceback (most recent call last):
File "generate.py", line 74, in <module>
main()
File "generate.py", line 59, in main
generate_samples_interactive(
File "/gpt-neox/megatron/text_generation_utils.py", line 751, in generate_samples_interactive
for (
File "/gpt-neox/megatron/text_generation_utils.py", line 317, in stream_tokens
logits, layer_past = forward_model(neox_args, model, model_inputs)
File "/gpt-neox/megatron/text_generation_utils.py", line 137, in forward_model
return model.module(model_inputs)
File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/pipe/module.py", line 335, in forward
x = func(forward_input)
File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/pipe/module.py", line 328, in exec_func
inputs = layer(inputs)
File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/gpt-neox/megatron/model/transformer.py", line 686, in forward
outputs = super().forward(hidden_states, attention_mask, layer_past=past)
File "/gpt-neox/megatron/model/transformer.py", line 639, in forward
attention_output, attention_bias = self.attention(
File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/gpt-neox/megatron/model/transformer.py", line 516, in forward
output, bias = self.dense(context_layer)
File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/gpt-neox/megatron/mpu/layers.py", line 446, in forward
output_parallel = F.linear(input_parallel, self.weight)
File "/usr/local/lib/python3.8/dist-packages/torch/nn/functional.py", line 1753, in linear
return torch._C._nn.linear(input, weight, bias)
RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasGemmEx( handle, opa, opb, m, n, k, &falpha, a, CUDA_R_16F, lda, b, CUDA_R_16F, ldb, &fbeta, c, CUDA_R_16F, ldc, CUDA_R_32F, CUBLAS_GEMM_DFALT_TENSOR_OP)`
Killing subprocess 1054
Killing subprocess 1055
Traceback (most recent call last):
File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.8/dist-packages/deepspeed/launcher/launch.py", line 179, in <module>
main()
File "/usr/local/lib/python3.8/dist-packages/deepspeed/launcher/launch.py", line 169, in main
sigkill_handler(signal.SIGTERM, None) # not coming back
File "/usr/local/lib/python3.8/dist-packages/deepspeed/launcher/launch.py", line 147, in sigkill_handler
raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd)
subprocess.CalledProcessError: Command '['/usr/bin/python', '-u', 'generate.py', '--local_rank=1', '--deepspeed_config', '{"train_batch_size": 64, "train_micro_batch_size_per_gpu": 8, "gradient_accumulation_steps": 4, "optimizer": {"type": "adam", "params": {"lr": 0.00016, "betas": [0.9, 0.999], "eps": 1e-08}}, "fp16": {"fp16": true, "enabled": true, "loss_scale": 0, "initial_scale_power": 16, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1}, "gradient_clipping": 1.0, "zero_optimization": {"stage": 1, "allgather_partitions": true, "allgather_bucket_size": 500000000, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 500000000, "contiguous_gradients": true, "cpu_offload": false}, "wall_clock_breakdown": true, "zero_allow_untested_optimizer": true}', '--megatron_config', '{"train_batch_size": 64, "train_micro_batch_size_per_gpu": 8, "gradient_accumulation_steps": 4, "optimizer": {"type": "adam", "params": {"lr": 0.00016, "betas": [0.9, 0.999], "eps": 1e-08}}, "fp16": {"fp16": true, "enabled": true, "loss_scale": 0, "initial_scale_power": 16, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1}, "gradient_clipping": 1.0, "zero_optimization": {"stage": 1, "allgather_partitions": true, "allgather_bucket_size": 500000000, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 500000000, "contiguous_gradients": true, "cpu_offload": false}, "wall_clock_breakdown": true, "zero_allow_untested_optimizer": true, "precision": "fp16", "num_layers": 32, "hidden_size": 2560, "num_attention_heads": 32, "seq_length": 2048, "max_position_embeddings": 2048, "pos_emb": "rotary", "no_weight_tying": true, "attention_config": ["global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global"], "sparsity_config": {}, "scaled_upper_triang_masked_softmax_fusion": true, "bias_gelu_fusion": true, "lr_decay_style": "cosine", "lr_decay_iters": 160000, "zero_stage": 1, "zero_reduce_scatter": true, "zero_contiguous_gradients": true, "zero_reduce_bucket_size": 500000000, "zero_allgather_bucket_size": 500000000, "lr": 0.00016, "data_path": "data/code/code_text_document", "data_impl": "mmap", "save": "checkpoints", "config_files": {"text_generation.yml": "# Parameters used for text generation\\n# Make sure `load` is specified somewhere else\\n{\\n # Text gen type: `input-file`, `unconditional` or `interactive`\\n \\"text-gen-type\\": \\"interactive\\",\\n \\n # Params for all\\n \\"maximum_tokens\\": 256,\\n \\"temperature\\": 0.5,\\n \\"top_p\\": 0.0,\\n \\"top_k\\": 0,\\n \\"recompute\\": false,\\n \\n # `unconditional`: samples\\n \\"num-samples\\": 10,\\n\\n # input/output file\\n \\"sample-input-file\\": \\"sample_input.txt\\",\\n \\"sample-output-file\\": \\"sample_output.txt\\",\\n}", "local_setup.yml": "# Suggested data paths when using GPT-NeoX locally\\n{\\n \\"data-path\\": \\"data/code/code_text_document\\",\\n \\n # or for weighted datasets: \\n # \\"train-data-paths\\": [\\"data/enron/enron_text_document\\", \\"data/enron/enron_text_document\\"],\\n # \\"test-data-paths\\": [\\"data/enron/enron_text_document\\", \\"data/enron/enron_text_document\\"],\\n # \\"valid-data-paths\\": [\\"data/enron/enron_text_document\\", \\"data/enron/enron_text_document\\"],\\n # \\"train-data-weights\\": [1., 2.],\\n # \\"test-data-weights\\": [2., 1.],\\n # \\"valid-data-weights\\": [0.5, 0.4],\\n\\n # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. \\n # WARNING: setting this to True will override any user provided weights\\n # \\"weight_by_num_documents\\": false,\\n # \\"weighted_sampler_alpha\\": 0.3,\\n\\n \\"vocab-file\\": \\"data/code-vocab.json\\",\\n \\"merge-file\\": \\"data/code-merges.txt\\",\\n\\n \\"save\\": \\"checkpoints\\",\\n \\"load\\": \\"checkpoints\\",\\n \\"checkpoint_validation_with_forward_pass\\": False,\\n \\n \\"tensorboard-dir\\": \\"tensorboard\\",\\n \\"log-dir\\": \\"logs\\",\\n \\"use_wandb\\": True,\\n \\"wandb_host\\": \\"https://api.wandb.ai\\",\\n \\"wandb_project\\": \\"neox\\"\\n}", "2-7B.yml": "# GPT-2 pretraining setup\\n{\\n # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages\\n # across the node boundaries )\\n \\"pipe-parallel-size\\": 1,\\n \\"model-parallel-size\\": 1,\\n\\n # model settings\\n \\"num-layers\\": 32,\\n \\"hidden-size\\": 2560,\\n \\"num-attention-heads\\": 32,\\n \\"seq-length\\": 2048,\\n \\"max-position-embeddings\\": 2048,\\n \\"norm\\": \\"layernorm\\",\\n \\"pos-emb\\": \\"rotary\\",\\n \\"no-weight-tying\\": true,\\n\\n # these should provide some speedup but takes a while to build, set to true if desired\\n \\"scaled-upper-triang-masked-softmax-fusion\\": true,\\n \\"bias-gelu-fusion\\": true,\\n\\n # optimizer settings\\n \\"zero_allow_untested_optimizer\\": true,\\n \\"optimizer\\": {\\n \\"type\\": \\"adam\\",\\n \\"params\\": {\\n \\"lr\\": 0.00016,\\n \\"betas\\": [0.9, 0.999],\\n \\"eps\\": 1.0e-8,\\n }\\n },\\n \\"zero_optimization\\": {\\n \\"stage\\": 1,\\n \\"allgather_partitions\\": True,\\n \\"allgather_bucket_size\\": 500000000,\\n \\"overlap_comm\\": True,\\n \\"reduce_scatter\\": True,\\n \\"reduce_bucket_size\\": 500000000,\\n \\"contiguous_gradients\\": True,\\n \\"cpu_offload\\": False\\n },\\n\\n # batch / data settings\\n \\"train_micro_batch_size_per_gpu\\": 8,\\n \\"gradient_accumulation_steps\\": 4,\\n \\"data-impl\\": \\"mmap\\",\\n \\"split\\": \\"989,10,1\\",\\n\\n # activation checkpointing\\n \\"checkpoint-activations\\": true,\\n \\"checkpoint-num-layers\\": 1,\\n \\"partition-activations\\": true,\\n \\"synchronize-each-layer\\": true,\\n\\n # regularization\\n \\"gradient_clipping\\": 1.0,\\n \\"weight-decay\\": 0,\\n \\"hidden-dropout\\": 0,\\n \\"attention-dropout\\": 0,\\n\\n # precision settings\\n \\"fp16\\": { \\n \\"fp16\\": true,\\n \\"enabled\\": true,\\n \\"loss_scale\\": 0,\\n \\"initial_scale_power\\": 16,\\n \\"loss_scale_window\\": 1000,\\n \\"hysteresis\\": 2,\\n \\"min_loss_scale\\": 1\\n },\\n\\n # misc. training settings\\n \\"train-iters\\": 160000,\\n \\"lr-decay-iters\\": 160000,\\n \\"distributed-backend\\": \\"nccl\\",\\n \\"lr-decay-style\\": \\"cosine\\",\\n \\"warmup\\": 0.01,\\n \\"save-interval\\": 1000,\\n \\"eval-interval\\": 1000,\\n \\"eval-iters\\": 10,\\n\\n # logging\\n \\"log-interval\\": 100,\\n \\"steps_per_print\\": 10,\\n \\"keep-last-n-checkpoints\\": 1,\\n \\"wall_clock_breakdown\\": true,\\n}\\n"}, "load": "checkpoints", "save_interval": 1000, "batch_size": 8, "train_iters": 160000, "eval_iters": 10, "keep_last_n_checkpoints": 1, "split": "989,10,1", "vocab_file": "data/code-vocab.json", "merge_file": "data/code-merges.txt", "attention_dropout": 0, "hidden_dropout": 0, "weight_decay": 0, "checkpoint_activations": true, "synchronize_each_layer": true, "partition_activations": true, "gas": 4, "clip_grad": 1.0, "dynamic_loss_scale": true, "pipe_parallel_size": 1, "is_pipe_parallel": true, "use_wandb": true, "wandb_group": "jtRPtjruy7PQkWHayfg7cH_6sweym4s", "log_dir": "logs", "tensorboard_dir": "tensorboard", "log_interval": 100, "text_gen_type": "interactive", "temperature": 0.5, "maximum_tokens": 256, "sample_input_file": "sample_input.txt", "sample_output_file": "sample_output.txt", "num_samples": 10, "user_script": "generate.py", "global_num_gpus": 2}']' returned non-zero exit status 1.
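If it helps to narrow this down: as far as I can tell, the failing F.linear call in megatron/mpu/layers.py boils down to a plain fp16 matrix multiply, so a minimal repro sketch (the shapes are my assumption, based on the hidden_size of 2560 above) would be:

import torch

# fp16 GEMM on the GPU; this should exercise the same cublasGemmEx path
# (CUDA_R_16F inputs) named in the RuntimeError above
x = torch.randn(8, 2560, dtype=torch.float16, device="cuda")
w = torch.randn(2560, 2560, dtype=torch.float16, device="cuda")
y = torch.nn.functional.linear(x, w)
print(y.shape)

Has anyone successfully run this 2.7B checkpoint on Pascal (sm_60) GPUs, or is CUBLAS_STATUS_EXECUTION_FAILED expected for fp16 inference on P100s?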