
add nvidia megatron code

master
zhanghangit, 1 month ago
commit bf716fb7c0
100 changed files with 105554 additions and 3 deletions
  1. +0 -3  README.md
  2. +30000 -0  bpe_3w_new/.ipynb_checkpoints/chinese_vocab-checkpoint.vocab
  3. +0 -0  bpe_3w_new/.ipynb_checkpoints/merges-checkpoint.txt
  4. +1 -0  bpe_3w_new/.ipynb_checkpoints/vocab-checkpoint.json
  5. BIN  bpe_3w_new/chinese_vocab.model
  6. +30000 -0  bpe_3w_new/chinese_vocab.vocab
  7. +0 -0  bpe_3w_new/merges.txt
  8. +1 -0  bpe_3w_new/vocab.json
  9. +6 -0  nvidia-code/.gitignore
  10. BIN  nvidia-code/bpe_3w_new/chinese_vocab.model
  11. +30000 -0  nvidia-code/bpe_3w_new/chinese_vocab.vocab
  12. +0 -0  nvidia-code/bpe_3w_new/merges.txt
  13. +1 -0  nvidia-code/bpe_3w_new/vocab.json
  14. +326 -0  nvidia-code/data/gpt2_dataset_dev_eval.py
  15. +120 -0  nvidia-code/eval_gpt2.py
  16. +57 -0  nvidia-code/examples/.ipynb_checkpoints/pretrain_gpt2_distributed_2.6B-checkpoint.sh
  17. BIN  nvidia-code/examples/.pretrain_gpt2_distributed_2.6B.sh.swo
  18. +62 -0  nvidia-code/examples/debug_pretrain_gpt2_distributed_xxxM.sh
  19. +64 -0  nvidia-code/examples/evalPPL_gpt2_distributed.sh
  20. +38 -0  nvidia-code/examples/evaluate_zeroshot_gpt2.sh
  21. +44 -0  nvidia-code/examples/finetune_mnli_distributed.sh
  22. +47 -0  nvidia-code/examples/finetune_race_distributed.sh
  23. +25 -0  nvidia-code/examples/generate_text.sh
  24. +37 -0  nvidia-code/examples/generate_text_cmrc2018.sh
  25. +18 -0  nvidia-code/examples/merge_mp_bert.sh
  26. +35 -0  nvidia-code/examples/pretrain_bert.sh
  27. +44 -0  nvidia-code/examples/pretrain_bert_distributed.sh
  28. +43 -0  nvidia-code/examples/pretrain_gpt2.sh
  29. +52 -0  nvidia-code/examples/pretrain_gpt2_distributed.sh
  30. +57 -0  nvidia-code/examples/pretrain_gpt2_distributed_2.6B.sh
  31. +57 -0  nvidia-code/examples/pretrain_gpt2_distributed_345M.sh
  32. +142 -0  nvidia-code/get_ib_throughput.sh
  33. +13 -0  nvidia-code/ib_speed_stat.sh
  34. +12 -0  nvidia-code/images/Makefile
  35. BIN  nvidia-code/images/cases.png
  36. BIN  nvidia-code/images/scaling-dp.png
  37. BIN  nvidia-code/images/scaling-mp.png
  38. +40 -0  nvidia-code/images/tables.tex
  39. +611 -0  nvidia-code/megatron/.ipynb_checkpoints/training-checkpoint.py
  40. +41 -0  nvidia-code/megatron/__init__.py
  41. +493 -0  nvidia-code/megatron/arguments.py
  42. +302 -0  nvidia-code/megatron/checkpointing.py
  43. BIN  nvidia-code/megatron/data/.gpt2_dataset.py.swp
  44. +9 -0  nvidia-code/megatron/data/Makefile
  45. +1 -0  nvidia-code/megatron/data/__init__.py
  46. +232 -0  nvidia-code/megatron/data/bert_dataset.py
  47. +503 -0  nvidia-code/megatron/data/dataset_utils.py
  48. +317 -0  nvidia-code/megatron/data/gpt2_dataset.py
  49. +643 -0  nvidia-code/megatron/data/helpers.cpp
  50. BIN  nvidia-code/megatron/data/helpers.cpython-36m-x86_64-linux-gnu.so
  51. +140 -0  nvidia-code/megatron/data/ict_dataset.py
  52. +570 -0  nvidia-code/megatron/data/indexed_dataset.py
  53. +201 -0  nvidia-code/megatron/data/realm_dataset_utils.py
  54. +216 -0  nvidia-code/megatron/data/realm_index.py
  55. +148 -0  nvidia-code/megatron/data/samplers.py
  56. +125 -0  nvidia-code/megatron/data/test/test_indexed_dataset.py
  57. +10 -0  nvidia-code/megatron/data/test/test_preprocess_data.sh
  58. +141 -0  nvidia-code/megatron/deprecated_data_utils/__init__.py
  59. +252 -0  nvidia-code/megatron/deprecated_data_utils/configure_data.py
  60. +61 -0  nvidia-code/megatron/deprecated_data_utils/corpora.py
  61. +883 -0  nvidia-code/megatron/deprecated_data_utils/datasets.py
  62. +253 -0  nvidia-code/megatron/deprecated_data_utils/file_utils.py
  63. +202 -0  nvidia-code/megatron/deprecated_data_utils/lazy_loader.py
  64. +143 -0  nvidia-code/megatron/deprecated_data_utils/samplers.py
  65. +27 -0  nvidia-code/megatron/deprecated_data_utils/scripts/presplit_sentences_json.py
  66. +141 -0  nvidia-code/megatron/deprecated_data_utils/scripts/split_gpt2_json.py
  67. +126 -0  nvidia-code/megatron/deprecated_data_utils/scripts/split_json.py
  68. +129 -0  nvidia-code/megatron/deprecated_data_utils/tf_dl.py
  69. +922 -0  nvidia-code/megatron/deprecated_data_utils/tokenization.py
  70. +319 -0  nvidia-code/megatron/deprecated_data_utils/tokenization_gpt2.py
  71. +391 -0  nvidia-code/megatron/deprecated_data_utils/wordpiece.py
  72. +598 -0  nvidia-code/megatron/eval_ppl.py
  73. +30 -0  nvidia-code/megatron/fp16/__init__.py
  74. +651 -0  nvidia-code/megatron/fp16/fp16.py
  75. +216 -0  nvidia-code/megatron/fp16/fp16util.py
  76. +256 -0  nvidia-code/megatron/fp16/loss_scaler.py
  77. +30 -0  nvidia-code/megatron/fp16_cpm/__init__.py
  78. +629 -0  nvidia-code/megatron/fp16_cpm/fp16.py
  79. +204 -0  nvidia-code/megatron/fp16_cpm/fp16util.py
  80. +237 -0  nvidia-code/megatron/fp16_cpm/loss_scaler.py
  81. +2 -0  nvidia-code/megatron/fp16_cpm/node0.log
  82. +100 -0  nvidia-code/megatron/fused_kernels/__init__.py
  83. +74 -0  nvidia-code/megatron/fused_kernels/scaled_masked_softmax.cpp
  84. +452 -0  nvidia-code/megatron/fused_kernels/scaled_masked_softmax.h
  85. +102 -0  nvidia-code/megatron/fused_kernels/scaled_masked_softmax_cuda.cu
  86. +69 -0  nvidia-code/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp
  87. +439 -0  nvidia-code/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h
  88. +89 -0  nvidia-code/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
  89. +233 -0  nvidia-code/megatron/global_vars.py
  90. +91 -0  nvidia-code/megatron/indexer.py
  91. +168 -0  nvidia-code/megatron/initialize.py
  92. +123 -0  nvidia-code/megatron/learning_rates.py
  93. +145 -0  nvidia-code/megatron/memory.py
  94. +21 -0  nvidia-code/megatron/model/__init__.py
  95. +200 -0  nvidia-code/megatron/model/bert_model.py
  96. +97 -0  nvidia-code/megatron/model/classification.py
  97. +112 -0  nvidia-code/megatron/model/distributed.py
  98. +60 -0  nvidia-code/megatron/model/fused_bias_gelu.py
  99. +127 -0  nvidia-code/megatron/model/fused_softmax.py
  100. +105 -0  nvidia-code/megatron/model/gpt2_model.py

+0 -3  README.md

@@ -1,3 +0,0 @@
# Chinese-Megatron

A Chinese Megatron model with 2.6B parameters, trained on 100 GB of high-quality Chinese corpus using 128 V100 GPUs.

+30000 -0  bpe_3w_new/.ipynb_checkpoints/chinese_vocab-checkpoint.vocab (file diff suppressed because it is too large)


+0 -0  bpe_3w_new/.ipynb_checkpoints/merges-checkpoint.txt


+1 -0  bpe_3w_new/.ipynb_checkpoints/vocab-checkpoint.json (file diff suppressed because it is too large)


BIN  bpe_3w_new/chinese_vocab.model


+30000 -0  bpe_3w_new/chinese_vocab.vocab (file diff suppressed because it is too large)


+0 -0  bpe_3w_new/merges.txt


+1 -0  bpe_3w_new/vocab.json (file diff suppressed because it is too large)


+6 -0  nvidia-code/.gitignore

@@ -0,0 +1,6 @@
__pycache__

# Distribution / packaging
build/
dist/
*.egg-info/

BIN  nvidia-code/bpe_3w_new/chinese_vocab.model


+30000 -0  nvidia-code/bpe_3w_new/chinese_vocab.vocab (file diff suppressed because it is too large)


+0 -0  nvidia-code/bpe_3w_new/merges.txt


+1 -0  nvidia-code/bpe_3w_new/vocab.json (file diff suppressed because it is too large)


+326 -0  nvidia-code/data/gpt2_dataset_dev_eval.py

@@ -0,0 +1,326 @@
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""GPT2 style dataset."""

import os
import time

import numpy as np
import torch

from megatron import mpu, print_rank_0
from megatron.data.dataset_utils import get_train_valid_test_split_
from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset

from megatron import get_args


def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
train_valid_test_num_samples,
seq_length, seed, skip_warmup):
"""Build train, valid, and test datasets."""
# data_prefix: /gdata/ChineseCorpus/Megatron-training/data-v2-labeled/my-gpt2text_document
# Indexed dataset.
indexed_dataset = get_indexed_dataset_(data_prefix,
data_impl,
skip_warmup)

total_num_of_documents = indexed_dataset.sizes.shape[0]
splits = get_train_valid_test_split_(splits_string, total_num_of_documents)
# Print stats about the splits.
print_rank_0(' > dataset split:')

def print_split_stats(name, index):
print_rank_0(' {}:'.format(name))
print_rank_0(' document indices in [{}, {}) total of {} '
'documents'.format(splits[index], splits[index + 1],
splits[index + 1] - splits[index]))
print_split_stats('train', 0)
print_split_stats('validation', 1)
print_split_stats('test', 2)

def build_dataset(index, name):
dataset = None
if splits[index + 1] > splits[index]:
documents = np.arange(start=splits[index], stop=splits[index + 1],
step=1, dtype=np.int32)
dataset = GPT2Dataset(name, data_prefix,
documents, indexed_dataset,
train_valid_test_num_samples[index],
seq_length, seed)
return dataset

train_dataset = build_dataset(0, 'train')
valid_dataset = build_dataset(1, 'valid')
test_dataset = build_dataset(2, 'test')

return (train_dataset, valid_dataset, test_dataset)


def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
"""Build indexed dataset."""
print_rank_0(' > building dataset index ...')

start_time = time.time()
indexed_dataset = make_indexed_dataset(data_prefix,
data_impl,
skip_warmup)
print_rank_0(' > finished creating indexed dataset in {:4f} '
'seconds'.format(time.time() - start_time))
print_rank_0(' number of documents: {}'.format(
indexed_dataset.sizes.shape[0]))

return indexed_dataset


class GPT2Dataset(torch.utils.data.Dataset):

def __init__(self, name, data_prefix, documents, indexed_dataset,
num_samples, seq_length, seed):

self.name = name
self.indexed_dataset = indexed_dataset

# Checks
assert np.min(documents) >= 0
assert np.max(documents) < indexed_dataset.sizes.shape[0]

# Build index mappings.
self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings(
self.name, data_prefix, documents, self.indexed_dataset.sizes,
num_samples, seq_length, seed)

def __len__(self):
# -1 is due to data structure used to retrieve the index:
# sample i --> [sample_idx[i], sample_idx[i+1])
return self.sample_idx.shape[0] - 1

def __getitem__(self, idx):
# Get the shuffled index.
idx = self.shuffle_idx[idx]
# Start and end documents and offsets.
doc_index_f = self.sample_idx[idx][0]
doc_index_l = self.sample_idx[idx + 1][0]
offset_f = self.sample_idx[idx][1]
offset_l = self.sample_idx[idx + 1][1]
# If we are within the same document, just extract the chunk.
if doc_index_f == doc_index_l:
sample = self.indexed_dataset.get(self.doc_idx[doc_index_f],
offset=offset_f,
length=offset_l - offset_f + 1)
else:
# Otherwise, get the rest of the initial document.
sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f],
offset=offset_f)]
# Loop over all in between documents and add the entire document.
for i in range(doc_index_f + 1, doc_index_l):
sample_list.append(self.indexed_dataset.get(self.doc_idx[i]))
# And finally add the relevant portion of last document.
sample_list.append(self.indexed_dataset.get(
self.doc_idx[doc_index_l],
length=offset_l + 1))
sample = np.concatenate(sample_list)

return {'text': np.array(sample, dtype=np.int64)}


def _build_index_mappings(name, data_prefix, documents, sizes,
num_samples, seq_length, seed):
"""Build doc-idx, sample-idx, and shuffle-idx.
doc-idx: is an array (ordered) of documents to be used in training.
sample-idx: is the start document index and document offset for each
training sample.
shuffle-idx: maps the sample index into a random index into sample-idx.
"""
# Number of tokens in each epoch and number of required epochs.
tokens_per_epoch = _num_tokens(documents, sizes)
num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples)
args = get_args()
BATCH_SIZE = 128
USING_GPUS_NUM = 16
args.eval_iters = tokens_per_epoch // seq_length // (BATCH_SIZE * USING_GPUS_NUM) + 1
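# Illustrative example of the override above (hypothetical corpus size): with
# seq_length=1024, BATCH_SIZE=128 and USING_GPUS_NUM=16, a dev split of ~134M tokens
# gives 134_000_000 // 1024 // (128 * 16) + 1 = 64 evaluation iterations.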
# rng state
np_rng = np.random.RandomState(seed=seed)

# Filename of the index mappings.
_filename = data_prefix
_filename += '_{}_indexmap'.format(name)
_filename += '_{}ns'.format(num_samples)
_filename += '_{}sl'.format(seq_length)
_filename += '_{}s'.format(seed)
doc_idx_filename = _filename + '_doc_idx.npy'
sample_idx_filename = _filename + '_sample_idx.npy'
shuffle_idx_filename = _filename + '_shuffle_idx.npy'

# Build the indexed mapping if not exist.
if torch.distributed.get_rank() == 0:
if (not os.path.isfile(doc_idx_filename)) or \
(not os.path.isfile(sample_idx_filename)) or \
(not os.path.isfile(shuffle_idx_filename)):

print_rank_0(' > WARNING: could not find index map files, building '
'the indices on rank 0 ...')
# doc-idx.
start_time = time.time()
doc_idx = _build_doc_idx(documents, num_epochs, np_rng)
np.save(doc_idx_filename, doc_idx, allow_pickle=True)
print_rank_0(' > elapsed time to build and save doc-idx mapping '
'(seconds): {:4f}'.format(time.time() - start_time))
# sample-idx.
start_time = time.time()
# Use C++ implementation for speed.
# First compile and then import.
from megatron.data.dataset_utils import compile_helper
compile_helper()
from megatron.data import helpers
assert doc_idx.dtype == np.int32
assert sizes.dtype == np.int32
sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length,
num_epochs, tokens_per_epoch)
# sample_idx = _build_sample_idx(sizes, doc_idx, seq_length,
# num_epochs, tokens_per_epoch)
np.save(sample_idx_filename, sample_idx, allow_pickle=True)
print_rank_0(' > elapsed time to build and save sample-idx mapping '
'(seconds): {:4f}'.format(time.time() - start_time))
# shuffle-idx.
start_time = time.time()
# -1 is due to data structure used to retrieve the index:
# sample i --> [sample_idx[i], sample_idx[i+1])
shuffle_idx = _build_shuffle_idx(sample_idx.shape[0] - 1, np_rng)
np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)
print_rank_0(' > elapsed time to build and save shuffle-idx mapping'
' (seconds): {:4f}'.format(time.time() - start_time))

# This should be a barrier but nccl barrier assumes
# device_index=rank which is not the case for model
# parallel case
counts = torch.cuda.LongTensor([1])
torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
assert counts[0].item() == torch.distributed.get_world_size(
group=mpu.get_data_parallel_group())

# Load mappings.
start_time = time.time()
print_rank_0(' > loading doc-idx mapping from {}'.format(
doc_idx_filename))
doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r')
print_rank_0(' > loading sample-idx mapping from {}'.format(
sample_idx_filename))
sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r')
print_rank_0(' > loading shuffle-idx mapping from {}'.format(
shuffle_idx_filename))
shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode='r')
print_rank_0(' loaded indexed file in {:3.3f} seconds'.format(
time.time() - start_time))
print_rank_0(' total number of samples: {}'.format(
sample_idx.shape[0]))
print_rank_0(' total number of epochs: {}'.format(num_epochs))

return doc_idx, sample_idx, shuffle_idx


def _num_tokens(documents, sizes):
"""Total number of tokens in the dataset."""
return np.sum(sizes[documents])


def _num_epochs(tokens_per_epoch, seq_length, num_samples):
"""Based on number of samples and sequence lenght, calculate how many
epochs will be needed."""
num_epochs = 0
total_tokens = 0
while True:
num_epochs += 1
total_tokens += tokens_per_epoch
# -1 is because we need to retrieve seq_length + 1 token each time
# but the last token will overlap with the first token of the next
# sample except for the last sample.
if ((total_tokens - 1) // seq_length) >= num_samples:
return num_epochs


def _build_doc_idx(documents, num_epochs, np_rng):
"""Build an array with length = number-of-epochs * number-of-dcuments.
Each index is mapped to a corresponding document."""
doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1]
doc_idx[:] = documents
doc_idx = doc_idx.reshape(-1)
doc_idx = doc_idx.astype(np.int32)
np_rng.shuffle(doc_idx)
return doc_idx


def _build_sample_idx(sizes, doc_idx, seq_length,
num_epochs, tokens_per_epoch):
"""Sample index mapping is a 2D array with sizes
[number-of-samples + 1, 2] where [..., 0] contains
the index into `doc_idx` and [..., 1] is the
starting offset in that document."""

# Total number of samples. For -1 see comments in `_num_epochs`.
num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length
sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int32)

# Index into sample_idx.
sample_index = 0
# Index into doc_idx.
doc_idx_index = 0
# Beginning offset for each document.
doc_offset = 0
# Start with first document and no offset.
sample_idx[sample_index][0] = doc_idx_index
sample_idx[sample_index][1] = doc_offset
sample_index += 1
while sample_index <= num_samples:
# Start with a fresh sequence.
remaining_seq_length = seq_length + 1
while remaining_seq_length != 0:
# Get the document length.
doc_id = doc_idx[doc_idx_index]
doc_length = sizes[doc_id] - doc_offset
# And add it to the current sequence.
remaining_seq_length -= doc_length
# If we have more than a full sequence, adjust offset and set
# remaining length to zero so we return from the while loop.
# Note that -1 here is for the same reason we have -1 in
# `_num_epochs` calculations.
if remaining_seq_length <= 0:
doc_offset += (remaining_seq_length + doc_length - 1)
remaining_seq_length = 0
else:
# Otherwise, start from the beginning of the next document.
doc_idx_index += 1
doc_offset = 0
# Record the sequence.
sample_idx[sample_index][0] = doc_idx_index
sample_idx[sample_index][1] = doc_offset
sample_index += 1

return sample_idx


def _build_shuffle_idx(size, np_rng):
"""Build the range [0, size) and shuffle."""
dtype_ = np.uint32
if size >= (np.iinfo(np.uint32).max - 1):
dtype_ = np.int64
shuffle_idx = np.arange(start=0, stop=size, step=1, dtype=dtype_)
np_rng.shuffle(shuffle_idx)
return shuffle_idx

+120 -0  nvidia-code/eval_gpt2.py

@@ -0,0 +1,120 @@
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Pretrain GPT2"""

import torch

from megatron import get_args
from megatron import print_rank_0
from megatron import get_timers
from megatron import get_tokenizer
from megatron import mpu
from megatron.data.gpt2_dataset_dev_eval import build_train_valid_test_datasets
from megatron.model import GPT2Model
from megatron.eval_ppl import pretrain
from megatron.utils import get_ltor_masks_and_position_ids
from megatron.utils import reduce_losses

def model_provider():
"""Build the model."""

print_rank_0('building GPT2 model ...')
model = GPT2Model(num_tokentypes=0, parallel_output=True)

return model


def get_batch(data_iterator):
"""Generate a batch"""
args = get_args()

# Items and their type.
keys = ['text']
datatype = torch.int64

# Broadcast data.
if data_iterator is not None:
data = next(data_iterator)
else:
data = None
data_b = mpu.broadcast_data(keys, data, datatype)

# Unpack.
tokens_ = data_b['text'].long()
labels = tokens_[:, 1:].contiguous()
tokens = tokens_[:, :-1].contiguous()

# Get the masks and position ids.
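# Note: the EOD-token argument is hard-coded to 30000 here instead of tokenizer.eod,
# presumably to match the 30k-entry bpe_3w_new vocabulary (assumption, not verified).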
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
tokens,
30000,
args.reset_position_ids,
args.reset_attention_mask,
args.eod_mask_loss)

return tokens, labels, loss_mask, attention_mask, position_ids


def forward_step(data_iterator, model):
"""Forward step."""
args = get_args()
timers = get_timers()

# Get the batch.
timers('batch generator').start()
try:
tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
data_iterator)
except:
# If the batch cannot be fetched (e.g. the data iterator is exhausted), report a zero loss.
return 0, {'lm loss': 0.0}

timers('batch generator').stop()
# Forward model.
losses = model(tokens, position_ids, attention_mask, labels=labels)
loss_mask = loss_mask.view(-1)
## average the loss per character (token)
loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()

# Reduce loss for logging (take the average).
reduced_loss = reduce_losses([loss])

return loss, {'lm loss': reduced_loss[0]}


def train_valid_test_datasets_provider(train_val_test_num_samples):
"""Build train, valid, and test datasets."""
args = get_args()

print_rank_0('> building train, validation, and test datasets '
'for GPT2 ...')
train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
data_prefix=args.data_path,
data_impl=args.data_impl,
splits_string=args.split,
train_valid_test_num_samples=train_val_test_num_samples,
seq_length=args.seq_length,
seed=args.seed,
skip_warmup=(not args.mmap_warmup))
print_rank_0("> finished creating GPT2 datasets ...")

return train_ds, valid_ds, test_ds


if __name__ == "__main__":

pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})
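
The script above reports the token-averaged `lm loss`; the companion script examples/evalPPL_gpt2_distributed.sh runs it over a list of dev sets. A minimal sketch (with a hypothetical loss value) of how a perplexity number is usually recovered from that logged loss:

import math

lm_loss = 3.21                    # hypothetical token-averaged 'lm loss' read from the eval log
perplexity = math.exp(lm_loss)    # perplexity is the exponential of the mean cross-entropy per token
print(f"ppl = {perplexity:.2f}")  # -> ppl = 24.78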

+57 -0  nvidia-code/examples/.ipynb_checkpoints/pretrain_gpt2_distributed_2.6B-checkpoint.sh

@@ -0,0 +1,57 @@
#! /bin/bash

# Runs the "2.6B" parameter model

GPUS_PER_NODE=16
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=3333
NNODES=4
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$1))

#DATA_PATH=<Specify path and file prefix>_text_document
#CHECKPOINT_PATH=<Specify path>
CHECKPOINT_PATH=checkpoints/gpt2_64_2.6B
DATA_PATH=$4
VOCAB_FILE=bpe_3w_new/vocab.json
MERGE_FILE=bpe_3w_new/merges.txt

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $1 --node_rank $2 --master_addr $3 --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt2.py \
--model-parallel-size 16 \
--num-layers 32 \
--hidden-size 2560 \
--num-attention-heads 32 \
--batch-size 128 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 240000 \
--lr-decay-iters 153600 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--data-impl mmap \
--split 950,49,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--warmup .01 \
--checkpoint-activations \
--log-interval 1 \
--tensorboard-dir logs/gpt2_64_2.6B \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16 \



set +x

BIN  nvidia-code/examples/.pretrain_gpt2_distributed_2.6B.sh.swo


+62 -0  nvidia-code/examples/debug_pretrain_gpt2_distributed_xxxM.sh

@@ -0,0 +1,62 @@
#! /bin/bash

# Runs the "xxxM" parameter model

GPUS_PER_NODE=16
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=0512
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$1))

#DATA_PATH=<Specify path and file prefix>_text_document
#CHECKPOINT_PATH=<Specify path>
CHECKPOINT_PATH=checkpoints/gpt2_64_xxxM
DATA_PATH=$4
VOCAB_FILE=bpe_3w_new/vocab.json
MERGE_FILE=bpe_3w_new/merges.txt

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $1 --node_rank $2 --master_addr $3 --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt2.py \
--model-parallel-size 2 \
--num-layers 54 \
--hidden-size 1920 \
--num-attention-heads 20 \
--batch-size 8 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 120000 \
--lr-decay-iters 76800 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--data-impl mmap \
--split 950,49,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--warmup .01 \
--log-interval 50 \
--tensorboard-dir logs/gpt2_64_xxxxxxxxx \
--save-interval 10000 \
--eval-interval 100 \
--eval-iters 10 \
--checkpoint-num-layers 1 \
--fp16 \
--fp16-lm-cross-entropy \
--checkpoint-activations \
--distribute-checkpointed-activations \
--DDP-impl torch \
--use-cpu-initialization \



set +x

+64 -0  nvidia-code/examples/evalPPL_gpt2_distributed.sh

@@ -0,0 +1,64 @@
#! /bin/bash

# Runs the "345M" parameter model

GPUS_PER_NODE=16
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=7989
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE))

#DATA_PATH=<Specify path and file prefix>_text_document
#CHECKPOINT_PATH=<Specify path>
CHECKPOINT_PATH=/userhome/jobs/Megatron-LM/datav1v2-tag-res/checkpoints/
VOCAB_FILE=bpe_3w_new/vocab.json
MERGE_FILE=bpe_3w_new/merges.txt

# TAG_DATA_NAME_LIST=('data-v1-v2-tag/dev/BAAI_JDDC')
# 'data-v1-v2-tag/dev/BaiDuQA' 'data-v1-v2-tag/dev/DuReader' 'data-v1-v2-tag/dev/Laisi-crmc-1' 'data-v1-v2-tag/dev/new2016zh' 'data-v1-v2-tag/dev/qianyan_chat' 'data-v1-v2-tag/dev/webtext2019zh' 'data-v1-v2-tag/dev/wiki_zh' 'data-v1-v2-notag/dev/baike_qa2019' 'data-v1-v2-tag/dev/THUCNews' 'data-v1-v2-tag/dev/CAIL2018_ALL_DATA' 'data-v1-v2-tag/dev/LCCC' 'data-v1-v2-tag/dev/ownthink_triples' 'data-v1-v2-tag/dev/Sogou-CA' 'data-v1-v2-tag/dev/NLP-FAQ')

# DATA_NAME_LIST=('BaiDuQA')

DATA_NAME_LIST=('tag/dev/BAAI-JDDC' 'tag/dev/BaiDuQA' 'tag/dev/DuReader' 'tag/dev/Laisi-crmc-1' 'tag/dev/new2016zh' 'tag/dev/qianyan_chat' 'tag/dev/webtext2019zh' 'tag/dev/wiki_zh' 'tag/dev/baike_qa2019' 'tag/dev/THUCNews' 'tag/dev/CAIL2018_ALL_DATA' 'tag/dev/LCCC' 'tag/dev/ownthink_triples' 'tag/dev/Sogou-CA' 'tag/dev/NLP-FAQ' 'notag/dev/BAAI-JDDC' 'notag/dev/BaiDuQA' 'notag/dev/DuReader' 'notag/dev/Laisi-crmc-1' 'notag/dev/new2016zh' 'notag/dev/qianyan_chat' 'notag/dev/webtext2019zh' 'notag/dev/wiki_zh' 'notag/dev/baike_qa2019' 'notag/dev/THUCNews' 'notag/dev/CAIL2018_ALL_DATA' 'notag/dev/LCCC' 'notag/dev/ownthink_triples' 'notag/dev/Sogou-CA' 'notag/dev/NLP-FAQ')

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

for i in ${DATA_NAME_LIST[*]};do
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
eval_gpt2.py \
--model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 128 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 123 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path '/gdata/ChineseCorpus/Megatron-training/data-v1-v2-'${i}'text_document' \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--data-impl mmap \
--split 1000,0,0 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--warmup .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 1 \
--fp16
done



set +x

+38 -0  nvidia-code/examples/evaluate_zeroshot_gpt2.sh

@@ -0,0 +1,38 @@
#!/bin/bash

WORLD_SIZE=8

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"

TASK="LAMBADA"

VALID_DATA=<lambada path>
VOCAB_FILE=gpt2-vocab.json
MERGE_FILE=gpt2-merges.txt
CHECKPOINT=checkpoints/gpt2_345m


python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
--task $TASK \
--valid-data $VALID_DATA \
--tokenizer-type GPT2BPETokenizer \
--strict-lambada \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--load $CHECKPOINT \
--model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 8 \
--checkpoint-activations \
--seq-length 1024 \
--max-position-embeddings 1024 \
--log-interval 10 \
--fp16 \
--no-load-optim \
--no-load-rng

+44 -0  nvidia-code/examples/finetune_mnli_distributed.sh

@@ -0,0 +1,44 @@
#!/bin/bash

WORLD_SIZE=8

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"

TRAIN_DATA="data/glue_data/MNLI/train.tsv"
VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \
data/glue_data/MNLI/dev_mismatched.tsv"
PRETRAINED_CHECKPOINT=checkpoints/bert_345m
VOCAB_FILE=bert-vocab.txt
CHECKPOINT_PATH=checkpoints/bert_345m_mnli

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
--task MNLI \
--seed 1234 \
--train-data $TRAIN_DATA \
--valid-data $VALID_DATA \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file $VOCAB_FILE \
--epochs 5 \
--pretrained-checkpoint $PRETRAINED_CHECKPOINT \
--model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 8 \
--checkpoint-activations \
--lr 5.0e-5 \
--lr-decay-style linear \
--warmup 0.065 \
--seq-length 512 \
--max-position-embeddings 512 \
--save-interval 500000 \
--save $CHECKPOINT_PATH \
--log-interval 10 \
--eval-interval 100 \
--eval-iters 50 \
--weight-decay 1.0e-1 \
--fp16

+47 -0  nvidia-code/examples/finetune_race_distributed.sh

@@ -0,0 +1,47 @@
#!/bin/bash

WORLD_SIZE=8

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"

TRAIN_DATA="data/RACE/train/middle"
VALID_DATA="data/RACE/dev/middle \
data/RACE/dev/high"
VOCAB_FILE=bert-vocab.txt
PRETRAINED_CHECKPOINT=checkpoints/bert_345m
CHECKPOINT_PATH=checkpoints/bert_345m_race

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
--task RACE \
--seed 1234 \
--train-data $TRAIN_DATA \
--valid-data $VALID_DATA \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file $VOCAB_FILE \
--epochs 3 \
--pretrained-checkpoint $PRETRAINED_CHECKPOINT \
--model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 4 \
--checkpoint-activations \
--lr 1.0e-5 \
--lr-decay-style linear \
--warmup 0.06 \
--seq-length 512 \
--max-position-embeddings 512 \
--save-interval 100000 \
--save $CHECKPOINT_PATH \
--log-interval 10 \
--eval-interval 100 \
--eval-iters 50 \
--weight-decay 1.0e-1 \
--clip-grad 1.0 \
--hidden-dropout 0.1 \
--attention-dropout 0.1 \
--fp16

+25 -0  nvidia-code/examples/generate_text.sh

@@ -0,0 +1,25 @@
#!/bin/bash

CHECKPOINT_PATH=checkpoints/gpt2_345m
VOCAB_FILE=gpt2-vocab.json
MERGE_FILE=gpt2-merges.txt

python tools/generate_samples_gpt2.py \
--model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--load $CHECKPOINT_PATH \
--num-attention-heads 16 \
--max-position-embeddings 1024 \
--tokenizer-type GPT2BPETokenizer \
--fp16 \
--batch-size 2 \
--seq-length 1024 \
--out-seq-length 1024 \
--temperature 1.0 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--genfile unconditional_samples.json \
--num-samples 2 \
--top_p 0.9 \
--recompute

+37 -0  nvidia-code/examples/generate_text_cmrc2018.sh

@@ -0,0 +1,37 @@
#!/bin/bash

GPUS_PER_NODE=16
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=2018
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
CHECKPOINT_PATH=checkpoints/
VOCAB_FILE=bpe_3w_new/vocab.json
MERGE_FILE=bpe_3w_new/merges.txt
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
tools/generate_samples_gpt2.py \
--model-parallel-size 4 \
--num-layers 32 \
--hidden-size 2560 \
--num-attention-heads 32 \
--load $CHECKPOINT_PATH \
--max-position-embeddings 1024 \
--tokenizer-type GPT2BPETokenizer \
--fp16 \
--batch-size 2 \
--seq-length 1024 \
--out-seq-length 1024 \
--temperature 1.0 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--genfile unconditional_samples.json \
--num-samples 2 \
--top_p 0.9 \
--recompute



set +x

+18 -0  nvidia-code/examples/merge_mp_bert.sh

@@ -0,0 +1,18 @@
#!/bin/bash

MODEL_PARALLEL_SIZE=2

VOCAB_FILE=bert-vocab.txt
CHECKPOINT_PATH=checkpoints/bert_345m

WORLD_SIZE=$MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \
--model-type BERT \
--model-parallel-size $MODEL_PARALLEL_SIZE \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file $VOCAB_FILE \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 512 \
--max-position-embeddings 512 \
--load $CHECKPOINT_PATH

+35 -0  nvidia-code/examples/pretrain_bert.sh

@@ -0,0 +1,35 @@
#!/bin/bash

RANK=0
WORLD_SIZE=1
DATA_PATH=<Specify path and file prefix>_text_sentence
CHECKPOINT_PATH=<Specify path>

python pretrain_bert.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 4 \
--seq-length 512 \
--max-position-embeddings 512 \
--train-iters 2000000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file bert-vocab.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-decay-style linear \
--lr-decay-iters 990000 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--warmup .01 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16


+44 -0  nvidia-code/examples/pretrain_bert_distributed.sh

@@ -0,0 +1,44 @@
#!/bin/bash

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

DATA_PATH=<Specify path and file prefix>_text_sentence
CHECKPOINT_PATH=<Specify path>

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_bert.py \
--model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 4 \
--seq-length 512 \
--max-position-embeddings 512 \
--train-iters 1000000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file bert-vocab.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.0001 \
--lr-decay-style linear \
--min-lr 1.0e-5 \
--lr-decay-iters 990000 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--warmup .01 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16

+43 -0  nvidia-code/examples/pretrain_gpt2.sh

@@ -0,0 +1,43 @@
#! /bin/bash

# Runs the "345M" parameter model

RANK=0
WORLD_SIZE=1

DATA_PATH=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>


python pretrain_gpt2.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 8 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file gpt2-vocab.json \
--merge-file gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--min-lr 1.0e-5 \
--lr-decay-style cosine \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--warmup .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16


set +x

+52 -0  nvidia-code/examples/pretrain_gpt2_distributed.sh

@@ -0,0 +1,52 @@
#! /bin/bash

# Runs the "345M" parameter model

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

DATA_PATH=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt2.py \
--model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 8 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file gpt2-vocab.json \
--merge-file gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--warmup .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16



set +x

+57 -0  nvidia-code/examples/pretrain_gpt2_distributed_2.6B.sh

@@ -0,0 +1,57 @@
#! /bin/bash

# Runs the "2.6B" parameter model

GPUS_PER_NODE=16
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=3333
NNODES=4
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$1))

#DATA_PATH=<Specify path and file prefix>_text_document
#CHECKPOINT_PATH=<Specify path>
CHECKPOINT_PATH=checkpoints/gpt2_64_2.6B
DATA_PATH=$4
VOCAB_FILE=bpe_3w_new/vocab.json
MERGE_FILE=bpe_3w_new/merges.txt

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $1 --node_rank $2 --master_addr $3 --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt2.py \
--model-parallel-size 16 \
--num-layers 32 \
--hidden-size 2560 \
--num-attention-heads 32 \
--batch-size 128 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 240000 \
--lr-decay-iters 153600 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--data-impl mmap \
--split 950,49,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--warmup .01 \
--checkpoint-activations \
--log-interval 1 \
--tensorboard-dir logs/gpt2_64_2.6B \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16 \



set +x
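
As a sanity check on the "2.6B" name, here is a rough parameter count for the configuration above (32 layers, hidden size 2560, the 30k bpe_3w_new vocabulary, 1024 positions). This is a back-of-the-envelope sketch that ignores bias/layernorm terms and Megatron's vocabulary padding:

layers, hidden, vocab, positions = 32, 2560, 30000, 1024
per_layer = 12 * hidden * hidden            # QKV, attention projection, and the 4x MLP up/down weights
embeddings = (vocab + positions) * hidden   # token + position embeddings
total = layers * per_layer + embeddings
print(f"{total / 1e9:.2f}B")                # ~2.60B, consistent with the script name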

+57 -0  nvidia-code/examples/pretrain_gpt2_distributed_345M.sh

@@ -0,0 +1,57 @@
#! /bin/bash

# Runs the "345M" parameter model

GPUS_PER_NODE=16
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*NNODES))

#DATA_PATH=<Specify path and file prefix>_text_document
#CHECKPOINT_PATH=<Specify path>
CHECKPOINT_PATH=checkpoints/gpt2_345m
DATA_PATH=my-gpt2_text_document
VOCAB_FILE=bpe_3w_new/vocab.json
MERGE_FILE=bpe_3w_new/merges.txt

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt2.py \
--model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 8 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 60000 \
--lr-decay-iters 128000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--data-impl mmap \
--split 950,49,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--warmup .01 \
--checkpoint-activations \
--log-interval 100 \
--tensorboard-dir logs/gpt2_345m \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16



set +x

+142 -0  nvidia-code/get_ib_throughput.sh

@@ -0,0 +1,142 @@
#!/bin/bash
#
# Show live throughput for InfiniBand interfaces based on data counters in /sys/class/infiniband.
#
# Author: Sven Breuner
# Maintainer: Sven Breuner <sven[at]excelero.com>


# Print usage info and exit
usage()
{
echo "Show live throughput for InfiniBand interfaces based on data counters in"
echo "/sys/class/infiniband."
echo
echo "Usage:"
echo " $0 [Options] [Interface:Port [Interface:Port [...] ]"
echo
echo "Options:"
echo " -m Use multiple lines per interface, one for rx and one for tx."
echo " -s Single run. Default is an infinite loop."
echo " -t NUM Time interval in seconds. Defaults to 1 sec."
echo " interface Interface to query counters for. Defaults to automatic query"
echo " of available interfaces based on ibdev2netdev."
echo
echo "Example:"
echo " $ $0 mlx5_0:1 mlx5_1:1"

exit 1
}

# Parse command line arguments and set defaults
parse_args()
{
local OPTIND # local to prevent effects from other subscripts

# default settings
multiline=0 # 1 for multiple lines for rx and tx per interface
quit=0 # 1 for single run instead of infinite loop
t=1 # time interval in seconds

while getopts ":hmst:" opt; do
case "${opt}" in
m)
# Use multiple lines per interface, one for rx and one for tx
multiline=1
;;
s)
# Single run instead of infinite loop
quit=1
;;
t)
# Time interval in seconds
t=${OPTARG}
;;
*)
# Other option arguments are invalid
usage
;;
esac
done

shift $((OPTIND-1))

# Non-option arguments are assumed to be interface names
NICs=($*)

# If no interfaces were given by user then auto detect from ibdev2netdev
# (We ignore "Up/Down" in ibdev2netdev output, because that's only for IPoIB.)
if [ ${#NICs[@]} -eq 0 ]; then
NICs=(`ibdev2netdev | cut -d '=' -f 1 | cut -d ' ' -f 1,3 | sed 's/ /:/'`)
fi
}


# Print statistics for given time interval. (Includes sleep.)
print_stats()
{
# read absolute rx/tx counters of given interfaces (reuse from last round if possible)
if [ ${#STATS_B[@]} -eq 0 ]; then
for (( i=0; i < ${#NICs[@]}; i++ )) do
dev=$(echo ${NICs[$i]} | cut -d: -f1)
port=$(echo ${NICs[$i]} | cut -d: -f2)

STATS_A[$i]=$(cat /sys/class/infiniband/${dev}/ports/${port}/counters/port_rcv_data)
STATS_A[$i]+=" "
STATS_A[$i]+=$(cat /sys/class/infiniband/${dev}/ports/${port}/counters/port_xmit_data)
# STATS_A[$i] contains 2 space-separated values now: the absolute rx bytes and tx bytes
done
else
# reuse counters from last round
STATS_A=("${STATS_B[@]}")
fi

# wait for given time interval (seconds)
sleep $t;

# read absolute rx/tx counters again and print difference
for (( i=0; i < ${#NICs[@]}; i++ )) do
dev=$(echo ${NICs[$i]} | cut -d: -f1)
port=$(echo ${NICs[$i]} | cut -d: -f2)

STATS_B[$i]=$(cat /sys/class/infiniband/${dev}/ports/${port}/counters/port_rcv_data)
STATS_B[$i]+=" "
STATS_B[$i]+=$(cat /sys/class/infiniband/${dev}/ports/${port}/counters/port_xmit_data)

A=(${STATS_A[$i]})
B=(${STATS_B[$i]})

# port_*_data counters text from IB standard doc: "Total number of data octets, divided by 4,
# transmitted on all VLs during the sampling interval."
# Note that the 4 here is fixed, independent of the "active_width" of a link.
rate_multiplier=4
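# Worked example (illustrative numbers): a counter delta of 262144 over t=1s corresponds to
# 262144 * 4 / (1024 * 1024) = 1 MiB/s.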

RX=$(( $rate_multiplier * (${B[0]}-${A[0]}) / (1024 * 1024 * $t) ))
TX=$(( $rate_multiplier * (${B[1]}-${A[1]}) / (1024 * 1024 * $t) ))

if [ $multiline -gt 0 ]; then
echo "${NICs[$i]} RX MiB/s: $RX"
echo "${NICs[$i]} TX MiB/s: $TX"
else
printf '%-8s | MiB/s | RX %6s | TX %6s\n' ${NICs[$i]} $RX $TX
fi

done
}


parse_args "$@"

SECONDS=0 # automatically incremented by bash

while true; do
stats_out=$(print_stats)

if [ $quit -gt 0 ]; then
echo "$stats_out"
exit 0
fi

echo "--- ${SECONDS}s ---"
echo "$stats_out"
done

+13 -0  nvidia-code/ib_speed_stat.sh

@@ -0,0 +1,13 @@

#!/bin/bash
while :
do
ibdata=`perfquery -r | awk -F"." '/XmitData/{print $NF}'`

echo $ibdata
ibdatam=$(( $ibdata * 4 / 1024 / 1024 ))
echo $ibdatam MB

sleep 1
done


+12 -0  nvidia-code/images/Makefile

@@ -0,0 +1,12 @@
default: cases.png scaling-mp.png scaling-dp.png

# for some reason the size option to convert in scaling.tex doesn't work, manually do it after
cases.png scaling-mp.png scaling-dp.png: tables.tex
latex --shell-escape $<
convert tables-1.png -resize 650 cases.png
convert tables-2.png -resize 600 scaling-mp.png
convert tables-3.png -resize 350 scaling-dp.png

clean:
rm -rf *.aux *.log *.dvi *.ps
rm -rf tables-*.png

BIN  nvidia-code/images/cases.png (650 × 88, 12 KiB)

BIN  nvidia-code/images/scaling-dp.png (350 × 116, 13 KiB)

BIN  nvidia-code/images/scaling-mp.png (600 × 121, 22 KiB)

+40 -0  nvidia-code/images/tables.tex

@@ -0,0 +1,40 @@
\documentclass[multi,convert]{standalone}
\usepackage{multirow}
\standaloneenv{tabular}

\begin{document}

\begin{tabular}{cccccc}
Case & Hidden Size & Attention Heads & Layers & Parameters (billions) & Model Parallel Partitions \\
\hline
1B & 1920 & 15 & 24 & 1.16 & 1 \\
2B & 2304 & 18 & 30 & 2.03 & 2 \\
4B & 3072 & 24 & 36 & 4.24 & 4 \\
8B & 4096 & 32 & 42 & 8.67 & 8 \\
\end{tabular}

\begin{tabular}{cc|ccc|ccc}
& & \multicolumn{3}{c|}{\textbf{DGX-2 (V100) batch size 8}} & \multicolumn{3}{c}{\textbf{DGX-A100 batch size 16}} \\
\hline
\multirow{2}{*}{Case} & Number of & Iteration & \multirow{2}{*}{Scaling} & TeraFLOPs & Iteration & \multirow{2}{*}{Scaling} & TeraFLOPs \\
& GPUs & Time (ms) & & per GPU & Time (ms) & & per GPU \\
\hline
1B & 1 & 1121 & 100.0\% & 71.9 & 1076 & 100\% & 149.8 \\
2B & 2 & 1093 & 89.6\% & 64.2 & 1026 & 91.7\% & 136.8 \\
4B & 4 & 1238 & 82.5\% & 58.5 & 1162 & 84.5\% & 124.7 \\
8B & 8 & 1407 & 74.3\% & 52.2 & 1343 & 74.7\% & 109.3 \\
\end{tabular}

\begin{tabular}{cc|ccc}
& & \multicolumn{3}{c}{\textbf{DGX-A100 batch size 2048}} \\
\hline
\multirow{2}{*}{Case} & Number of & Iteration & \multirow{2}{*}{Scaling} & TeraFLOPs \\
& GPUs & Time (ms) & & per GPU \\
\hline
1B & 128 & 1153 & 93.3\% & 139.8 \\
2B & 256 & 1101 & 85.5\% & 127.5 \\
4B & 512 & 1242 & 79.0\% & 116.7 \\
8B & 1024 & 1380 & 72.7\% & 106.5 \\
\end{tabular}

\end{document}

+611 -0  nvidia-code/megatron/.ipynb_checkpoints/training-checkpoint.py

@@ -0,0 +1,611 @@
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Pretrain utilities."""

from datetime import datetime
import math
import sys
import torch
from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
from apex.optimizers import FusedAdam as Adam

from megatron import get_args
from megatron import get_timers
from megatron import get_tensorboard_writer
from megatron import mpu
from megatron import print_rank_0
from megatron.checkpointing import load_checkpoint
from megatron.checkpointing import save_checkpoint
from megatron.fp16 import FP16_Module
from megatron.fp16 import FP16_Optimizer
from megatron.initialize import initialize_megatron
from megatron.learning_rates import AnnealingLR
from megatron.model import DistributedDataParallel as LocalDDP
from megatron.model import get_params_for_weight_decay_optimization
from megatron.model.realm_model import ICTBertModel
from megatron.utils import check_adlr_autoresume_termination
from megatron.utils import make_data_loader
from megatron.utils import report_memory
import pynvml
pynvml.nvmlInit()


def check_cuda(rank=0):

cuda_index = rank%16
handle = pynvml.nvmlDeviceGetHandleByIndex(cuda_index)
meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
print_rank_0(f"cuda_ID={cuda_index},cuda_used={meminfo.used}")


def pretrain(train_valid_test_dataset_provider, model_provider,
forward_step_func, extra_args_provider=None, args_defaults={}):
"""Main training program.

This function will run the following in the order provided:
1) initialize Megatron.
2) setup model, optimizer and lr schedule using the model_provider.
3) call train_val_test_data_provider to get train/val/test datasets.
4) train the model using the forward_step_func.

Arguments:
train_valid_test_dataset_provider: a function that takes the size of
train/valid/test dataset and returns `train, valid, test` datasets.
model_provider: a function that returns a vanilla version of the
model. By vanilla we mean a simple model on cpu with no fp16 or ddp.
forward_step_func: a function that takes a `data iterator` and `model`,
and returns a `loss` scalar with a dictionary with key:values being
the info we would like to monitor during training, for example
`lm-loss: value`. We also require that this function add
`batch generator` to the timers class.
extra_args_provider: a function that takes a parser and adds arguments
to it. It is used for programs to add their own arguments.
args_defaults: a dictionary from argument-name to argument-value. It is used
to set already-parsed arguments.
"""

# Initialize and get arguments, timers, and Tensorboard writer.
initialize_megatron(extra_args_provider=extra_args_provider,
args_defaults=args_defaults)

args = get_args()
timers = get_timers()

# Model, optimizer, and learning rate.
timers('model and optimizer').start()
model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider)
timers('model and optimizer').stop()
print_rank_0("model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider)")
check_cuda()
# Data stuff.
timers('train/valid/test data iterators').start()
train_data_iterator, valid_data_iterator, test_data_iterator \
= build_train_valid_test_data_iterators(
train_valid_test_dataset_provider)
timers('train/valid/test data iterators').stop()

# Print setup timing.
print_rank_0('done with setups ...')
timers.log(['model and optimizer', 'train/valid/test data iterators'])
print_rank_0('training ...')

iteration = 0
if args.do_train and args.train_iters > 0:
iteration = train(forward_step_func,
model, optimizer, lr_scheduler,
train_data_iterator, valid_data_iterator)

if args.do_valid:
prefix = 'the end of training for val data'
evaluate_and_print_results(prefix, forward_step_func,
valid_data_iterator, model,
iteration, False)

if args.save and iteration != 0:
save_checkpoint(iteration, model, optimizer, lr_scheduler)

if args.do_test:
# Run on test data.
prefix = 'the end of training for test data'
evaluate_and_print_results(prefix, forward_step_func,
test_data_iterator, model,
0, True)


def get_model(model_provider_func):
"""Build the model."""
args = get_args()

# Build model on cpu.
model = model_provider_func()

# Print number of parameters.
if mpu.get_data_parallel_rank() == 0:
print(' > number of parameters on model parallel rank {}: {}'.format(
mpu.get_model_parallel_rank(),
sum([p.nelement() for p in model.parameters()])), flush=True)

# GPU allocation.
model.cuda(torch.cuda.current_device())
print_rank_0("training.py : model.cuda(torch.cuda.current_device())")
check_cuda()
# Fp16 conversion.
if args.fp16:
model = FP16_Module(model)
print_rank_0("training.py : model = FP16_Module(model)")
check_cuda()
# Wrap model for distributed training.
if args.DDP_impl == 'torch':
i = torch.cuda.current_device()
model = torchDDP(model, device_ids=[i], output_device=i,
process_group=mpu.get_data_parallel_group())
return model
if args.DDP_impl == 'local':
model = LocalDDP(model)
return model

raise NotImplementedError('Unknown DDP implementation specified: {}. '
'Exiting.'.format(args.DDP_impl))


def get_optimizer(model):
"""Set up the optimizer."""
args = get_args()

# Build parameter groups (weight decay and non-decay).
while isinstance(model, (torchDDP, LocalDDP, FP16_Module)):
model = model.module
param_groups = get_params_for_weight_decay_optimization(model)

# Add model parallel attribute if it is not set.
for param_group in param_groups:
for param in param_group['params']:
if not hasattr(param, 'model_parallel'):
param.model_parallel = False

# Use Adam.
optimizer = Adam(param_groups, lr=args.lr, weight_decay=args.weight_decay,
betas=(args.adam_beta1, args.adam_beta2), eps=args.adam_eps)

# Wrap into fp16 optimizer.
if args.fp16:
optimizer = FP16_Optimizer(optimizer,
static_loss_scale=args.loss_scale,
dynamic_loss_scale=args.dynamic_loss_scale,
dynamic_loss_args={
'scale_window': args.loss_scale_window,
'min_scale': args.min_scale,
'delayed_shift': args.hysteresis})

return optimizer


def get_learning_rate_scheduler(optimizer):
"""Build the learning rate scheduler."""
args = get_args()

# Add linear learning rate scheduler.
if args.lr_decay_iters is not None:
num_iters = args.lr_decay_iters
else:
num_iters = args.train_iters
num_iters = max(1, num_iters)
init_step = 0
warmup_iter = args.warmup * num_iters
lr_scheduler = AnnealingLR(
optimizer,
start_lr=args.lr,
warmup_iter=warmup_iter,
total_iters=num_iters,
decay_style=args.lr_decay_style,
last_iter=init_step,
min_lr=args.min_lr,
use_checkpoint_lr_scheduler=args.use_checkpoint_lr_scheduler,
override_lr_scheduler=args.override_lr_scheduler)

return lr_scheduler


def setup_model_and_optimizer(model_provider_func):
"""Setup model and optimizer."""
args = get_args()

model = get_model(model_provider_func)
optimizer = get_optimizer(model)
lr_scheduler = get_learning_rate_scheduler(optimizer)

if args.load is not None:
args.iteration = load_checkpoint(model, optimizer, lr_scheduler)
else:
args.iteration = 0

# get model without FP16 and/or TorchDDP wrappers
unwrapped_model = model
while hasattr(unwrapped_model, 'module'):
unwrapped_model = unwrapped_model.module

if args.iteration == 0 and hasattr(unwrapped_model, 'init_state_dict_from_bert'):
print("Initializing ICT from pretrained BERT model", flush=True)
unwrapped_model.init_state_dict_from_bert()

return model, optimizer, lr_scheduler


def backward_step(optimizer, model, loss):
"""Backward step."""
args = get_args()
timers = get_timers()

# Backward pass.
timers('backward-backward').start()
optimizer.zero_grad(set_grads_to_None=True)
if args.fp16:
optimizer.backward(loss, update_master_grads=False)
else:
loss.backward()
timers('backward-backward').stop()

# All-reduce if needed.
if args.DDP_impl == 'local':
timers('backward-allreduce').start()
model.allreduce_params(reduce_after=False,
fp32_allreduce=args.fp32_allreduce)
timers('backward-allreduce').stop()

# Update master gradients.
timers('backward-master-grad').start()
if args.fp16:
optimizer.update_master_grads()
timers('backward-master-grad').stop()

# Clipping gradients helps prevent the exploding gradient.
timers('backward-clip-grad').start()
if args.clip_grad > 0:
if not args.fp16:
mpu.clip_grad_norm(model.parameters(), args.clip_grad)
else:
optimizer.clip_master_grads(args.clip_grad)
timers('backward-clip-grad').stop()


def train_step(forward_step_func, data_iterator,
model, optimizer, lr_scheduler):
"""Single training step."""
args = get_args()
timers = get_timers()

# Forward model for one step.
timers('forward').start()
loss, loss_reduced = forward_step_func(data_iterator, model)
timers('forward').stop()
print_rank_0("loss, loss_reduced = forward_step_func(data_iterator, model)")
check_cuda()
# Calculate gradients, reduce across processes, and clip.
timers('backward').start()
backward_step(optimizer, model, loss)
timers('backward').stop()
print_rank_0("backward_step(optimizer, model, loss)")
check_cuda()
# Update parameters.
timers('optimizer').start()
optimizer.step()
timers('optimizer').stop()
print_rank_0("optimizer.step()")
check_cuda()
# Update learning rate.
skipped_iter = 0
if not (args.fp16 and optimizer.overflow):
lr_scheduler.step()
else:
skipped_iter = 1

return loss_reduced, skipped_iter


def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
loss_scale, report_memory_flag, skipped_iter):
"""Log training information such as losses, timing, ...."""
args = get_args()
timers = get_timers()
writer = get_tensorboard_writer()

# Update losses.
skipped_iters_key = 'skipped iterations'
total_loss_dict[skipped_iters_key] = total_loss_dict.get(
skipped_iters_key, 0) + skipped_iter
got_nan_key = 'got nan'

got_nan = False
for key in loss_dict:
if not skipped_iter:
total_loss_dict[key] = total_loss_dict.get(
key, torch.cuda.FloatTensor([0.0])) + loss_dict[key]
else:
value = loss_dict[key].float().sum().item()
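            # NaN is the only float that compares unequal to itself; also flag +/-inf.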
is_nan = value == float('inf') or \
value == -float('inf') or \
value != value
got_nan = got_nan or is_nan

total_loss_dict[got_nan_key] = total_loss_dict.get(
got_nan_key, 0) + int(got_nan)

# Logging.
timers_to_log = []

def add_to_logging(name):
if name in timers.timers:
timers_to_log.append(name)
add_to_logging('forward')
add_to_logging('backward')
add_to_logging('backward-backward')
add_to_logging('backward-allreduce')
add_to_logging('backward-master-grad')
add_to_logging('backward-clip-grad')
add_to_logging('optimizer')
add_to_logging('batch generator')

# Tensorboard values.
if writer and torch.distributed.get_rank() == 0:
writer.add_scalar('learning_rate', learning_rate, iteration)
for key in loss_dict:
writer.add_scalar(key, loss_dict[key], iteration)
if args.fp16:
writer.add_scalar('loss_scale', loss_scale, iteration)
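        # Normalize timer values by the iterations elapsed so far in the current logging window.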
normalizer = iteration % args.log_interval
if normalizer == 0:
normalizer = args.log_interval
timers.write(timers_to_log, writer, iteration,
normalizer=normalizer)

if iteration % args.log_interval == 0:
elapsed_time = timers('interval time').elapsed()
if writer and torch.distributed.get_rank() == 0:
writer.add_scalar('iteration_time',
elapsed_time / args.log_interval, iteration)
log_string = ' iteration {:8d}/{:8d} |'.format(iteration,
args.train_iters)
log_string += ' elapsed time per iteration (ms): {:.1f} |'.format(
elapsed_time * 1000.0 / args.log_interval)
log_string += ' learning rate: {:.3E} |'.format(learning_rate)
num_iterations = max(
1, args.log_interval - total_loss_dict[skipped_iters_key])
for key in total_loss_dict:
if key not in [skipped_iters_key, got_nan_key]:
avg = total_loss_dict[key].item() / float(num_iterations)
if avg > 0.0:
log_string += ' {}: {:.6E} |'.format(key, avg)
total_loss_dict[key] = torch.cuda.FloatTensor([0.0])
if args.fp16:
log_string += ' loss scale: {:.1f} |'.format(loss_scale)
log_string += ' number of skipped iterations: {:3d} |'.format(
total_loss_dict[skipped_iters_key])
log_string += ' number of nan iterations: {:3d} |'.format(
total_loss_dict[got_nan_key])
total_loss_dict[skipped_iters_key] = 0
total_loss_dict[got_nan_key] = 0
print_rank_0(log_string)
if report_memory_flag:
report_memory('after {} iterations'.format(iteration))
report_memory_flag = False
timers.log(timers_to_log, normalizer=args.log_interval)

return report_memory_flag


def train(forward_step_func, model, optimizer, lr_scheduler,
train_data_iterator, valid_data_iterator):
"""Train the model function."""
args = get_args()
timers = get_timers()

# Turn on training mode which enables dropout.
model.train()

# Tracking loss.
total_loss_dict = {}

# Iterations.
iteration = args.iteration

timers('interval time').start()
report_memory_flag = True
while iteration < args.train_iters:
loss_dict, skipped_iter = train_step(forward_step_func,
train_data_iterator,
model,
optimizer,
lr_scheduler)
iteration += 1

# Logging.
loss_scale = None
if args.fp16:
loss_scale = optimizer.loss_scale
report_memory_flag = training_log(loss_dict, total_loss_dict,
optimizer.param_groups[0]['lr'],
iteration, loss_scale,
report_memory_flag, skipped_iter)
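        # Force the flag back to True so memory usage is reported at every logging interval,
        # overriding the value returned by training_log.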
report_memory_flag = True
# Autoresume
if args.adlr_autoresume and \
(iteration % args.adlr_autoresume_interval == 0):
check_adlr_autoresume_termination(iteration, model, optimizer,
lr_scheduler)

# Checkpointing
if args.save and args.save_interval and \
iteration % args.save_interval == 0:
save_checkpoint(iteration, model, optimizer, lr_scheduler)

# Evaluation
if args.eval_interval and iteration % args.eval_interval == 0 and \
args.do_valid:
prefix = 'iteration {}'.format(iteration)
evaluate_and_print_results(prefix, forward_step_func,
valid_data_iterator, model,
iteration, False)

if args.exit_interval and iteration % args.exit_interval == 0:
torch.distributed.barrier()
time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
rank = torch.distributed.get_rank()
print_rank_0('rank: {} | time: {} | exiting the program at '
'iteration {}'.format(rank, time_str, iteration))
sys.exit()

return iteration


def evaluate(forward_step_func, data_iterator, model, verbose=False):
"""Evaluation."""
args = get_args()

# Turn on evaluation mode which disables dropout.
model.eval()

total_loss_dict = {}

with torch.no_grad():
iteration = 0
while iteration < args.eval_iters:
iteration += 1
if verbose and iteration % args.log_interval == 0:
print_rank_0('Evaluating iter {}/{}'.format(iteration,
args.eval_iters))
# Forward evaluation.
_, loss_dict = forward_step_func(data_iterator, model)
            # Accumulate losses (already reduced across processes by forward_step_func).
for key in loss_dict:
total_loss_dict[key] = total_loss_dict.get(key, 0.) + \
loss_dict[key]
    # Move the model back to train mode.
model.train()

for key in total_loss_dict:
total_loss_dict[key] /= args.eval_iters

return total_loss_dict


def evaluate_and_print_results(prefix, forward_step_func,
data_iterator, model,
iteration, verbose=False):
"""Helper function to evaluate and dump results on screen."""
writer = get_tensorboard_writer()

total_loss_dict = evaluate(forward_step_func, data_iterator, model, verbose)
string = ' validation loss at {} | '.format(prefix)
for key in total_loss_dict:
string += '{} value: {:.6E} | '.format(key, total_loss_dict[key].item())
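        # Clamp the loss at 20 before exponentiating so the reported perplexity cannot overflow.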
ppl = math.exp(min(20, total_loss_dict[key].item()))
string += '{} PPL: {:.6E} | '.format(key, ppl)
if writer and torch.distributed.get_rank() == 0:
writer.add_scalar('{} value'.format(key),
total_loss_dict[key].item(),
iteration)
writer.add_scalar('{} ppl'.format(key), ppl, iteration)

length = len(string) + 1
print_rank_0('-' * length)
print_rank_0(string)
print_rank_0('-' * length)


def build_train_valid_test_data_iterators(
build_train_valid_test_datasets_provider):
"""XXX"""
args = get_args()

(train_dataloader, valid_dataloader, test_dataloader) = (None, None, None)

print_rank_0('> building train, validation, and test datasets ...')
    # Build data loaders only on rank 0 of each model parallel group.
if mpu.get_model_parallel_rank() == 0:
# Rank, size, and global batch size.
data_parallel_size = mpu.get_data_parallel_world_size()
global_batch_size = args.batch_size * data_parallel_size

# Number of train/valid/test samples.
train_iters = args.train_iters
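        # Enough validation samples for one eval run every eval_interval iterations, plus one extra run.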
eval_iters = (train_iters // args.eval_interval + 1) * args.eval_iters
test_iters = args.eval_iters
train_val_test_num_samples = [train_iters * global_batch_size,
eval_iters * global_batch_size,
test_iters * global_batch_size]
print_rank_0(' > datasets target sizes (minimum size):')
print_rank_0(' train: {}'.format(train_val_test_num_samples[0]))
print_rank_0(' validation: {}'.format(train_val_test_num_samples[1]))
print_rank_0(' test: {}'.format(train_val_test_num_samples[2]))

# Build the datasets.
train_ds, valid_ds, test_ds = build_train_valid_test_datasets_provider(
train_val_test_num_samples)

        # Build dataloaders.
train_dataloader = make_data_loader(train_ds)
valid_dataloader = make_data_loader(valid_ds)
test_dataloader = make_data_loader(test_ds)

        # Flags indicating whether we need to do training/validation/testing.
do_train = train_dataloader is not None and args.train_iters > 0
do_valid = valid_dataloader is not None and args.eval_iters > 0
do_test = test_dataloader is not None and args.eval_iters > 0
        # Pack the do_train/do_valid/do_test flags for broadcasting to the other model parallel ranks.
flags = torch.cuda.LongTensor(
[int(do_train), int(do_valid), int(do_test)])
else:
flags = torch.cuda.LongTensor([0, 0, 0])

    # Broadcast the flags from rank 0 of each model parallel group.
torch.distributed.broadcast(flags,
mpu.get_model_parallel_src_rank(),
group=mpu.get_model_parallel_group())
args.do_train = flags[0].item()
args.do_valid = flags[1].item()
args.do_test = flags[2].item()

    # Shift the start iterations when resuming from a checkpoint.
if train_dataloader is not None:
train_dataloader.batch_sampler.start_iter = args.iteration % \
len(train_dataloader)
print_rank_0('setting training data start iteration to {}'.
format(train_dataloader.batch_sampler.start_iter))
if valid_dataloader is not None:
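        # Validation batches already consumed by evaluation runs completed before this checkpoint.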
start_iter_val = (args.iteration // args.eval_interval) * \
args.eval_iters
valid_dataloader.batch_sampler.start_iter = start_iter_val % \
len(valid_dataloader)
print_rank_0('setting validation data start iteration to {}'.
format(valid_dataloader.batch_sampler.start_iter))

# Build iterators.
if train_dataloader is not None:
train_data_iterator = iter(train_dataloader)
else:
train_data_iterator = None

if valid_dataloader is not None:
valid_data_iterator = iter(valid_dataloader)