First of all, thank you for developing Accelerate! I'm new to it, but I already love it; it's a great framework.
I used the code below to train a naive model on MNIST with 3 GPUs (on a single node/machine). The code uses an Accuracy class to compute the epoch-wise accuracy from predictions and labels: it stores the predictions/labels at each step so that the accuracy (or another metric, such as ROC AUC) can be computed once at the end of the epoch. At each step, accelerator.gather() is used to gather the predictions/labels from all GPU devices. I added a print statement in the Accuracy class to check the number of samples used to compute the accuracy. The MNIST test set contains 10,000 samples, but the print statement in this class shows that 10,176 samples are actually used.
Full code:
from __future__ import print_function
import argparse
import os
import os.path
import threading
from functools import partial
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from accelerate import Accelerator
from sklearn.metrics import accuracy_score
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from tqdm.auto import tqdm as original_tqdm
class Accuracy:
    """Accuracy score."""

    def __init__(self):
        super().__init__()
        self.__build()

    def __build(self):
        self._lock = threading.Lock()
        self._predictions = []
        self._targets = []

    def reset(self):
        self._predictions.clear()
        self._targets.clear()

    def update(self, output):
        y_pred, y_true = output
        with self._lock:
            self._predictions.append(y_pred)
            self._targets.append(y_true)

    def compute(self):
        with self._lock:
            predictions = torch.cat(self._predictions, dim=0).numpy()
            targets = torch.cat(self._targets, dim=0).numpy()
            print(f'Shapes: predictions {predictions.shape}, targets {targets.shape}')
            return accuracy_score(y_true=targets, y_pred=predictions)
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--per_device_eval_batch_size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='The per-device batch size to use for evaluation.')
    parser.add_argument('--per_device_train_batch_size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='The per-device batch size to use for training.')
    parser.add_argument('--epochs',
                        type=int,
                        default=5,
                        metavar='N',
                        help='number of epochs to train (default: 5)')
    parser.add_argument('--lr',
                        type=float,
                        default=1.0,
                        metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.7,
                        metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval',
                        type=int,
                        default=10,
                        metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--out_dir',
                        type=str,
                        help='Path where the trained model will be saved (if not None).')
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    accelerator = Accelerator()
    _is_local_main_process = accelerator.is_local_main_process
    tqdm = partial(original_tqdm, disable=not _is_local_main_process, position=0)

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    # TRAIN AND TEST DATASETS/DATALOADERS
    train_transforms = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    test_transforms = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    with accelerator.main_process_first():
        # We only want to download MNIST data on rank 0
        train_dataset = datasets.MNIST(os.environ['DSDIR'],
                                       train=True,
                                       download=True,
                                       transform=train_transforms)
        print(f'Length of training dataset: {len(train_dataset)}')
        test_dataset = datasets.MNIST(os.environ['DSDIR'],
                                      download=True,
                                      train=False,
                                      transform=test_transforms)
        print(f'Length of test dataset: {len(test_dataset)}')

    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=args.per_device_train_batch_size,
                                  shuffle=True,
                                  **kwargs)
    test_dataloader = DataLoader(dataset=test_dataset,
                                 batch_size=args.per_device_eval_batch_size,
                                 shuffle=True,
                                 **kwargs)

    model = Net().to(accelerator.device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)

    model, optimizer, train_dataloader, test_dataloader = accelerator.prepare(model, optimizer, train_dataloader, test_dataloader)
    def evaluate(_model, _device, _test_loader, _epoch):
        _model.eval()
        test_losses = []
        test_accuracy = Accuracy()
        example_images = []
        with torch.no_grad():
            for data, target in tqdm(_test_loader, desc=f'eval (epoch {_epoch:03d})'):
                data, target = data.to(_device), target.to(_device)
                output = _model(data)
                loss = F.nll_loss(output, target, reduction='sum')
                test_losses.append(accelerator.gather(loss))
                preds = output.argmax(dim=1, keepdim=True)
                test_accuracy.update((accelerator.gather(preds).detach().cpu(),
                                      accelerator.gather(target).detach().cpu()))
        test_loss = torch.sum(torch.cat(test_losses)) / len(_test_loader.dataset)
        test_acc = test_accuracy.compute()
        test_accuracy.reset()
        return test_acc

    def train_one_epoch(_args, _model, _device, _train_loader, _optimizer, _epoch):
        _model.train()
        for step, batch in enumerate(tqdm(_train_loader, desc=f'train (epoch {_epoch:03d})')):
            data, target = batch
            data, target = data.to(_device), target.to(_device)
            _optimizer.zero_grad()
            output = _model(data)
            loss = F.nll_loss(output, target)
            accelerator.backward(loss)
            _optimizer.step()
    # TRAINING
    for epoch in range(1, args.epochs + 1):
        train_one_epoch(args, model, accelerator.device, train_dataloader, optimizer, epoch)
        eval_accuracy = evaluate(model, accelerator.device, test_dataloader, epoch)
        if _is_local_main_process:
            print(f'Epoch {epoch:02d} / Eval accuracy = {eval_accuracy}')
        scheduler.step()

    # SAVE TRAINED MODEL (OPTIONAL)
    if _is_local_main_process and args.out_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save(args.out_dir, save_function=accelerator.save)


if __name__ == '__main__':
    main()
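For context, the script runs as 3 processes on a single node. With the standard Accelerate launcher, the command would be along the lines of the following (mnist_accelerate.py is only a placeholder for the script name, not the actual file name):
accelerate launch --multi_gpu --num_processes 3 mnist_accelerate.py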
Output of the print statement after the 1st epoch:
Shapes: predictions (10176, 1), targets (10176,)
Why is there a mismatch between the expected number of samples (10,000) and the actual number of samples (10,176)?
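For what it's worth, 10,176 looks consistent with every evaluation step gathering a full per-device batch from each of the 3 processes. A back-of-the-envelope check (assuming 3 processes and the default per-device eval batch size of 64 used above):
import math

num_processes = 3        # 3 GPUs on a single node
per_device_batch = 64    # default --per_device_eval_batch_size
samples_per_step = num_processes * per_device_batch  # 192 samples gathered per eval step
num_steps = math.ceil(10_000 / samples_per_step)     # 53 steps to cover the 10,000 test samples
print(num_steps * samples_per_step)                  # 10176, which matches the printed shape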