MNIST with TensorFlow using MPI through Horovod

This toy example shows how to do distributed training with TensorFlow and Horovod. Since Jean-Zay nodes communicate through MPI and NCCL, synchronous data-parallel training is expected to scale well.
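
In practice, Horovod only asks for a handful of changes to a single-GPU TensorFlow 1.x script: initialize Horovod, pin each process to one GPU, scale the learning rate by the number of workers, wrap the optimizer so gradients are averaged across processes, and broadcast the initial variables from rank 0. Here is a minimal sketch of that pattern (the full, runnable script is listed further down):

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()  # one MPI process per GPU

# Pin this process to its local GPU.
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())

# Scale the learning rate by the number of workers and wrap the optimizer
# so that gradients are averaged across all processes.
opt = hvd.DistributedOptimizer(tf.train.AdamOptimizer(0.001 * hvd.size()))

# Broadcast the initial variable states from rank 0 so all workers start identical.
hooks = [hvd.BroadcastGlobalVariablesHook(0)]

with tf.train.MonitoredTrainingSession(hooks=hooks, config=config) as sess:
    pass  # run training steps here, as in the full example below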

Submit the job

A few things

Look at the Slurm script, tf_mpi_mnist.job. There are a few important things to note:

  • #SBATCH --partition=gpu_p1: We use the gpu_p1 partition, which has nodes with 4 GPUs each,
  • #SBATCH --ntasks-per-node=4: We set 4 MPI tasks per node: one task per GPU,
  • #SBATCH --cpus-per-task=10: Since nodes have 40 CPUs each, we ask Slurm to allocate a quarter of the node's cores to each task (i.e. 10),
  • #SBATCH --ntasks=32: We ask for a total of 32 MPI tasks. With 4 tasks per node, the job will span 8 nodes (a quick way to check the resulting task-to-GPU mapping is sketched right after this list).
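
A quick way to verify that Slurm and MPI give you the mapping you expect is to run a tiny Horovod "hello world" with the same srun --mpi=pmix line as in the job script. The sketch below is one way to do that check (the file name check_ranks.py is hypothetical); with --ntasks=32 and --ntasks-per-node=4 it should print 32 lines, with local ranks cycling through 0-3 on 8 distinct hostnames:

import socket

import horovod.tensorflow as hvd

hvd.init()

# Each MPI process reports its global rank, its rank within the node
# (which is also the index of the GPU it gets pinned to) and its hostname.
print("rank %d / %d, local rank %d, host %s"
      % (hvd.rank(), hvd.size(), hvd.local_rank(), socket.gethostname()))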

Enjoy

cd jean-zay-doc/docs/examples/tf/tf_mpi
sbatch tf_mpi_mnist.job

Code

The code for the distributed training (tf_mpi_mnist.py):

# Copyright 2019 Uber Technologies, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import os
import tensorflow as tf
import horovod.tensorflow as hvd
import numpy as np
import argparse

from tensorflow import keras

import gzip
from os.path import join

dir_path = join(os.environ['DSDIR'],'MNIST','raw')

filename = [
    ["training_images",join(dir_path, "train-images-idx3-ubyte.gz")],
    ["test_images",join(dir_path, "t10k-images-idx3-ubyte.gz")],
    ["training_labels",join(dir_path, "train-labels-idx1-ubyte.gz")],
    ["test_labels",join(dir_path, "t10k-labels-idx1-ubyte.gz")]
    ]

layers = tf.layers

tf.logging.set_verbosity(tf.logging.INFO)

# Training settings.
# Note: the --mnist argument is passed by the job script but is not used here;
# the data is read directly from $DSDIR instead.
parser = argparse.ArgumentParser(description='Tensorflow MNIST Example')
parser.add_argument('--mnist', help='location of mnist.npz (unused)', required=True)
args = parser.parse_args()

def load_mnist():
    """Read the raw MNIST gzip files from $DSDIR and return them as numpy arrays."""
    mnist = {}
    for name in filename[:2]:
        with gzip.open(name[1], 'rb') as f:
            mnist[name[0]] = np.frombuffer(f.read(), np.uint8, offset=16).reshape(-1,28*28)
    for name in filename[-2:]:
        with gzip.open(name[1], 'rb') as f:
            mnist[name[0]] = np.frombuffer(f.read(), np.uint8, offset=8)
    return  mnist

def conv_model(feature, target, mode):
    """2-layer convolution model."""
    # Convert the target to a one-hot tensor of shape (batch_size, 10),
    # with an on-value of 1 for each one-hot vector of length 10.
    target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0)

    # Reshape feature to a 4d tensor with the 2nd and 3rd dimensions being
    # image width and height, and the final dimension being the number of color channels.
    feature = tf.reshape(feature, [-1, 28, 28, 1])

    # First conv layer will compute 32 features for each 5x5 patch
    with tf.variable_scope('conv_layer1'):
        h_conv1 = layers.conv2d(feature, 32, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool1 = tf.nn.max_pool(
            h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    # Second conv layer will compute 64 features for each 5x5 patch.
    with tf.variable_scope('conv_layer2'):
        h_conv2 = layers.conv2d(h_pool1, 64, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool2 = tf.nn.max_pool(
            h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        # reshape tensor into a batch of vectors
        h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])

    # Densely connected layer with 1024 neurons.
    h_fc1 = layers.dropout(
        layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu),
        rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN)

    # Compute logits (1 per class) and compute loss.
    logits = layers.dense(h_fc1, 10, activation=None)
    loss = tf.losses.softmax_cross_entropy(target, logits)

    return tf.argmax(logits, 1), loss


def train_input_generator(x_train, y_train, batch_size=64):
    """Yield shuffled mini-batches of (images, labels) indefinitely."""
    assert len(x_train) == len(y_train)
    while True:
        p = np.random.permutation(len(x_train))
        x_train, y_train = x_train[p], y_train[p]
        index = 0
        while index <= len(x_train) - batch_size:
            yield x_train[index:index + batch_size], \
                  y_train[index:index + batch_size],
            index += batch_size


def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Load MNIST dataset.
    dataset = load_mnist()
    x_train = dataset[filename[0][0]].reshape((60000, 28, 28))
    y_train = dataset[filename[2][0]].reshape((60000))
    x_test = dataset[filename[1][0]].reshape((10000, 28, 28))
    y_test = dataset[filename[3][0]].reshape((10000))

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on lr_scaler.
    lr_scaler = hvd.size()
    opt = tf.train.AdamOptimizer(0.001 * lr_scaler)

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train,
                                                     y_train, batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})


if __name__ == "__main__":
    tf.app.run()
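
With the resources requested above, hvd.size() is 32 (8 nodes x 4 GPUs), so the two Horovod scaling rules in the script work out as in this small back-of-the-envelope check (plain Python, assuming a world size of 32):

# Assuming hvd.size() == 32, i.e. 8 nodes x 4 GPUs.
world_size = 32
scaled_lr = 0.001 * world_size          # 0.032, as in AdamOptimizer(0.001 * lr_scaler)
steps_per_worker = 20000 // world_size  # 625, as in StopAtStepHook(last_step=...)
print(scaled_lr, steps_per_worker)      # -> 0.032 625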

and the script to launch the job:

#!/bin/bash
# This job will run 32 MPI processes on 8 nodes; each node hosts 4 MPI processes, each one pinned to a GPU.
#SBATCH --job-name=mnist_tf_mpi     # job name
#SBATCH --partition=gpu_p1          # nodes on gpu_p1 have 4 GPUs each
#SBATCH --ntasks=32                 # total number of MPI tasks
#SBATCH --ntasks-per-node=4         # number of MPI tasks per node
#SBATCH --gres=gpu:4                # number of GPUs per node
#SBATCH --cpus-per-task=10          # nodes have 40 CPUs, so we tell Slurm to allocate 1/4 of the node resources (i.e. 10)
#SBATCH --distribution=block:block  # task distribution; contiguous blocks are usually better
#SBATCH --time=00:01:00             # job length
#SBATCH --output=mnist_tf_mpi_log_%j.out  # std out
#SBATCH --error=mnist_tf_mpi_log_%j.out   # std err
#SBATCH --exclusive                 # we reserve entire nodes for our job
#SBATCH -A changeme@gpu

cd ${SLURM_SUBMIT_DIR}

# Modules (03/2020)
module purge
module load cudnn/10.1-v7.5.1.10
module load nccl/2.4.2-1+cuda10.1
module load tensorflow-gpu/py3/1.14-openmpi

# Show commands
set -x

# Execution
srun --mpi=pmix python tf_mpi_mnist.py --mnist $PWD/mnist.npz