Adding support for LaidOutVariables #9

Open · wants to merge 7 commits into base: max_hvd
26 changes: 26 additions & 0 deletions examples/job_test_horovod.job
@@ -0,0 +1,26 @@
#!/bin/bash
#SBATCH --job-name=test_horovod       # job name
##SBATCH --partition=gpu_p2           # uncomment for the gpu_p2 partition
#SBATCH --ntasks=4                    # total number of MPI tasks (= total number of GPUs)
#SBATCH --ntasks-per-node=4           # number of MPI tasks per node (= number of GPUs per node)
#SBATCH --gres=gpu:4                  # number of GPUs per node (max 8 with gpu_p2)
#SBATCH --cpus-per-task=10            # number of CPU cores per task (a quarter of the node here)
##SBATCH --cpus-per-task=3            # number of CPU cores per task (for gpu_p2: 1/8 of the node)
# /!\ Note: "multithread" refers to hyperthreading in Slurm terminology
#SBATCH --hint=nomultithread          # hyperthreading disabled
#SBATCH --time=00:10:00               # maximum requested run time (HH:MM:SS)
#SBATCH --output=test_horovod%j.out   # output file name
#SBATCH --error=test_horovod%j.err    # error file name (here shared with the output)
#SBATCH -A ftb@gpu                    # specify the project
#SBATCH --qos=qos_gpu-dev             # using the dev queue, as this is only for profiling

# purge modules loaded interactively and inherited by default
module purge

# load the required modules
module load tensorflow-gpu/py3/2.4.1+nccl-2.8.3-1

# echo the commands being run
set -x

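# run the code with GPU binding via bind_gpu.sh: 1 GPU per MPI task (same setup as the other job scripts in this PR)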
srun --unbuffered --mpi=pmi2 /gpfslocalsup/pub/idrtools/bind_gpu.sh python -u test_horovod.py
99 changes: 99 additions & 0 deletions examples/tob_examples/demo.py
@@ -0,0 +1,99 @@
import numpy as np
import os
import math
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import mesh_tensorflow as mtf
from mesh_tensorflow.hvd_simd_mesh_impl import HvdSimdMeshImpl
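# HvdSimdMeshImpl (from this fork) is assumed to implement the mesh in SPMD fashion:
# one process per GPU, with Horovod collectives handling communication across the mesh.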

tf.flags.DEFINE_integer("gpus_per_node", 4, "Number of GPUs on each node")
tf.flags.DEFINE_integer("gpus_per_task", 4, "Number of GPUs in each task")
tf.flags.DEFINE_integer("tasks_per_node", 1, "Number of tasks on each node")
tf.flags.DEFINE_integer("nx", 2, "number of slices along x dim")
tf.flags.DEFINE_integer("ny", 2, "number of slices along y dim")
tf.flags.DEFINE_integer("nz", 1, "number of slices along z dim")
tf.flags.DEFINE_integer("nc", 128, "Size of data cube")
tf.flags.DEFINE_integer("batch_size", 1, "Batch Size")
FLAGS = tf.flags.FLAGS

def model_fn(nc=64, batch_size=1):
    """
    Example of a function implementing a small 3D CNN and returning a mesh tensor.
    """
    # Create the mesh TF graph
    graph = mtf.Graph()
    mesh = mtf.Mesh(graph, "my_mesh")
    # Define the named dimensions
    n_block_x = FLAGS.nx  # 2
    n_block_y = FLAGS.ny  # 2
    n_block_z = FLAGS.nz  # 1
    batch_dim = mtf.Dimension("batch", batch_size)
    nx_dim = mtf.Dimension('nx_block', n_block_x)
    ny_dim = mtf.Dimension('ny_block', n_block_y)
    nz_dim = mtf.Dimension('nz_block', n_block_z)
    sx_dim = mtf.Dimension('sx_block', nc//n_block_x)
    sy_dim = mtf.Dimension('sy_block', nc//n_block_y)
    sz_dim = mtf.Dimension('sz_block', nc//n_block_z)
    image_c_dim = mtf.Dimension('image_c', 3)
    hidden_dim = mtf.Dimension('h', 128)
    # Create some input data
    data = mtf.random_uniform(mesh, [batch_dim, nx_dim, ny_dim, nz_dim,
                                     sx_dim, sy_dim, sz_dim,
                                     image_c_dim])
    net = mtf.layers.conv3d_with_blocks(data, hidden_dim,
        filter_size=(3, 3, 3), strides=(1, 1, 1), padding='SAME',
        d_blocks_dim=nx_dim, h_blocks_dim=ny_dim)
    net = mtf.reduce_sum(net, output_shape=[batch_dim, hidden_dim])
    return net


def main(_):
    """
    num_tasks = int(os.environ['SLURM_NTASKS'])
    print('num_tasks : ', num_tasks)
    # Resolve the cluster from the SLURM environment
    cluster = tf.distribute.cluster_resolver.SlurmClusterResolver({"mesh": num_tasks},
                                                                  port_base=8822,
                                                                  gpus_per_node=FLAGS.gpus_per_node,
                                                                  gpus_per_task=FLAGS.gpus_per_task,
                                                                  tasks_per_node=FLAGS.tasks_per_node)
    cluster_spec = cluster.cluster_spec()
    print(cluster_spec)
    # Create a server for all mesh members
    server = tf.distribute.Server(cluster_spec, "mesh", cluster.task_id)
    print(server)
    if cluster.task_id > 0:
        server.join()
    # Otherwise we are the main task, let's define the devices
    devices = ["/job:mesh/task:%d/device:GPU:%d" % (i, j)
               for i in range(cluster_spec.num_tasks("mesh"))
               for j in range(FLAGS.gpus_per_task)]
    print("List of devices", devices)
    """
    # Define the mesh structure
    mesh_shape = [("row", FLAGS.nx), ("col", FLAGS.ny)]
    layout_rules = [("nx_block", "row"), ("ny_block", "col")]
    # mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(mesh_shape, layout_rules, devices)
    mesh_impl = HvdSimdMeshImpl(mtf.convert_to_shape(mesh_shape),
                                mtf.convert_to_layout_rules(layout_rules))
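    # The mesh is a 2D grid of FLAGS.nx * FLAGS.ny processes; the layout rules shard the
    # nx_block and ny_block dimensions over the "row" and "col" mesh axes respectively.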
    # Create the computational graph
    net = model_fn(nc=FLAGS.nc, batch_size=FLAGS.batch_size)
    # Lower the mesh computation
    graph = net.graph
    mesh = net.mesh
    lowering = mtf.Lowering(graph, {mesh: mesh_impl})
    # Retrieve the output of the computation
    result = lowering.export_to_tf_tensor(net)
    # Perform some last processing in normal TensorFlow
    out = tf.reduce_mean(result)

    print("I'm about to enter the session...")
    with tf.Session() as sess:
        r = sess.run(out)
        print("output of computation", r)
    exit(0)


if __name__ == "__main__":
    tf.app.run(main=main)



35 changes: 35 additions & 0 deletions examples/tob_examples/job_demo.job
@@ -0,0 +1,35 @@
#!/bin/bash
#SBATCH --reservation=hackathon_idr27
#SBATCH --job-name=demo_variable      # job name
##SBATCH --partition=gpu_p2           # uncomment for the gpu_p2 partition
#SBATCH --ntasks=4                    # total number of MPI tasks (= total number of GPUs)
#SBATCH --ntasks-per-node=4           # number of MPI tasks per node (= number of GPUs per node)
#SBATCH --gres=gpu:4                  # number of GPUs per node (max 8 with gpu_p2)
#SBATCH --cpus-per-task=10            # number of CPU cores per task (a quarter of the node here)
##SBATCH --cpus-per-task=3            # number of CPU cores per task (for gpu_p2: 1/8 of the node)
# /!\ Note: "multithread" refers to hyperthreading in Slurm terminology
#SBATCH --hint=nomultithread          # hyperthreading disabled
#SBATCH --time=00:10:00               # maximum requested run time (HH:MM:SS)
#SBATCH --output=var_demo_test%j.out  # output file name
#SBATCH --error=var_demo_test%j.err   # error file name (here shared with the output)
#SBATCH -A ftb@gpu                    # specify the project
#SBATCH --qos=qos_gpu-dev             # using the dev queue, as this is only for profiling

# purge modules loaded interactively and inherited by default
module purge

# load the required modules
module load tensorflow-gpu/py3/2.4.1+nccl-2.8.3-1

# echo the commands being run
set -x

# JZ FIX
#export TMPDIR=$JOBSCRATCH
#ln -s $JOBSCRATCH /tmp/nvidia
# run the code with GPU binding via bind_gpu.sh: 1 GPU per MPI task.

srun --unbuffered --mpi=pmi2 /gpfslocalsup/pub/idrtools/bind_gpu.sh python -u demo.py --nc=128 --batch_size=1



30 changes: 30 additions & 0 deletions examples/tob_examples/job_optim_demo.job
@@ -0,0 +1,30 @@
#!/bin/bash
#SBATCH --job-name=demo_optim_var     # job name
##SBATCH --partition=gpu_p2           # uncomment for the gpu_p2 partition
#SBATCH --ntasks=4                    # total number of MPI tasks (= total number of GPUs)
#SBATCH --ntasks-per-node=4           # number of MPI tasks per node (= number of GPUs per node)
#SBATCH --gres=gpu:4                  # number of GPUs per node (max 8 with gpu_p2)
#SBATCH --cpus-per-task=10            # number of CPU cores per task (a quarter of the node here)
##SBATCH --cpus-per-task=3            # number of CPU cores per task (for gpu_p2: 1/8 of the node)
# /!\ Note: "multithread" refers to hyperthreading in Slurm terminology
#SBATCH --hint=nomultithread          # hyperthreading disabled
#SBATCH --time=00:10:00               # maximum requested run time (HH:MM:SS)
#SBATCH --output=var_optim_demo_test%j.out   # output file name
#SBATCH --error=var_optim_demo_test%j.err    # error file name (here shared with the output)
#SBATCH -A ftb@gpu                    # specify the project
#SBATCH --qos=qos_gpu-dev             # using the dev queue, as this is only for profiling

# purge modules loaded interactively and inherited by default
module purge

# load the required modules
module load tensorflow-gpu/py3/2.4.1+cuda-11.2

# echo the commands being run
set -x

# run the code with GPU binding via bind_gpu.sh: 1 GPU per MPI task.
srun --unbuffered --mpi=pmi2 -o log_%t.txt /gpfslocalsup/pub/idrtools/bind_gpu.sh python -u optim_demo.py --nc=8 --batch_size=1



35 changes: 35 additions & 0 deletions examples/tob_examples/job_simple_demo.job
@@ -0,0 +1,35 @@
#!/bin/bash
#SBATCH --reservation=hackathon_idr27
#SBATCH --job-name=demo_simple_var    # job name
##SBATCH --partition=gpu_p2           # uncomment for the gpu_p2 partition
#SBATCH --ntasks=4                    # total number of MPI tasks (= total number of GPUs)
#SBATCH --ntasks-per-node=4           # number of MPI tasks per node (= number of GPUs per node)
#SBATCH --gres=gpu:4                  # number of GPUs per node (max 8 with gpu_p2)
#SBATCH --cpus-per-task=10            # number of CPU cores per task (a quarter of the node here)
##SBATCH --cpus-per-task=3            # number of CPU cores per task (for gpu_p2: 1/8 of the node)
# /!\ Note: "multithread" refers to hyperthreading in Slurm terminology
#SBATCH --hint=nomultithread          # hyperthreading disabled
#SBATCH --time=00:10:00               # maximum requested run time (HH:MM:SS)
#SBATCH --output=var_simple_demo_test%j.out  # output file name
#SBATCH --error=var_simple_demo_test%j.err   # error file name (here shared with the output)
#SBATCH -A ftb@gpu                    # specify the project
#SBATCH --qos=qos_gpu-dev             # using the dev queue, as this is only for profiling

# purge modules loaded interactively and inherited by default
module purge

# load the required modules
module load tensorflow-gpu/py3/2.4.1+nccl-2.8.3-1

# echo the commands being run
set -x

# JZ FIX
#export TMPDIR=$JOBSCRATCH
#ln -s $JOBSCRATCH /tmp/nvidia
# run the code with GPU binding via bind_gpu.sh: 1 GPU per MPI task.

srun --unbuffered --mpi=pmi2 /gpfslocalsup/pub/idrtools/bind_gpu.sh python -u simple_demo.py --nc=8 --batch_size=1



113 changes: 113 additions & 0 deletions examples/tob_examples/optim_demo.py
@@ -0,0 +1,113 @@
import numpy as np
import os
import math
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import mesh_tensorflow as mtf
from mesh_tensorflow.hvd_simd_mesh_impl import HvdSimdMeshImpl
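# This example presumably exercises the new LaidOutVariable support from this PR:
# mtf.layers.dense below creates trainable mesh variables whose slices live on each
# process, and the optimizer update ops on those variables are lowered to per-process TF ops.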

tf.flags.DEFINE_integer("gpus_per_node", 4, "Number of GPUs on each node")
tf.flags.DEFINE_integer("gpus_per_task", 4, "Number of GPUs in each task")
tf.flags.DEFINE_integer("tasks_per_node", 1, "Number of tasks on each node")
tf.flags.DEFINE_integer("nx", 2, "number of slices along x dim")
tf.flags.DEFINE_integer("ny", 2, "number of slices along y dim")
tf.flags.DEFINE_integer("nz", 1, "number of slices along z dim")
tf.flags.DEFINE_integer("nc", 8, "Size of data cube")
tf.flags.DEFINE_integer("batch_size", 1, "Batch Size")
FLAGS = tf.flags.FLAGS

def new_model_fn():

    graph = mtf.Graph()
    mesh = mtf.Mesh(graph, "my_mesh")

    # Define dimensions
    batch_dim = mtf.Dimension("batch", FLAGS.batch_size)

    nc_x_dim = mtf.Dimension('nc_x_dim', FLAGS.nc)
    nc_y_dim = mtf.Dimension('nc_y_dim', FLAGS.nc)
    nc_z_dim = mtf.Dimension('nc_z_dim', 1)

    hidden_dim_int = 4
    hidden_dim = mtf.Dimension('h', hidden_dim_int)

    # Define the input
    # mtf_input = mtf.import_tf_tensor(mesh, im_input, shape=mtf.Shape([batch_dim, nc_x_dim, nc_y_dim, nc_z_dim]))

    mtf_input = mtf.random_uniform(mesh, [batch_dim, nc_x_dim, nc_y_dim, nc_z_dim])

    # Define the network
    net_out = mtf.layers.dense(mtf_input, hidden_dim)
    net_out = mtf.reduce_sum(net_out, output_shape=[batch_dim, hidden_dim])

    # Define the loss
    one_tensor = mtf.import_tf_tensor(
        mesh,
        tf.ones([FLAGS.batch_size, hidden_dim_int]),
        shape=mtf.Shape([batch_dim, hidden_dim]))

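    # The loss is the sum of squared differences between the network output and an
    # all-ones target, giving the optimizer a simple quantity to minimize.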
    loss = mtf.reduce_sum(mtf.square(net_out - one_tensor))
    # loss = mtf.reduce_sum(mtf.square(net_out))

    # return net_out, loss
    return loss


def main(_):
    # NEW
    global_step = tf.train.get_global_step()

    # Define the mesh structure
    mesh_shape = [("row", FLAGS.nx), ("col", FLAGS.ny)]
    # layout_rules = [("nx_block","row"), ("ny_block","col")]
    layout_rules = [("nc_x_dim", "row"), ("nc_y_dim", "col")]

    # mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(mesh_shape, layout_rules, devices)
    mesh_impl = HvdSimdMeshImpl(mtf.convert_to_shape(mesh_shape),
                                mtf.convert_to_layout_rules(layout_rules))

    # Create the computational graph
    net = new_model_fn()
    # Retrieve the mtf graph and mesh
    graph = net.graph
    mesh = net.mesh
    # Retrieve output of computation
    # result = lowering.export_to_tf_tensor(net)

    var_grads = mtf.gradients(
        [net], [v.outputs[0] for v in graph.trainable_variables])
    optimizer = mtf.optimize.AdafactorOptimizer()
    update_ops = optimizer.apply_grads(var_grads, graph.trainable_variables)
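    # update_ops are mesh TensorFlow ops; they only become runnable TF ops once the
    # graph is lowered with the chosen mesh implementation below.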

    print('update_ops: ', update_ops)

    lowering = mtf.Lowering(graph, {mesh: mesh_impl})
    # Perform some last processing in normal TensorFlow
    result = lowering.export_to_tf_tensor(net)
    out = tf.reduce_mean(result)

    tf_update_ops = [lowering.lowered_operation(op) for op in update_ops]
    train_op = tf.group(tf_update_ops)
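    # train_op groups the lowered update ops so a single sess.run applies the
    # Adafactor update to every variable slice held on this process.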


    print("I'm about to enter the session...")
    print('before session: ', tf.global_variables())
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(10):
            r, _ = sess.run([out, train_op])
            print('iter', i, r)
            print(sess.run(tf.global_variables()))
        # r = sess.run(out)

        print("output of computation", r)
    exit(0)


if __name__ == "__main__":
    tf.app.run(main=main)


