Adding support for LaidOutVariables #9

Open · wants to merge 7 commits into base: max_hvd
26 changes: 26 additions & 0 deletions examples/job_test_horovod.job
@@ -0,0 +1,26 @@
#!/bin/bash
#SBATCH --job-name=test_horovod       # job name
##SBATCH --partition=gpu_p2           # uncomment for the gpu_p2 partition
#SBATCH --ntasks=4                    # total number of MPI tasks (= total number of GPUs)
#SBATCH --ntasks-per-node=4           # number of MPI tasks per node (= number of GPUs per node)
#SBATCH --gres=gpu:4                  # number of GPUs per node (max 8 with gpu_p2)
#SBATCH --cpus-per-task=10            # number of CPU cores per task (a quarter of the node here)
##SBATCH --cpus-per-task=3            # number of CPU cores per task (for gpu_p2: 1/8 of the node)
# /!\ Note: "multithread" refers to hyperthreading in Slurm terminology
#SBATCH --hint=nomultithread          # hyperthreading disabled
#SBATCH --time=00:10:00               # maximum requested run time (HH:MM:SS)
#SBATCH --output=test_horovod%j.out   # output file name
#SBATCH --error=test_horovod%j.err    # error file name (here shared with the output)
#SBATCH -A ftb@gpu                    # specify the project
#SBATCH --qos=qos_gpu-dev             # using the dev queue, as this is only for profiling

# purge modules loaded interactively and inherited by default
module purge

# load the required modules
module load tensorflow-gpu/py3/2.4.1+nccl-2.8.3-1

# echo the commands being run
set -x

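# run the code with GPU binding via bind_gpu.sh: 1 GPU per MPI task (same setup as the other job scripts in this PR)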
srun --unbuffered --mpi=pmi2 /gpfslocalsup/pub/idrtools/bind_gpu.sh python -u test_horovod.py
99 changes: 99 additions & 0 deletions examples/tob_examples/demo.py
@@ -0,0 +1,99 @@
import numpy as np
import os
import math
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import mesh_tensorflow as mtf
from mesh_tensorflow.hvd_simd_mesh_impl import HvdSimdMeshImpl
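# HvdSimdMeshImpl (from this fork) is assumed to implement the mesh in SPMD fashion:
# one process per GPU, with Horovod collectives handling communication across the mesh.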

tf.flags.DEFINE_integer("gpus_per_node", 4, "Number of GPUs on each node")
tf.flags.DEFINE_integer("gpus_per_task", 4, "Number of GPUs in each task")
tf.flags.DEFINE_integer("tasks_per_node", 1, "Number of tasks on each node")
tf.flags.DEFINE_integer("nx", 2, "number of slices along x dim")
tf.flags.DEFINE_integer("ny", 2, "number of slices along y dim")
tf.flags.DEFINE_integer("nz", 1, "number of slices along z dim")
tf.flags.DEFINE_integer("nc", 128, "Size of data cube")
tf.flags.DEFINE_integer("batch_size", 1, "Batch Size")
FLAGS = tf.flags.FLAGS

def model_fn(nc=64, batch_size=1):
    """
    Example of a function implementing a small 3D CNN and returning a mesh tensor.
    """
    # Create the mesh TF graph
    graph = mtf.Graph()
    mesh = mtf.Mesh(graph, "my_mesh")
    # Define the named dimensions
    n_block_x = FLAGS.nx  # 2
    n_block_y = FLAGS.ny  # 2
    n_block_z = FLAGS.nz  # 1
    batch_dim = mtf.Dimension("batch", batch_size)
    nx_dim = mtf.Dimension('nx_block', n_block_x)
    ny_dim = mtf.Dimension('ny_block', n_block_y)
    nz_dim = mtf.Dimension('nz_block', n_block_z)
    sx_dim = mtf.Dimension('sx_block', nc//n_block_x)
    sy_dim = mtf.Dimension('sy_block', nc//n_block_y)
    sz_dim = mtf.Dimension('sz_block', nc//n_block_z)
    image_c_dim = mtf.Dimension('image_c', 3)
    hidden_dim = mtf.Dimension('h', 128)
    # Create some input data
    data = mtf.random_uniform(mesh, [batch_dim, nx_dim, ny_dim, nz_dim,
                                     sx_dim, sy_dim, sz_dim,
                                     image_c_dim])
    net = mtf.layers.conv3d_with_blocks(data, hidden_dim,
        filter_size=(3, 3, 3), strides=(1, 1, 1), padding='SAME',
        d_blocks_dim=nx_dim, h_blocks_dim=ny_dim)
    net = mtf.reduce_sum(net, output_shape=[batch_dim, hidden_dim])
    return net


def main(_):
    """
    num_tasks = int(os.environ['SLURM_NTASKS'])
    print('num_tasks : ', num_tasks)
    # Resolve the cluster from the SLURM environment
    cluster = tf.distribute.cluster_resolver.SlurmClusterResolver({"mesh": num_tasks},
                                                                  port_base=8822,
                                                                  gpus_per_node=FLAGS.gpus_per_node,
                                                                  gpus_per_task=FLAGS.gpus_per_task,
                                                                  tasks_per_node=FLAGS.tasks_per_node)
    cluster_spec = cluster.cluster_spec()
    print(cluster_spec)
    # Create a server for all mesh members
    server = tf.distribute.Server(cluster_spec, "mesh", cluster.task_id)
    print(server)
    if cluster.task_id > 0:
        server.join()
    # Otherwise we are the main task, let's define the devices
    devices = ["/job:mesh/task:%d/device:GPU:%d" % (i, j)
               for i in range(cluster_spec.num_tasks("mesh"))
               for j in range(FLAGS.gpus_per_task)]
    print("List of devices", devices)
    """
    # Define the mesh structure
    mesh_shape = [("row", FLAGS.nx), ("col", FLAGS.ny)]
    layout_rules = [("nx_block", "row"), ("ny_block", "col")]
    # mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(mesh_shape, layout_rules, devices)
    mesh_impl = HvdSimdMeshImpl(mtf.convert_to_shape(mesh_shape),
                                mtf.convert_to_layout_rules(layout_rules))
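    # The mesh is a 2D grid of FLAGS.nx * FLAGS.ny processes; the layout rules shard the
    # nx_block and ny_block dimensions over the "row" and "col" mesh axes respectively.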
    # Create the computational graph
    net = model_fn(nc=FLAGS.nc, batch_size=FLAGS.batch_size)
    # Lower the mesh computation
    graph = net.graph
    mesh = net.mesh
    lowering = mtf.Lowering(graph, {mesh: mesh_impl})
    # Retrieve the output of the computation
    result = lowering.export_to_tf_tensor(net)
    # Perform some last processing in normal TensorFlow
    out = tf.reduce_mean(result)

    print("I'm about to enter the session...")
    with tf.Session() as sess:
        r = sess.run(out)
        print("output of computation", r)
    exit(0)


if __name__ == "__main__":
    tf.app.run(main=main)



35 changes: 35 additions & 0 deletions examples/tob_examples/job_demo.job
@@ -0,0 +1,35 @@
#!/bin/bash
#SBATCH --reservation=hackathon_idr27
#SBATCH --job-name=demo_variable      # job name
##SBATCH --partition=gpu_p2           # uncomment for the gpu_p2 partition
#SBATCH --ntasks=4                    # total number of MPI tasks (= total number of GPUs)
#SBATCH --ntasks-per-node=4           # number of MPI tasks per node (= number of GPUs per node)
#SBATCH --gres=gpu:4                  # number of GPUs per node (max 8 with gpu_p2)
#SBATCH --cpus-per-task=10            # number of CPU cores per task (a quarter of the node here)
##SBATCH --cpus-per-task=3            # number of CPU cores per task (for gpu_p2: 1/8 of the node)
# /!\ Note: "multithread" refers to hyperthreading in Slurm terminology
#SBATCH --hint=nomultithread          # hyperthreading disabled
#SBATCH --time=00:10:00               # maximum requested run time (HH:MM:SS)
#SBATCH --output=var_demo_test%j.out  # output file name
#SBATCH --error=var_demo_test%j.err   # error file name (here shared with the output)
#SBATCH -A ftb@gpu                    # specify the project
#SBATCH --qos=qos_gpu-dev             # using the dev queue, as this is only for profiling

# purge modules loaded interactively and inherited by default
module purge

# load the required modules
module load tensorflow-gpu/py3/2.4.1+nccl-2.8.3-1

# echo the commands being run
set -x

# JZ FIX
#export TMPDIR=$JOBSCRATCH
#ln -s $JOBSCRATCH /tmp/nvidia
# run the code with GPU binding via bind_gpu.sh: 1 GPU per MPI task.

srun --unbuffered --mpi=pmi2 /gpfslocalsup/pub/idrtools/bind_gpu.sh python -u demo.py --nc=128 --batch_size=1



30 changes: 30 additions & 0 deletions examples/tob_examples/job_optim_demo.job
@@ -0,0 +1,30 @@
#!/bin/bash
#SBATCH --job-name=demo_optim_var     # job name
##SBATCH --partition=gpu_p2           # uncomment for the gpu_p2 partition
#SBATCH --ntasks=4                    # total number of MPI tasks (= total number of GPUs)
#SBATCH --ntasks-per-node=4           # number of MPI tasks per node (= number of GPUs per node)
#SBATCH --gres=gpu:4                  # number of GPUs per node (max 8 with gpu_p2)
#SBATCH --cpus-per-task=10            # number of CPU cores per task (a quarter of the node here)
##SBATCH --cpus-per-task=3            # number of CPU cores per task (for gpu_p2: 1/8 of the node)
# /!\ Note: "multithread" refers to hyperthreading in Slurm terminology
#SBATCH --hint=nomultithread          # hyperthreading disabled
#SBATCH --time=00:10:00               # maximum requested run time (HH:MM:SS)
#SBATCH --output=var_optim_demo_test%j.out   # output file name
#SBATCH --error=var_optim_demo_test%j.err    # error file name (here shared with the output)
#SBATCH -A ftb@gpu                    # specify the project
#SBATCH --qos=qos_gpu-dev             # using the dev queue, as this is only for profiling

# purge modules loaded interactively and inherited by default
module purge

# load the required modules
module load tensorflow-gpu/py3/2.4.1+cuda-11.2

# echo the commands being run
set -x

# run the code with GPU binding via bind_gpu.sh: 1 GPU per MPI task.
srun --unbuffered --mpi=pmi2 -o log_%t.txt /gpfslocalsup/pub/idrtools/bind_gpu.sh python -u optim_demo.py --nc=8 --batch_size=1



35 changes: 35 additions & 0 deletions examples/tob_examples/job_simple_demo.job
@@ -0,0 +1,35 @@
#!/bin/bash
#SBATCH --reservation=hackathon_idr27
#SBATCH --job-name=demo_simple_var    # job name
##SBATCH --partition=gpu_p2           # uncomment for the gpu_p2 partition
#SBATCH --ntasks=4                    # total number of MPI tasks (= total number of GPUs)
#SBATCH --ntasks-per-node=4           # number of MPI tasks per node (= number of GPUs per node)
#SBATCH --gres=gpu:4                  # number of GPUs per node (max 8 with gpu_p2)
#SBATCH --cpus-per-task=10            # number of CPU cores per task (a quarter of the node here)
##SBATCH --cpus-per-task=3            # number of CPU cores per task (for gpu_p2: 1/8 of the node)
# /!\ Note: "multithread" refers to hyperthreading in Slurm terminology
#SBATCH --hint=nomultithread          # hyperthreading disabled
#SBATCH --time=00:10:00               # maximum requested run time (HH:MM:SS)
#SBATCH --output=var_simple_demo_test%j.out  # output file name
#SBATCH --error=var_simple_demo_test%j.err   # error file name (here shared with the output)
#SBATCH -A ftb@gpu                    # specify the project
#SBATCH --qos=qos_gpu-dev             # using the dev queue, as this is only for profiling

# purge modules loaded interactively and inherited by default
module purge

# load the required modules
module load tensorflow-gpu/py3/2.4.1+nccl-2.8.3-1

# echo the commands being run
set -x

# JZ FIX
#export TMPDIR=$JOBSCRATCH
#ln -s $JOBSCRATCH /tmp/nvidia
# run the code with GPU binding via bind_gpu.sh: 1 GPU per MPI task.

srun --unbuffered --mpi=pmi2 /gpfslocalsup/pub/idrtools/bind_gpu.sh python -u simple_demo.py --nc=8 --batch_size=1



113 changes: 113 additions & 0 deletions examples/tob_examples/optim_demo.py
@@ -0,0 +1,113 @@
import numpy as np
import os
import math
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import mesh_tensorflow as mtf
from mesh_tensorflow.hvd_simd_mesh_impl import HvdSimdMeshImpl
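# This example presumably exercises the new LaidOutVariable support from this PR:
# mtf.layers.dense below creates trainable mesh variables whose slices live on each
# process, and the optimizer update ops on those variables are lowered to per-process TF ops.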

tf.flags.DEFINE_integer("gpus_per_node", 4, "Number of GPUs on each node")
tf.flags.DEFINE_integer("gpus_per_task", 4, "Number of GPUs in each task")
tf.flags.DEFINE_integer("tasks_per_node", 1, "Number of tasks on each node")
tf.flags.DEFINE_integer("nx", 2, "number of slices along x dim")
tf.flags.DEFINE_integer("ny", 2, "number of slices along y dim")
tf.flags.DEFINE_integer("nz", 1, "number of slices along z dim")
tf.flags.DEFINE_integer("nc", 8, "Size of data cube")
tf.flags.DEFINE_integer("batch_size", 1, "Batch Size")
FLAGS = tf.flags.FLAGS

def new_model_fn():

    graph = mtf.Graph()
    mesh = mtf.Mesh(graph, "my_mesh")

    # Define dimensions
    batch_dim = mtf.Dimension("batch", FLAGS.batch_size)

    nc_x_dim = mtf.Dimension('nc_x_dim', FLAGS.nc)
    nc_y_dim = mtf.Dimension('nc_y_dim', FLAGS.nc)
    nc_z_dim = mtf.Dimension('nc_z_dim', 1)

    hidden_dim_int = 4
    hidden_dim = mtf.Dimension('h', hidden_dim_int)

    # Define the input
    # mtf_input = mtf.import_tf_tensor(mesh, im_input, shape=mtf.Shape([batch_dim, nc_x_dim, nc_y_dim, nc_z_dim]))

    mtf_input = mtf.random_uniform(mesh, [batch_dim, nc_x_dim, nc_y_dim, nc_z_dim])

    # Define the network
    net_out = mtf.layers.dense(mtf_input, hidden_dim)
    net_out = mtf.reduce_sum(net_out, output_shape=[batch_dim, hidden_dim])

    # Define the loss
    one_tensor = mtf.import_tf_tensor(
        mesh,
        tf.ones([FLAGS.batch_size, hidden_dim_int]),
        shape=mtf.Shape([batch_dim, hidden_dim]))

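    # The loss is the sum of squared differences between the network output and an
    # all-ones target, giving the optimizer a simple quantity to minimize.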
    loss = mtf.reduce_sum(mtf.square(net_out - one_tensor))
    # loss = mtf.reduce_sum(mtf.square(net_out))

    # return net_out, loss
    return loss


def main(_):
    # NEW
    global_step = tf.train.get_global_step()

    # Define the mesh structure
    mesh_shape = [("row", FLAGS.nx), ("col", FLAGS.ny)]
    # layout_rules = [("nx_block","row"), ("ny_block","col")]
    layout_rules = [("nc_x_dim", "row"), ("nc_y_dim", "col")]

    # mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(mesh_shape, layout_rules, devices)
    mesh_impl = HvdSimdMeshImpl(mtf.convert_to_shape(mesh_shape),
                                mtf.convert_to_layout_rules(layout_rules))

    # Create the computational graph
    net = new_model_fn()
    # Retrieve the mtf graph and mesh
    graph = net.graph
    mesh = net.mesh
    # Retrieve output of computation
    # result = lowering.export_to_tf_tensor(net)

    var_grads = mtf.gradients(
        [net], [v.outputs[0] for v in graph.trainable_variables])
    optimizer = mtf.optimize.AdafactorOptimizer()
    update_ops = optimizer.apply_grads(var_grads, graph.trainable_variables)
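    # update_ops are mesh TensorFlow ops; they only become runnable TF ops once the
    # graph is lowered with the chosen mesh implementation below.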

    print('update_ops: ', update_ops)

    lowering = mtf.Lowering(graph, {mesh: mesh_impl})
    # Perform some last processing in normal TensorFlow
    result = lowering.export_to_tf_tensor(net)
    out = tf.reduce_mean(result)

    tf_update_ops = [lowering.lowered_operation(op) for op in update_ops]
    train_op = tf.group(tf_update_ops)
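    # train_op groups the lowered update ops so a single sess.run applies the
    # Adafactor update to every variable slice held on this process.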


    print("I'm about to enter the session...")
    print('before session: ', tf.global_variables())
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(10):
            r, _ = sess.run([out, train_op])
            print('iter', i, r)
            print(sess.run(tf.global_variables()))
        # r = sess.run(out)

        print("output of computation", r)
    exit(0)


if __name__ == "__main__":
    tf.app.run(main=main)


