TODO
- when all apps working, build with oras and containers pre-pulled
- kripke missing params for most
- Is redundantly saving results OK? (They are saved to the same results directory but pushed to different artifacts.)
We need to pull all containers to all nodes, as there is no shared filesystem. We will pull to /home/ubuntu
since it exists.
# Pull each benchmark container with flux exec on all ranks (-r all), writing into /home/ubuntu.
cd /home/ubuntu
registry=docker://ghcr.io/converged-computing
for image in \
  metric-amg2023:spack-slim-cpu-int64-zen3 \
  metric-laghos:libfabric-cpu-zen4 \
  metric-kripke-cpu:libfabric-zen4 \
  metric-single-node:cpu-zen4 \
  metric-minife:libfabric-cpu-zen4 \
  metric-mixbench:libfabric-cpu-zen4 \
  mt-gemm:libfabric-cpu-zen4 \
  metric-osu-cpu:libfabric-zen4 \
  metric-quicksilver-cpu:libfabric-zen4 \
  metric-stream:libfabric-cpu-zen4
do
  flux exec --dir /home/ubuntu -r all singularity pull "${registry}/${image}"
done
# Disabled pull, kept for reference:
#flux exec --dir /home/ubuntu -r all singularity pull docker://ghcr.io/converged-computing/metric-lammps-cpu:zen4
If we need to install oras:
#!/bin/bash
# install_oras.sh: download the oras release tarball for VERSION and install
# the oras binary into /usr/local/bin.
# Takes no arguments. Needs curl, tar, mktemp and sudo.

# Stop at the first failure so a bad download is never unpacked or installed.
set -euo pipefail

export VERSION="1.1.0"
tarball="oras_${VERSION}_linux_amd64.tar.gz"

# Work in a private scratch directory instead of fixed names under /tmp,
# and remove it on every exit path.
workdir=$(mktemp -d)
trap 'rm -rf -- "$workdir"' EXIT
cd "$workdir"

# -f makes curl fail on an HTTP error instead of saving the error page as the tarball.
curl -fLO "https://github.com/oras-project/oras/releases/download/v${VERSION}/${tarball}"
mkdir -p oras-install/
tar -zxf "$tarball" -C oras-install/
sudo mv oras-install/oras /usr/local/bin/
# Make the installer executable, then copy it to the other ranks through a flux archive.
chmod +x install_oras.sh
flux archive create --name oras -C /home/ubuntu install_oras.sh
# -x 0 leaves rank 0 out of the extract (presumably because it already has the file).
flux exec -x 0 flux archive extract --name oras -C /home/ubuntu
# nodes is not assigned anywhere before this point, so refuse to run with it
# unset rather than letting -N take /bin/bash as its value.
flux run -N "${nodes:?set nodes to the number of nodes in the cluster}" /bin/bash /home/ubuntu/install_oras.sh
Get the topology:
# Query one region for both files so the instance list and the topology
# describe the same machines.
# NOTE(review): us-east-2 is assumed to be the cluster's region — confirm.
region=us-east-2
aws ec2 describe-instance-topology --region "$region" --filters Name=instance-type,Values=hpc6a.48xlarge > topology-2.json
aws ec2 describe-instances --region "$region" --filters "Name=instance-type,Values=hpc6a.48xlarge" > instances-2.json
Sanity check efa is there.
# fi_info | less
provider: efa
fabric: efa
domain: rdmap0s31-rdm
version: 121.0
type: FI_EP_RDM
protocol: FI_PROTO_EFA
provider: efa
fabric: efa
domain: rdmap0s31-dgrm
version: 121.0
type: FI_EP_DGRAM
protocol: FI_PROTO_EFA
Write this script to a file (save.sh) so that we can run it incrementally: it only saves output for jobs whose output file does not exist yet.
#!/bin/bash
# save.sh: write the output, jobspec and eventlog of every flux job that has a
# user.study_id attribute into <output>/<study_id>-<jobid>.out.
# Usage: ./save.sh OUTPUT_DIR
# Jobs whose output file already exists are skipped, so the script can be re-run.
# Deliberately no "set -e": flux job attach may return non-zero
# (NOTE(review): presumably it mirrors the job's exit status — confirm) and the
# remaining jobs must still be saved.

output=${1:?usage: save.sh OUTPUT_DIR}
mkdir -p -- "$output"

# When they are done:
# The jq filters are single-quoted so the shell never treats [] as a glob.
for jobid in $(flux jobs -a --json | jq -r '.jobs[].id')
do
    # Get the job study id
    study_id=$(flux job info "$jobid" jobspec | jq -r '.attributes.user.study_id')
    # Jobs without a study id (jq prints "null", or nothing if the lookup failed) are skipped.
    if [[ -z "$study_id" || "$study_id" == "null" ]]; then
        continue
    fi
    outfile="$output/${study_id}-${jobid}.out"
    # Already saved on an earlier run.
    if [[ -f "$outfile" ]]; then
        continue
    fi
    echo "Parsing jobid ${jobid} and study id ${study_id}"
    # Job stdout and stderr first, then the jobspec and eventlog appended below it.
    flux job attach "$jobid" &> "$outfile"
    {
        echo "START OF JOBSPEC"
        flux job info "$jobid" jobspec
        echo "START OF EVENTLOG"
        flux job info "$jobid" guest.exec.eventlog
    } >> "$outfile"
done
# Make the saver script executable.
chmod +x ./save.sh
# This output directory is used across experiments
output=./results
export output
mkdir -p ./results
mkdir -p "$output"
We are going to run this via flux, running the job across nodes (and then when they are complete, getting the logs from flux). Here is a modified entrypoint:
oras login ghcr.io --username vsoch
app=single-node
nodes=3
mkdir -p "$output"
# Submit one single-node job per index 1..nodes; each carries a study id so save.sh records it.
for ((node = 1; node <= nodes; node++)); do
  # Absolute container path, like the other experiments, so the job does not
  # depend on the directory it was submitted from.
  flux submit -N1 --setattr=user.study_id="$app-node-$node" singularity exec /home/ubuntu/metric-single-node_cpu-zen4.sif /bin/bash /entrypoint.sh
done
./save.sh "$output"
oras push "ghcr.io/converged-computing/metrics-operator-experiments/performance:ec2-cpu-$app" "$output"
Clean up test files
# The command is quoted so the test_file* glob is expanded by the bash started
# through flux exec on each rank, not by the local shell.
flux exec -r all /bin/bash -c "rm -rf /home/ubuntu/test_file*"
This one requires sourcing spack, so we need to write a little wrapper for it.
#!/bin/bash
# run_amg.sh
# Sources the spack environment script, then runs the command given as
# arguments (for example: amg -n 32 32 32 -P 2 2 2 -problem 2).
. /etc/profile.d/z10_spack_environment.sh
# Quoted "$@" passes every argument through unchanged (no re-splitting, no globbing).
"$@"
# Make the wrapper executable and copy it to the other ranks through a flux archive.
chmod +x run_amg.sh
flux archive create --name amg -C /home/ubuntu run_amg.sh
# -x 0 leaves rank 0 out of the extract.
flux exec -x 0 flux archive extract --name amg -C /home/ubuntu
Test size run:
# The pull step in this file fetches tag spack-slim-cpu-int64-zen3, which
# singularity saves as <name>_<tag>.sif, so that is the file to run.
amg_sif=/home/ubuntu/metric-amg2023_spack-slim-cpu-int64-zen3.sif

# 3 seconds
time flux run --env OMP_NUM_THREADS=3 -N 2 -n 8 -opmi=pmix -o cpu-affinity=per-task singularity exec "$amg_sif" /bin/bash /home/ubuntu/run_amg.sh amg -n 32 32 32 -P 2 2 2 -problem 2

oras login ghcr.io --username vsoch
app=amg2023
# In every run the product of the three -P values equals the task count (-n).
for i in $(seq 1 15); do
  echo "Running iteration $i"
  time flux run --env OMP_NUM_THREADS=3 --setattr=user.study_id=$app-32-iter-$i -N 32 -n 1024 -opmi=pmix -o cpu-affinity=per-task singularity exec "$amg_sif" /bin/bash /home/ubuntu/run_amg.sh amg -n 256 256 128 -P 16 8 8 -problem 2
  time flux run --env OMP_NUM_THREADS=3 --setattr=user.study_id=$app-64-iter-$i -N 64 -n 2048 -opmi=pmix -o cpu-affinity=per-task singularity exec "$amg_sif" /bin/bash /home/ubuntu/run_amg.sh amg -n 256 256 128 -P 16 16 8 -problem 2
  time flux run --env OMP_NUM_THREADS=3 --setattr=user.study_id=$app-128-iter-$i -N 128 -n 4096 -opmi=pmix -o cpu-affinity=per-task singularity exec "$amg_sif" /bin/bash /home/ubuntu/run_amg.sh amg -n 256 256 128 -P 16 16 16 -problem 2
  time flux run --env OMP_NUM_THREADS=3 --setattr=user.study_id=$app-256-iter-$i -N 256 -n 8192 -opmi=pmix -o cpu-affinity=per-task singularity exec "$amg_sif" /bin/bash /home/ubuntu/run_amg.sh amg -n 256 256 128 -P 32 16 16 -problem 2
done

# When they are done:
./save.sh "$output"
oras push "ghcr.io/converged-computing/metrics-operator-experiments/performance:ec2-cpu-$app" "$output"
IMPORTANT: not done yet, we skipped it, so I skipped testing, but added the container name
# Default kripke problem on one node with 96 tasks.
time flux run --env OMP_NUM_THREADS=1 -N 1 -n 96 singularity exec /home/ubuntu/metric-kripke-cpu_libfabric-zen4.sif kripke
# NOTE(review): --procs 12,16,16 multiplies to 3072 but only 64 tasks are
# requested (-n 64); presumably the two must match — confirm.
time flux run --env OMP_NUM_THREADS=3 -N 2 -n 64 singularity exec /home/ubuntu/metric-kripke-cpu_libfabric-zen4.sif kripke --layout DGZ --dset 16 --zones 144,448,256 --gset 16 --groups 16 --niter 500 --legendre 2 --quad 16 --procs 12,16,16
oras login ghcr.io --username vsoch
app=kripke
for i in $(seq 1 5); do
echo "Running iteration $i"
# 32 nodes: --procs 12,16,16 multiplies to 3072, the same as -n 3072.
time flux run --env OMP_NUM_THREADS=1 --setattr=user.study_id=$app-32-iter-$i -N 32 -n 3072 singularity exec /home/ubuntu/metric-kripke-cpu_libfabric-zen4.sif kripke --layout DGZ --dset 16 --zones 144,448,256 --gset 16 --groups 16 --niter 500 --legendre 2 --quad 16 --procs 12,16,16
# NOT DONE YET
# NOTE(review): the three runs below pass --arch CUDA to a container named
# ...-cpu, and their --procs products (64, 128, 256) differ from the -n task
# counts (2048, 4096, 8192); the parameters still need to be worked out.
time flux run --env OMP_NUM_THREADS=1 --setattr=user.study_id=$app-64-iter-$i -N64 -n 2048 singularity exec /home/ubuntu/metric-kripke-cpu_libfabric-zen4.sif kripke --arch CUDA --layout GDZ --dset 8 --zones 128,128,128 --gset 16 --groups 64 --niter 50 --legendre 8 --quad 8 --procs 4,4,4
time flux run --env OMP_NUM_THREADS=1 --setattr=user.study_id=$app-128-iter-$i -N128 -n 4096 singularity exec /home/ubuntu/metric-kripke-cpu_libfabric-zen4.sif kripke --arch CUDA --layout GDZ --dset 8 --zones 128,128,128 --gset 16 --groups 64 --niter 50 --legendre 8 --quad 8 --procs 4,8,4
time flux run --env OMP_NUM_THREADS=1 --setattr=user.study_id=$app-256-iter-$i -N256 -n 8192 singularity exec /home/ubuntu/metric-kripke-cpu_libfabric-zen4.sif kripke --arch CUDA --layout GDZ --dset 8 --zones 128,128,128 --gset 16 --groups 64 --niter 50 --legendre 8 --quad 8 --procs 8,4,8
done
# Save the job logs and push the results directory as this app's artifact.
./save.sh $output
oras push ghcr.io/converged-computing/metrics-operator-experiments/performance:ec2-cpu-$app $output
Testing:
# 1m 41 seconds
time flux run -o cpu-affinity=per-task -N3 -n 288 singularity exec /home/ubuntu/metric-laghos_libfabric-cpu-zen4.sif /opt/laghos/laghos -pa -p 1 -tf 0.6 -pt 311 -m /opt/laghos/data/cube_311_hex.mesh --ode-solver 7 --max-steps 10 --cg-tol 0 -cgm 50 -ok 3 -ot 2 -rs 4 -rp 2 --fom
# Two-node variant. -n needs a value: 2 nodes x 96 tasks per node (as in the
# 3-node run above, 288 / 3) = 192 tasks.
time flux run -o cpu-affinity=per-task -N2 -n 192 singularity exec /home/ubuntu/metric-laghos_libfabric-cpu-zen4.sif /opt/laghos/laghos -pa -p 1 -tf 0.6 -pt 311 -m /opt/laghos/data/cube_311_hex.mesh --ode-solver 7 --max-steps 10 --cg-tol 0 -cgm 50 -ok 3 -ot 2 -rs 4 -rp 2 --fom

oras login ghcr.io --username vsoch
app=laghos
for i in $(seq 1 5); do
  echo "Running iteration $i"
  # Same problem at 32, 64, 128 and 256 nodes, always 96 tasks per node.
  # Nothing may sit between the -n value and "singularity": flux run treats the
  # first non-option word as the command to launch.
  for scale in 32 64 128 256; do
    time flux run -o cpu-affinity=per-task --setattr=user.study_id=$app-$scale-iter-$i -N$scale -n $((scale * 96)) singularity exec /home/ubuntu/metric-laghos_libfabric-cpu-zen4.sif /opt/laghos/laghos -pa -p 1 -tf 0.6 -pt 311 -m /opt/laghos/data/cube_311_hex.mesh --ode-solver 7 --max-steps 400 --cg-tol 0 -cgm 50 -ok 3 -ot 2 -rs 4 -rp 2 --fom
  done
done

./save.sh "$output"
oras push "ghcr.io/converged-computing/metrics-operator-experiments/performance:ec2-cpu-$app" "$output"
# Pull the ReaxFF LAMMPS container into /home/ubuntu on all ranks, the same way
# the other containers are pulled. Naming the directory explicitly replaces a
# "cd /opt/containers; cd -" pair, which ends up in an unrelated directory
# whenever the first cd fails.
flux exec --dir /home/ubuntu -r all singularity pull docker://ghcr.io/converged-computing/metric-lammps-cpu:libfabric-zen4-reax
# 1 seconds wall time, 8 seconds real (hookup)
# NOTE(review): this test runs metric-lammps-cpu_zen4.sif, whose pull is
# commented out earlier in this file — confirm the image is present.
time flux run -o cpu-affinity=per-task -N3 -n 288 singularity exec --pwd /code /home/ubuntu/metric-lammps-cpu_zen4.sif lmp -k on -sf kk -pk kokkos newton on neigh half -in in.snap.test -var snapdir 2J8_W.SNAP -v x 128 -v y 128 -v z 128 -var nsteps 2000
oras login ghcr.io --username vsoch
app=lammps-reax
# The pull in this file fetches tag libfabric-zen4-reax, which singularity
# saves as <name>_<tag>.sif.
reax_sif=/home/ubuntu/metric-lammps-cpu_libfabric-zen4-reax.sif
for i in $(seq 1 5); do
  echo "Running iteration $i"
  # 32, 64, 128 and 256 nodes at 96 tasks per node; the 256-node run is
  # -N256 -n 24576, matching its study id.
  for scale in 32 64 128 256; do
    time flux run --setattr=user.study_id=$app-$scale-iter-$i -o cpu-affinity=per-task -N$scale -n $((scale * 96)) singularity exec --pwd /code "$reax_sif" /usr/bin/lmp -v x 64 -v y 64 -v z 32 -in in.reaxff.hns -nocite
  done
done

# When they are done:
./save.sh "$output"
oras push "ghcr.io/converged-computing/metrics-operator-experiments/performance:ec2-cpu-$app" "$output"

# NOTE(review): this run uses a container under /opt/containers and 112 tasks
# on 2 nodes, unlike the runs above — confirm it belongs to this cluster.
time flux run -o cpu-affinity=per-task -N2 -n 112 --env OMPI_MCA_btl_vader_single_copy_mechanism=none singularity exec --pwd /code /opt/containers/metric-lammps-cpu_rocky-8-reax.sif /usr/bin/lmp -v x 64 -v y 64 -v z 32 -in in.reaxff.hns -nocite
Do not use this one, does not scale well
# 1 seconds wall time, 8 seconds real (hookup)
time flux run -o cpu-affinity=per-task -N3 -n 288 singularity exec --pwd /code /home/ubuntu/metric-lammps-cpu_zen4.sif lmp -k on -sf kk -pk kokkos newton on neigh half -in in.snap.test -var snapdir 2J8_W.SNAP -v x 128 -v y 128 -v z 128 -var nsteps 2000

oras login ghcr.io --username vsoch
app=lammps

# NOTE: the below takes 4 minutes. If taking too long, drop back to 3 iterations
# IMPORTANT: Ani is testing if 128 works on lassen and 1500 vs 1000 steps
# TODO THIS NEEDS TO BE THE UPDATED ONE
# time flux run --setattr=user.study_id=$app-32-iter-$i-20k -o cpu-affinity=per-task -N32 -n 3072 lmp -k on -sf kk -pk kokkos newton on neigh half -in in.snap.test -var snapdir 2J8_W.SNAP -v x 512 -v y 512 -v z 512 -var nsteps 20000
for i in $(seq 1 5); do
  echo "Running iteration $i"
  # 32, 64, 128 and 256 nodes at 96 tasks per node; the 256-node run is
  # -N256 -n 24576, matching its study id.
  for scale in 32 64 128 256; do
    time flux run --setattr=user.study_id=$app-$scale-iter-$i -o cpu-affinity=per-task -N$scale -n $((scale * 96)) singularity exec --pwd /code /home/ubuntu/metric-lammps-cpu_zen4.sif lmp -k on -sf kk -pk kokkos newton on neigh half -in in.snap.test -var snapdir 2J8_W.SNAP -v x 128 -v y 128 -v z 128 -var nsteps 1000
  done
done

# When they are done:
./save.sh "$output"
oras push "ghcr.io/converged-computing/metrics-operator-experiments/performance:ec2-cpu-$app" "$output"
minife_sif=/home/ubuntu/metric-minife_libfabric-cpu-zen4.sif

# 8 seconds
time flux run -N3 -n 288 -o cpu-affinity=per-task singularity exec $minife_sif miniFE.x nx=230 ny=230 nz=230 use_locking=1 elem_group_size=10 use_elem_mat_fields=300 verify_solution=0
# Larger 640^3 problem on the same 3 nodes (no timing recorded).
time flux run -N3 -n 288 -o cpu-affinity=per-task singularity exec $minife_sif miniFE.x nx=640 ny=640 nz=640 use_locking=1 elem_group_size=10 use_elem_mat_fields=300 verify_solution=0

oras login ghcr.io --username vsoch
app=minife
for ((i = 1; i <= 5; i++)); do
  echo "Running iteration $i"
  # The 230^3 problem at 32, 64, 128 and 256 nodes, 96 tasks per node.
  for scale in 32 64 128 256; do
    time flux run --setattr=user.study_id=$app-$scale-iter-$i -N$scale -n $((scale * 96)) -o cpu-affinity=per-task singularity exec $minife_sif miniFE.x nx=230 ny=230 nz=230 use_locking=1 elem_group_size=10 use_elem_mat_fields=300 verify_solution=0
  done
done

# When they are done:
./save.sh $output
oras push ghcr.io/converged-computing/metrics-operator-experiments/performance:ec2-cpu-$app $output
# Start a bash shell through flux proxy against the given local:// socket URI.
# NOTE(review): confirm this socket path applies to this cluster.
flux proxy local:///mnt/flux/view/run/flux/local bash
Testing:
# The pull step in this file fetches tag libfabric-cpu-zen4, which singularity
# saves as <name>_<tag>.sif.
mixbench_sif=/home/ubuntu/metric-mixbench_libfabric-cpu-zen4.sif

# Test run inside the container, like the measured runs below.
time flux run -l -N2 singularity exec "$mixbench_sif" mixbench-cpu 64

oras login ghcr.io --username vsoch
app=mixbench
# Node count; it is part of the study id and must be set before the loop.
size=2
for i in $(seq 1 5); do
  echo "Running iteration $i"
  time flux run --setattr=user.study_id=$app-$size-iter-$i -l -N$size singularity exec "$mixbench_sif" mixbench-cpu 64
done

# When they are done:
./save.sh "$output"
oras push "ghcr.io/converged-computing/metrics-operator-experiments/performance:ec2-cpu-$app" "$output"
Ani is testing the MPI variant and then we will update here.
Testing:
mtgemm_sif=/home/ubuntu/mt-gemm_libfabric-cpu-zen4.sif
mtgemm_bin=/opt/dense_linear_algebra/gemm/mpi/build/1_dense_gemm_mpi

# runs but output is gibberish (7 seconds)
time flux run -N3 -n 288 -o cpu-affinity=per-task singularity exec "$mtgemm_sif" "$mtgemm_bin"

oras login ghcr.io --username vsoch
# NOTE(review): "mt-gem" also names the pushed artifact tag; confirm the
# spelling is intended before changing it.
app=mt-gem
# Every run carries a study id: save.sh skips jobs that have none, so without
# it none of these results would be saved. "affinity" marks the runs that use
# -o cpu-affinity=per-task.
for i in $(seq 1 2); do
  echo "Running iteration $i"
  # 9 seconds
  time flux run --setattr=user.study_id=$app-4-affinity-iter-$i -N4 -n 384 -o cpu-affinity=per-task singularity exec "$mtgemm_sif" "$mtgemm_bin"
  # 9.66 seconds
  time flux run --setattr=user.study_id=$app-4-iter-$i -N4 -n 384 singularity exec "$mtgemm_sif" "$mtgemm_bin"
  # 8.34 seconds
  time flux run --setattr=user.study_id=$app-2-affinity-iter-$i -N2 -n 192 -o cpu-affinity=per-task singularity exec "$mtgemm_sif" "$mtgemm_bin"
  # 8.66 seconds
  time flux run --setattr=user.study_id=$app-2-iter-$i -N2 -n 192 singularity exec "$mtgemm_sif" "$mtgemm_bin"
done

./save.sh "$output"
oras push "ghcr.io/converged-computing/metrics-operator-experiments/performance:ec2-cpu-$app" "$output"
Write this script to the filesystem as flux_run_combinations.sh
#!/bin/bash
# flux_run_combinations.sh: run the OSU point-to-point latency and bandwidth
# benchmarks between every pair of hosts drawn from a random sample.
# Usage: ./flux_run_combinations.sh NODES APP
#   NODES - node count passed to "flux run -N NODES hostname" to list hosts
#   APP   - prefix of each job's user.study_id attribute
# No "set -e": one failing pair must not stop the rest of the sweep.

nodes=${1:?usage: flux_run_combinations.sh NODES APP}
app=${2:?usage: flux_run_combinations.sh NODES APP}

# At most 28 combinations, 8 nodes 2 at a time: C(8,2) = 28, so the sample is
# 8 hosts (sampling 28 hosts would give 378 pairs).
readonly max_hosts=8
readonly container=/home/ubuntu/metric-osu-cpu_libfabric-zen4.sif
readonly bench_dir=/opt/osu-benchmark/build.openmpi/mpi/pt2pt

# One hostname per array element.
mapfile -t hosts < <(flux run -N "$nodes" hostname | shuf -n "$max_hosts")

# Visit each unordered pair once: the inner index always starts after the outer one.
for ((a = 0; a < ${#hosts[@]}; a++)); do
  for ((b = a + 1; b < ${#hosts[@]}; b++)); do
    i=${hosts[a]}
    j=${hosts[b]}
    echo "${i} ${j}"
    # The host pair itself labels the run in the study id.
    iter="${i}-${j}"
    # Both benchmarks run on exactly this pair of hosts (--requires).
    for benchmark in osu_latency osu_bw; do
      time flux run -N 2 -n 2 \
        --setattr=user.study_id="$app-2-iter-$iter" \
        --requires="hosts:${i},${j}" \
        -o cpu-affinity=per-task \
        singularity exec "$container" "$bench_dir/$benchmark"
    done
  done
done
Testing:
# The script has to be executable, and app has to name this experiment before
# it is passed along (it is otherwise still set by whichever section ran last).
chmod +x flux_run_combinations.sh
app=osu
./flux_run_combinations.sh 3 "$app"
# 10 seconds
time flux run -N3 -n 288 -o cpu-affinity=per-task singularity exec /home/ubuntu/metric-osu-cpu_libfabric-zen4.sif /opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce
And then run as follows.
oras login ghcr.io --username vsoch
app=osu
# Pairwise point-to-point runs first, then the allreduce scaling runs.
./flux_run_combinations.sh 32 $app
for ((i = 1; i <= 5; i++)); do
  echo "Running iteration $i"
  # osu_allreduce at 32, 64, 128 and 256 nodes, 96 tasks per node.
  for scale in 32 64 128 256; do
    time flux run --setattr=user.study_id=$app-$scale-iter-$i -N$scale -n $((scale * 96)) -o cpu-affinity=per-task singularity exec /home/ubuntu/metric-osu-cpu_libfabric-zen4.sif /opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce
  done
done

# When they are done:
./save.sh $output
oras push ghcr.io/converged-computing/metrics-operator-experiments/performance:ec2-cpu-$app $output
For testing I used the smaller problem size for AKS from Abhik:
# Small test problem: 2 nodes, 64 tasks, 3 cores and 3 OpenMP threads per task.
time flux run --cores-per-task 3 --env OMP_NUM_THREADS=3 -N2 -n 64 singularity exec /home/ubuntu/metric-quicksilver-cpu_libfabric-zen4.sif qs --inputFile /opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp -X 64 -Y 32 -Z 32 -x 64 -y 32 -z 32 -I 4 -J 4 -K 4 -n 10485760
# 32-node problem with the same arguments as the 32-node run in the loop that
# follows; note there is no --cores-per-task here.
time flux run --env OMP_NUM_THREADS=3 -N32 -n 1024 singularity exec /home/ubuntu/metric-quicksilver-cpu_libfabric-zen4.sif qs --inputFile /opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp -X 128 -Y 128 -Z 64 -x 128 -y 128 -z 64 -I 16 -J 8 -K 8 -n 335544320
That seemed to start working (the matrix started getting printed), but I didn't want to wait for it to finish.
oras login ghcr.io --username vsoch
app=quicksilver
qs_sif=/home/ubuntu/metric-quicksilver-cpu_libfabric-zen4.sif
qs_input=/opt/quicksilver/Examples/CORAL2_Benchmark/Problem1/Coral2_P1.inp
# The mesh and the particle count (the trailing -n) double with each step in
# node count, keeping 320 particles per mesh cell:
#   335544320 -> 671088640 -> 1342177280 -> 2684354560
# The product of -I, -J and -K equals the flux task count in every run.
for i in $(seq 1 5); do
  echo "Running iteration $i"
  time flux run --env OMP_NUM_THREADS=3 --setattr=user.study_id=$app-32-iter-$i -N32 -n 1024 singularity exec "$qs_sif" qs --inputFile "$qs_input" -X 128 -Y 128 -Z 64 -x 128 -y 128 -z 64 -I 16 -J 8 -K 8 -n 335544320
  time flux run --env OMP_NUM_THREADS=3 --setattr=user.study_id=$app-64-iter-$i -N64 -n 2048 singularity exec "$qs_sif" qs --inputFile "$qs_input" -X 128 -Y 128 -Z 128 -x 128 -y 128 -z 128 -I 16 -J 16 -K 8 -n 671088640
  time flux run --env OMP_NUM_THREADS=3 --setattr=user.study_id=$app-128-iter-$i -N128 -n 4096 singularity exec "$qs_sif" qs --inputFile "$qs_input" -X 256 -Y 128 -Z 128 -x 256 -y 128 -z 128 -I 16 -J 16 -K 16 -n 1342177280
  time flux run --env OMP_NUM_THREADS=3 --setattr=user.study_id=$app-256-iter-$i -N256 -n 8192 singularity exec "$qs_sif" qs --inputFile "$qs_input" -X 256 -Y 256 -Z 128 -x 256 -y 256 -z 128 -I 32 -J 16 -K 16 -n 2684354560
done

# When they are done:
./save.sh "$output"
oras push "ghcr.io/converged-computing/metrics-operator-experiments/performance:ec2-cpu-$app" "$output"
Testing:
stream_sif=/home/ubuntu/metric-stream_libfabric-cpu-zen4.sif

# 4 seconds
time flux run -N1 -n 96 -o cpu-affinity=per-task singularity exec $stream_sif stream_c.exe

oras login ghcr.io --username vsoch
app=stream
mkdir -p $output
# Five repetitions of the single-node run, each tagged with its iteration number.
for ((i = 1; i <= 5; i++)); do
  echo "Running iteration $i"
  time flux run --setattr=user.study_id=$app-1-iter-$i -N1 -n 96 -o cpu-affinity=per-task singularity exec $stream_sif stream_c.exe
done

# When they are done:
./save.sh $output
oras push ghcr.io/converged-computing/metrics-operator-experiments/performance:ec2-cpu-$app $output
When you are done, exit and:
# Run the "destroy" make target (presumably tears the cluster down; it asks for
# a typed confirmation).
make destroy
And don't forget to type "yes" !