Skip to content

Commit a55af9c

Browse files
authored
Merge pull request #71 from ROCm/execOnError
gpurun fallback to exec on ERROR
2 parents c4f01e3 + 1d0c781 commit a55af9c

File tree

1 file changed

+17
-13
lines changed

1 file changed

+17
-13
lines changed

utils/bin/gpurun

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,12 @@
3232
# else.
3333
GPURUN_BYPASS=${GPURUN_BYPASS:-0}
3434

35-
if [ "$GPURUN_BYPASS" = "1" ]; then
35+
function execOnError() {
3636
exec "$@"
37+
}
38+
39+
if [ "$GPURUN_BYPASS" = "1" ]; then
40+
execOnError "$@"
3741
fi
3842

3943
# PROGVERSION string is updated by cmake when component is installed
@@ -220,15 +224,15 @@ fi
220224
if [ ! -d $AOMP ] ; then
221225
>&2 echo "ERROR: AOMP not found at $AOMP"
222226
>&2 echo " Please install AOMP or correctly set env-var AOMP"
223-
exit 1
227+
execOnError "$@"
224228
fi
225229
ROCMINFO_BINARY=${ROCMINFO_BINARY:-$AOMP/bin/rocminfo}
226230
[ ! -f $ROCMINFO_BINARY ] && ROCMINFO_BINARY=$AOMP/../bin/rocminfo
227231
[ ! -f $ROCMINFO_BINARY ] && ROCMINFO_BINARY=$AOMP/../../bin/rocminfo
228232
if [ ! -f $ROCMINFO_BINARY ] ; then
229233
>&2 echo "ERROR: Could not find binary for rocminfo,"
230234
>&2 echo " Please correct installation of ROCM or AOMP compiler"
231-
exit 1
235+
execOnError "$@"
232236
fi
233237

234238
# Use rocminfo to find the number of CUs and gfxids for each GPU.
@@ -238,7 +242,7 @@ _tfile_lines=`wc -l $_tfile | cut -d" " -f1`
238242
if [ $_tfile_lines == 0 ] ; then
239243
>&2 echo "ERROR: $ROCMINFO_BINARY failed to find GPU devices"
240244
rm $_tfile
241-
exit 1
245+
execOnError "$@"
242246
fi
243247
# Create 3 _ri_ arrays by parsing rocminfo (ri), one array entry per device
244248
_ri_all_gfxids=""
@@ -312,9 +316,9 @@ if [ $_ri_num_devices == 0 ] ; then
312316
>&2 echo " ROCR_VISIBLE_DEVICES was preset to $ROCR_VISIBLE_DEVICES"
313317
>&2 echo " Consider unset ROCR_VISIBLE_DEVICES and let gpurun set it correctly."
314318
fi
315-
exit 1
319+
execOnError "$@"
316320
else
317-
exit
321+
execOnError "$@"
318322
fi
319323
fi
320324

@@ -399,21 +403,21 @@ if [[ $_ss_num_devices -lt 1 ]] ; then
399403
else
400404
>&2 echo "ERROR: No amdgpu devices found in $_sysdevdir."
401405
fi
402-
exit 1
406+
execOnError "$@"
403407
fi
404408

405409
# check for taskset or numactl cmd
406410
if [ "$_use_numactl_membind" == "1" ] || [ "$_use_numactl_localalloc" == "1" ] ; then
407411
_launch_process_cmd_binary=`which numactl`
408412
if [ $? != 0 ] ; then
409413
>&2 echo "ERROR: The -m (membind) or -l (localalloc) require numactl to be installed."
410-
exit 1
414+
execOnError "$@"
411415
fi
412416
else
413417
_launch_process_cmd_binary=`which taskset`
414418
if [ $? != 0 ] ; then
415419
>&2 echo "ERROR: $0 requires the taskset command to be installed."
416-
exit 1
420+
execOnError "$@"
417421
fi
418422
fi
419423
if [ "$_use_numactl_membind" == "1" ] && [ "$_use_numactl_localalloc" == "1" ] ; then
@@ -448,19 +452,19 @@ fi
448452
_node_cus=$(( $_ri_num_devices * ${_ss_cucount[$_device_num]} ))
449453
if [ $_num_local_ranks -gt $_node_cus ] ; then
450454
>&2 echo "ERROR: Not enough CUs ($_node_cus) for $_num_local_ranks ranks "
451-
exit 1
455+
execOnError "$@"
452456
fi
453457

454458
if [ $_uses_multi_device == 1 ]; then
455459
# Enforce some rules on the use of -md option
456460
# Note -md forces GPURUN_MASK_POLICY=nomask
457461
if [[ ! -z $ROCR_VISIBLE_DEVICES ]] ; then
458462
>&2 echo "ERROR: DO NOT PRESET ROCR_VISIBLE_DEVICES in gpurun multi-device (-md) mode"
459-
exit 1
463+
execOnError "$@"
460464
fi
461465
if [ $_devices_per_mdset -gt $_ri_num_devices ] ; then
462466
>&2 echo "ERROR: More devices requested ($_devices_per_mdset) than available ($_ri_num_devices)"
463-
exit 1
467+
execOnError "$@"
464468
fi
465469
_md_total_devices=$(( $_num_local_ranks * $_devices_per_mdset ))
466470
if [ $_md_total_devices -gt $_ri_num_devices ] && [ $_local_rank_num == 0 ] ; then
@@ -507,7 +511,7 @@ _gfxid=${_ss_gfxid[$_device_num]}
507511
_node_cus=$(( $_ri_num_devices * ${_ss_cucount[$_device_num]} ))
508512
if [ $_num_local_ranks -gt $_node_cus ] ; then
509513
>&2 echo "ERROR: Not enough CUs ($_node_cus) for $_num_local_ranks ranks "
510-
exit 1
514+
execOnError "$@"
511515
fi
512516

513517
_utilized_CUs_per_device=$_available_CUs_per_device

0 commit comments

Comments
 (0)