-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
58709e6
commit f5e8a23
Showing
5 changed files
with
168 additions
and
9 deletions.
There are no files selected for viewing
17 changes: 8 additions & 9 deletions
17
dmsbatch/templates/alma87_mvapich2_20240426/autoscale_formula.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,10 @@ | ||
// Define time and task limits | ||
timesince = time() - time("{startTime}"); | ||
timelimit = TimeInterval_Minute * 10; | ||
// Get the percentage of pending-task samples available over the last second. Batch does not re-evaluate the formula more often than every 5 minutes anyway. | ||
$samples = $PendingTasks.GetSamplePercent(TimeInterval_Second * 1); | ||
// If you have fewer than 70 percent data points, use the last sample point, | ||
// otherwise use the maximum of last sample point and the history average. | ||
$tasks = $samples < 70 ? max(0,$PendingTasks.GetSample(1)) : max( $PendingTasks.GetSample(1), avg($PendingTasks.GetSample(TimeInterval_Second * 1))); | ||
// For multi instance tasks, set targetVMs to {num_hosts}, otherwise 0. | ||
${nodeType} = (timesince > timelimit ? ($tasks > 0 ? {num_hosts}: 0): {num_hosts}); | ||
// Set node deallocation mode - let running tasks finish before removing a node | ||
$NodeDeallocationOption = taskcompletion; | ||
// Get the average pending tasks over the past minute, with smoothing | ||
$samples = $PendingTasks.GetSamplePercent(TimeInterval_Minute); | ||
// If data points are fewer than 70%, use the last point, otherwise, take the max of last point and average | ||
$tasks = $samples < 70 ? max(0, $PendingTasks.GetSample(1)) : max($PendingTasks.GetSample(1), avg($PendingTasks.GetSample(TimeInterval_Minute))); | ||
${nodeType} = timesince > timelimit ? ($tasks > 0 ? {num_hosts}: 0): {num_hosts}; | ||
// Node deallocation - finish tasks before removing nodes | ||
$NodeDeallocationOption = taskcompletion; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
#!/bin/bash
################################################################################
# Script: combine_hotstart.sh
# Description: Prepare a combined hotstart file for a simulation and link it
#              as ./hotstart.nc.
#
# Usage: combine_hotstart.sh <days>
#   <days> >= 0 : combine the hotstart written at (or just before) that day.
#                 If that iteration is beyond the most recently written
#                 uncombined hotstart, the most recent one is used instead.
#   <days> <  0 : take the iteration from the last "TIME STEP" line among the
#                 final 100 lines of outputs/mirror.out, rounded down to a
#                 multiple of NHOT_WRITE.
#
# Must be run from a study directory that contains the outputs directory.
#
# Exit status:
#   0  hotstart generated and linked
#   1  bad usage, missing files, or combine step produced no output
#   2  no iteration could be read from mirror.out
################################################################################

# Print the help text.
usage() {
  echo "Usage: $0 <days>"
  echo "  <days>: Number of days for the simulation."
  echo ""
  echo "  Prepare a hotstart file for a simulation based on user input."
  echo ""
  echo "  The script will look for the last uncombined hotstart from outputs directory"
  echo "  and compare with the user input. If the user input is greater than the last"
  echo "  available hotstart files, the last hotstart file will be used instead."
  echo "  The script needs to be run in a study directory where the outputs directory is located."
}

# Print a diagnostic on stderr.
err() {
  printf '%s\n' "$*" >&2
}

# Check if the user provided the number of days as an argument
if [[ -z "${1:-}" ]]; then
  usage >&2
  exit 1
fi

# Reject anything that is not a (signed) integer. Without this check a value
# such as "abc" makes the numeric test fail and silently selects the
# mirror.out branch below.
int_re='^([+-]?)([0-9]+)$'
if [[ ! "$1" =~ $int_re ]]; then
  err "Error: <days> must be an integer, got '$1'."
  usage >&2
  exit 1
fi
# Force base 10 so that input like "010" is not read as octal.
simulation_days=$(( ${BASH_REMATCH[1]}10#${BASH_REMATCH[2]} ))

# NOTE that the following variables are hard-coded.
readonly SECONDS_PER_DAY=86400
readonly NHOT_WRITE=4800
readonly DT=90

# Directory containing uncombined hotstart files. Hard-coded.
readonly DIR_OUTPUTS="outputs"

# Find the most recently modified file whose name begins with
# "hotstart_000000". Globbing plus -nt avoids parsing the output of ls.
recent_file=""
for candidate in "${DIR_OUTPUTS}"/hotstart_000000*; do
  [[ -e "$candidate" ]] || continue
  if [[ -z "$recent_file" || "$candidate" -nt "$recent_file" ]]; then
    recent_file=$candidate
  fi
done

# Check if a file is found
if [[ -z "$recent_file" ]]; then
  err "No files found whose name begins with 'hotstart' in $DIR_OUTPUTS"
  exit 1
fi

# Extract the iteration number from "hotstart_000000_<iteration>.nc".
name_re='^hotstart_000000_([0-9]+)\.nc$'
if [[ ! "${recent_file##*/}" =~ $name_re ]]; then
  err "Cannot read an iteration number from file name: $recent_file"
  exit 1
fi
last_iteration=$(( 10#${BASH_REMATCH[1]} ))
last_day=$(( last_iteration * DT / SECONDS_PER_DAY ))

if (( simulation_days >= 0 )); then
  # Non-negative input: snap the requested day down to a day on which a
  # hotstart is written. Integer division here relies on NHOT_WRITE * DT
  # being a whole number of days (5 days with the constants above).
  output_interval=$(( NHOT_WRITE * DT / SECONDS_PER_DAY ))
  simulation_days=$(( simulation_days / output_interval * output_interval ))

  # Obtain number of iterations
  iterations=$(( simulation_days * SECONDS_PER_DAY / DT ))

  # If the requested iteration does not exist, use the latest file instead.
  if (( iterations > last_iteration )); then
    echo "There is no hotstart input with it=$iterations (day $simulation_days)."
    echo "Using the most recently generated hotstart input instead: it=$last_iteration (day $last_day)."
    iterations=$last_iteration
    simulation_days=$last_day
  fi
else
  # Negative input: use the iteration reported in mirror.out.
  echo "Reading number of iterations from mirror.out"
  iterations=$(
    tail -n 100 "${DIR_OUTPUTS}/mirror.out" \
      | awk '/TIME STEP/ { step = $3 } END { gsub(/;/, "", step); print step }'
  )
  # Validate before doing arithmetic; an empty value would otherwise raise a
  # shell arithmetic error.
  if [[ ! "$iterations" =~ ^[0-9]+$ ]]; then
    err "No iterations found in mirror.out"
    exit 2
  fi
  # Round down to the last iteration at which a hotstart was written.
  iterations=$(( 10#$iterations / NHOT_WRITE * NHOT_WRITE ))
  echo "$iterations"
  # Report the day that matches the chosen iteration rather than the
  # negative sentinel passed on the command line.
  simulation_days=$(( iterations * DT / SECONDS_PER_DAY ))
fi

# Generate hotstart file
echo "Generating 'hotstart_it=$iterations.nc' (day $simulation_days)"
if ! cd "$DIR_OUTPUTS"; then
  err "Cannot change into directory: $DIR_OUTPUTS"
  exit 1
fi
hotstart_input="hotstart_000000_$iterations.nc"
if [[ ! -f "$hotstart_input" ]]; then
  err "Hotstart files to combine not found: $DIR_OUTPUTS/$hotstart_input"
  err "Please check the outputs directory."
  exit 1
fi
combine_hotstart7 -i "$iterations"
HOTSTART_OUTPUT="hotstart_it=$iterations.nc"
# Success is judged by the presence of the combined file, not by the exit
# status of combine_hotstart7.
if [[ -f "$HOTSTART_OUTPUT" ]]; then
  echo "Hotstart file generated: $HOTSTART_OUTPUT"
  cd .. || exit 1
  ln -sf "${DIR_OUTPUTS}/${HOTSTART_OUTPUT}" hotstart.nc
else
  err "Failed to generate hotstart file: $HOTSTART_OUTPUT"
  exit 1
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# Combine the uncombined hotstart files for one iteration and link the result
# as ./hotstart.nc.
#
# Usage: generate_and_link_hotstart.sh <iterations>
#
# Must be run from a study directory that contains outputs/. This is a
# best-effort step: when the combined file cannot be produced a message is
# printed and the existing setup is left untouched.

#######################################
# Run combine_hotstart7 inside outputs/ and symlink the combined file.
# Arguments: $1 - iteration number of the hotstart to combine
# Outputs:   progress messages on stdout, diagnostics on stderr
# Returns:   status of the final command; failure to generate is not an error
#######################################
generate_and_link_hotstart() {
  local iterations=${1:-}
  local hotstart_output="hotstart_it=${iterations}.nc"

  # An empty or non-numeric iteration cannot name a hotstart file.
  if [[ ! "$iterations" =~ ^[0-9]+$ ]]; then
    echo "Invalid iteration number: '$iterations'" >&2
    echo "Continuing with existing setup"
    return 0
  fi

  # Run in a subshell so the caller's working directory never changes, and
  # skip the combine step when outputs/ cannot be entered (otherwise it would
  # run in the wrong directory and a following "cd .." would leave the study
  # directory).
  (cd outputs && combine_hotstart7 -i "$iterations")

  if [[ -f "outputs/$hotstart_output" ]]; then
    echo "Hotstart file generated: $hotstart_output"
    ln -sf "outputs/${hotstart_output}" hotstart.nc
  else
    echo "Failed to generate hotstart file: $hotstart_output"
    echo "Continuing with existing setup"
  fi
}

generate_and_link_hotstart "${1:-}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Print the iteration of the most recent hotstart write, derived from the last
# "TIME STEP" line among the final 100 lines of mirror.out and rounded down to
# a multiple of NHOT_WRITE.
#
# Usage: get_iterations_from_mirror.sh [mirror_file]
#   mirror_file defaults to outputs/mirror.out.
# Environment:
#   NHOT_WRITE  hotstart write interval in time steps (default 4800)

export NHOT_WRITE="${NHOT_WRITE:-4800}"

#######################################
# Read the last reported time step and round it down to a hotstart iteration.
# Globals:   NHOT_WRITE (read)
# Arguments: $1 - path to mirror.out (optional, default outputs/mirror.out)
# Outputs:   the iteration number on stdout; diagnostics on stderr
# Returns:   0 on success, 1 when no iteration can be determined
#######################################
get_iterations_from_mirror() {
  local mirror_file=${1:-outputs/mirror.out}
  local nhot_write=${NHOT_WRITE:-4800}
  local last_step

  if [[ ! "$nhot_write" =~ ^[0-9]+$ ]] || ((10#$nhot_write == 0)); then
    printf 'NHOT_WRITE must be a positive integer, got: %s\n' "$nhot_write" >&2
    return 1
  fi
  if [[ ! -r "$mirror_file" ]]; then
    printf 'Cannot read %s\n' "$mirror_file" >&2
    return 1
  fi

  # Third field of the last "TIME STEP" line, with the trailing ';' removed.
  last_step=$(
    tail -n 100 "$mirror_file" \
      | awk '/TIME STEP/ { step = $3 } END { gsub(/;/, "", step); print step }'
  )

  # Validate before doing arithmetic; an empty or non-numeric value would
  # otherwise raise a shell arithmetic error.
  if [[ ! "$last_step" =~ ^[0-9]+$ ]]; then
    printf 'No iterations found in %s\n' "$mirror_file" >&2
    return 1
  fi

  # 10# forces base 10 so values with leading zeros are not read as octal.
  echo "$((10#$last_step / 10#$nhot_write * 10#$nhot_write))"
}

get_iterations_from_mirror "$@"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
resource_group: dwrbdo_dsp # the resource group containing the batch account | ||
job_name: spot_dsp_2024_suisun_lhc_3 # job name, will be used to name the pool and the job | ||
batch_account_name: dwrbdodspbatch # batch account name | ||
storage_account_name: dwrbdodspsa # this is the storage account containing batch and storage_container defined below | ||
storage_container_name: test # this is mounted to $AZ_BATCH_MOUNTS_DIR/<<storage_container_name>> in addition to batch container which is mounted to $AZ_BATCH_MOUNTS_DIR/batch | ||
study_copy_flags: --recursive --preserve-symlinks --exclude-regex "outputs.*/.*nc" | ||
setup_dirs_copy_flags: --recursive --preserve-symlinks --include-regex "outputs_tropic/out2d_.*.nc;outputs_tropic/horizontalVel.*.nc;hgrid.*;.*vgrid.*;.*narr.*.nc;.*baydelta_schism.*.nc" | ||
study_dir: azure_dsp_2024_lhc_v3/simulations/suisun_lhc_3 # where the relative study directory for the simulation will be | ||
setup_dirs: # these are directories that are also copied in addition to the study_dir | ||
- atmos # try to avoid copying this and use symlink to mounted container instead (see mpi_command below) | ||
# - azure_dsp_2024_lhc_v3/simulations/baseline_lhc_3 | ||
num_hosts: 2 # number of nodes in the pool | ||
# num_cores: <<number of cores total>> # is optional as default is number of cores per host * number of hosts | ||
num_scribes: 10 # This is used in the mpi_cmd template if referred to there | ||
# mpi_command (below) is the command to run; it assumes study_dir is the current directory | ||
node_type: 'TargetLowPriorityNodes' # either 'TargetDedicatedNodes' or 'TargetLowPriorityNodes' | ||
max_task_retry_count: 3 | ||
mpi_opts: --bind-to core | ||
task_slots_per_node: 1 | ||
mpi_command: | | ||
cd sflux; | ||
rm -f *.nc; | ||
python make_links_az.py; | ||
cd ../; | ||
echo "SFLUX LINKS MADE"; | ||
source dsp_suisun_lhc_3.clinic.from_baseline.sh no-interp; | ||
if [ -f $AZ_BATCH_TASK_ID.state.txt ]; then | ||
echo $(date) >> $AZ_BATCH_TASK_ID.state.txt; | ||
current_date=$(date); | ||
echo "Restarting from previous run: $current_date"; | ||
iterations=$($SCHISM_SCRIPTS_HOME/batch/get_iterations_from_mirror.sh); | ||
echo "mirror.out @ $iterations. Copying hotstart_\d+_$iterations\.nc files"; | ||
azcopy copy --include-regex="hotstart_\d+_$iterations\.nc" --recursive "https://{storage_account_name}.blob.core.windows.net/{storage_container_name}/{study_dir}/outputs/?{sas}" . || true; | ||
echo "Generating and linking hotstart for $iterations"; | ||
$SCHISM_SCRIPTS_HOME/batch/generate_and_link_hotstart.sh $iterations; | ||
else | ||
echo $(date) > $AZ_BATCH_TASK_ID.state.txt; | ||
fi | ||
mpirun -np {num_cores} -f hostfile {mpi_opts} pschism_PREC_EVAP_GOTM_TVD-VL {num_scribes} | ||
# template for the pool name, which is used to create the pool with appropriate settings | ||
template_name: "alma87_mvapich2_20240426" # this is the template name for the pool, e.g. "centos7" or "alma8" | ||
delete_after_mins: 600 # delete the job after this many minutes |