From c42bb88e999d8b817a25a8e555693f2b1c70676c Mon Sep 17 00:00:00 2001 From: JoshKarpel Date: Mon, 15 Apr 2019 13:41:59 -0500 Subject: [PATCH 1/6] revised run_with_singularity based on Greg's advice --- htmap/run/run_with_singularity.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/htmap/run/run_with_singularity.sh b/htmap/run/run_with_singularity.sh index e64295fe..6e36fc51 100644 --- a/htmap/run/run_with_singularity.sh +++ b/htmap/run/run_with_singularity.sh @@ -6,6 +6,8 @@ img=$1 component=$2 # would otherwise default to user home dir -export SINGULARITY_CACHEDIR=${_CONDOR_SCRATCH_DIR} +d=${_CONDOR_SCRATCH_DIR}/.htmap_singularity +mkdir ${d} +export SINGULARITY_CACHEDIR=${d} -singularity exec --bind ${_CONDOR_SCRATCH_DIR}:/htmap/scratch ${img} bash -c "cd /htmap/scratch && python3 run.py ${component}" +singularity exec --bind ${_CONDOR_SCRATCH_DIR}:/tmp --workdir /tmp ${img} bash -c "python3 run.py ${component}" From 930a5d3f819ea4507624dc02dd43faa994270886 Mon Sep 17 00:00:00 2001 From: JoshKarpel Date: Mon, 15 Apr 2019 13:58:32 -0500 Subject: [PATCH 2/6] use --contain to not auto-bind /tmp --- htmap/run/run_with_singularity.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htmap/run/run_with_singularity.sh b/htmap/run/run_with_singularity.sh index 6e36fc51..d788bfd1 100644 --- a/htmap/run/run_with_singularity.sh +++ b/htmap/run/run_with_singularity.sh @@ -10,4 +10,4 @@ d=${_CONDOR_SCRATCH_DIR}/.htmap_singularity mkdir ${d} export SINGULARITY_CACHEDIR=${d} -singularity exec --bind ${_CONDOR_SCRATCH_DIR}:/tmp --workdir /tmp ${img} bash -c "python3 run.py ${component}" +singularity exec --contain --bind ${_CONDOR_SCRATCH_DIR}:/tmp --workdir /tmp ${img} bash -c "python3 run.py ${component}" From 0ce1c5e821f8c456530f22567728fe0507a06055 Mon Sep 17 00:00:00 2001 From: JoshKarpel Date: Tue, 21 May 2019 10:01:27 -0500 Subject: [PATCH 3/6] resolves #128, other small cleanup work --- docker/.htmaprc | 3 +++ docker/condor_config.local | 12 ++++++------ htmap/run/run.py | 7 ++++--- tests/cli/test_rerun.py | 6 +++--- tests/integration/test_held_components.py | 2 +- tests/integration/test_usage_tracking.py | 2 +- 6 files changed, 18 insertions(+), 14 deletions(-) diff --git a/docker/.htmaprc b/docker/.htmaprc index f5fcf97c..4754d7f3 100644 --- a/docker/.htmaprc +++ b/docker/.htmaprc @@ -1 +1,4 @@ DELIVERY_METHOD = "assume" + +[MAP_OPTIONS] +REQUEST_DISK = "100MB" diff --git a/docker/condor_config.local b/docker/condor_config.local index 260cfca4..6ea38045 100644 --- a/docker/condor_config.local +++ b/docker/condor_config.local @@ -14,12 +14,12 @@ EXECUTE=$(LOCAL_DIR)/execute CRED_STORE_DIR=$(LOCAL_DIR)/cred_dir # Tuning so jobs start quickly -SCHEDD_INTERVAL=5 -NEGOTIATOR_INTERVAL=2 -NEGOTIATOR_CYCLE_DELAY=5 -STARTER_UPDATE_INTERVAL=5 -SHADOW_QUEUE_UPDATE_INTERVAL=10 -UPDATE_INTERVAL=5 +SCHEDD_INTERVAL=1 +NEGOTIATOR_INTERVAL=1 +NEGOTIATOR_CYCLE_DELAY=1 +STARTER_UPDATE_INTERVAL=1 +SHADOW_QUEUE_UPDATE_INTERVAL=1 +UPDATE_INTERVAL=1 RUNBENCHMARKS=0 # Don't use all the machine resources diff --git a/htmap/run/run.py b/htmap/run/run.py index f38357ee..b70aac6e 100644 --- a/htmap/run/run.py +++ b/htmap/run/run.py @@ -179,8 +179,9 @@ def load_checkpoint(scratch_dir, transfer_dir): old_dir.rename(transfer_dir / curr_dir.name) -def clean_and_remake_dir(dir): - shutil.rmtree(dir, ignore_errors = True) +def clean_and_remake_dir(dir: Path): + if dir.exists(): + shutil.rmtree(dir) dir.mkdir() @@ -217,7 +218,6 @@ def main(component): result_or_error = func(*args, **kwargs) status = 'OK' print('\n----- MAP COMPONENT OUTPUT END -----\n') - except Exception as e: print('\n------- MAP COMPONENT ERROR --------\n') @@ -235,6 +235,7 @@ def main(component): ) status = 'ERR' + clean_and_remake_dir(scratch_dir / CHECKPOINT_CURRENT) clean_and_remake_dir(transfer_dir) save_output(component, status, result_or_error, transfer_dir) diff --git a/tests/cli/test_rerun.py b/tests/cli/test_rerun.py index 52d1ccd8..325414a6 100644 --- a/tests/cli/test_rerun.py +++ b/tests/cli/test_rerun.py @@ -20,7 +20,7 @@ def test_rerun_map(cli): m = htmap.map(str, range(1)) - m.wait() + m.wait(180) result = cli(['rerun', 'map', m.tag]) m.wait(180) @@ -30,7 +30,7 @@ def test_rerun_map(cli): def test_rerun_components(cli): m = htmap.map(str, [0, 1]) - m.wait() + m.wait(180) result = cli(['rerun', 'components', m.tag, '0 1']) m.wait(180) @@ -41,7 +41,7 @@ def test_rerun_components(cli): def test_rerun_components_out_range_cannot_rerun(cli): m = htmap.map(str, [0]) - m.wait() + m.wait(180) result = cli(['rerun', 'components', m.tag, '5']) diff --git a/tests/integration/test_held_components.py b/tests/integration/test_held_components.py index b022b088..92114859 100644 --- a/tests/integration/test_held_components.py +++ b/tests/integration/test_held_components.py @@ -27,7 +27,7 @@ def test_waiting_on_held_component_raises(mapped_doubler): time.sleep(1) # wait for it to propagate with pytest.raises(htmap.exceptions.MapComponentHeld): - m.wait() + m.wait(timeout = 180) def test_getting_held_component_raises(mapped_doubler): diff --git a/tests/integration/test_usage_tracking.py b/tests/integration/test_usage_tracking.py index 32ffe29b..0fe92aee 100644 --- a/tests/integration/test_usage_tracking.py +++ b/tests/integration/test_usage_tracking.py @@ -24,7 +24,7 @@ def test_memory_usage_is_nonzero_after_map_complete(): # need it run for at least 5 seconds for it generate an image size event m = htmap.map(lambda x: time.sleep(10), [None]) - m.wait() + m.wait(timeout = 180) print(m.memory_usage) assert all(x > 0 for x in m.memory_usage) From 0d4e44328c2eedb264e43d730f23b2ad0371d7d6 Mon Sep 17 00:00:00 2001 From: JoshKarpel Date: Tue, 21 May 2019 10:03:48 -0500 Subject: [PATCH 4/6] version history for #128 --- docs/source/versions/v0_3_2.rst | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 docs/source/versions/v0_3_2.rst diff --git a/docs/source/versions/v0_3_2.rst b/docs/source/versions/v0_3_2.rst new file mode 100644 index 00000000..527d1c77 --- /dev/null +++ b/docs/source/versions/v0_3_2.rst @@ -0,0 +1,23 @@ +v0.3.2 +====== + +New Features +------------ + +Bug Fixes +--------- + +* Hopefully finally resolved a recurring issue with checkpoint directories being + returned to the submit node after execution errors. + Issue: https://github.com/htcondor/htmap/issues/128 + +Known Issues +------------ + +* Execution errors that result in the job being terminated but no output being + produced are still not handled entirely gracefully. Right now, the component + state will just show as ``ERRORED``, but there won't be an actual error report. +* Map component state may become corrupted when a map is manually vacated. + Force-removal may be needed to clean up maps if HTCondor and HTMap disagree + about the state of their components. + Issue: https://github.com/htcondor/htmap/issues/129 From 6d59ee3212e3f4bfd38fa8a44b610265f0053521 Mon Sep 17 00:00:00 2001 From: JoshKarpel Date: Tue, 21 May 2019 21:14:55 -0500 Subject: [PATCH 5/6] version --- htmap/__init__.py | 2 +- htmap/run/run_with_singularity.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/htmap/__init__.py b/htmap/__init__.py index 4bf972cd..f6c9ce51 100644 --- a/htmap/__init__.py +++ b/htmap/__init__.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = '0.3.1' +__version__ = '0.3.2' from typing import Tuple as _Tuple import logging as _logging diff --git a/htmap/run/run_with_singularity.sh b/htmap/run/run_with_singularity.sh index d788bfd1..f92daff4 100644 --- a/htmap/run/run_with_singularity.sh +++ b/htmap/run/run_with_singularity.sh @@ -5,7 +5,7 @@ set -e img=$1 component=$2 -# would otherwise default to user home dir +# singularity cachedir would otherwise default to user home dir d=${_CONDOR_SCRATCH_DIR}/.htmap_singularity mkdir ${d} export SINGULARITY_CACHEDIR=${d} From 43ba0f8c841fe362e5d58e71df319d7ce0aaeee6 Mon Sep 17 00:00:00 2001 From: JoshKarpel Date: Thu, 23 May 2019 22:47:38 -0500 Subject: [PATCH 6/6] add pre-docs... --- docs/source/dependencies.rst | 6 ------ docs/source/versions/v0_3_2.rst | 3 +++ 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/docs/source/dependencies.rst b/docs/source/dependencies.rst index b2cdeb73..6f2f60fb 100644 --- a/docs/source/dependencies.rst +++ b/docs/source/dependencies.rst @@ -141,12 +141,6 @@ If you want to use your own Singularity image, just change the ``'SINGULARITY.IM When using this delivery method, HTMap will discover ``python3`` on the system ``PATH`` and use that to run your code. -.. warning:: - - This delivery method relies on the directory ``/htmap/scratch`` either existing in the Singularity image, or Singularity being able to run with ``overlayfs``. - If you get a ``stderr`` message from Singularity about a bind mount directory not existing, that's the problem. - - Assume Dependencies are Present ------------------------------- diff --git a/docs/source/versions/v0_3_2.rst b/docs/source/versions/v0_3_2.rst index 527d1c77..0c4a4646 100644 --- a/docs/source/versions/v0_3_2.rst +++ b/docs/source/versions/v0_3_2.rst @@ -4,6 +4,9 @@ v0.3.2 New Features ------------ +* Singularity delivery no longer requires a specially-named directory in the + container and/or overlays. + Bug Fixes ---------