Skip to content

Commit

Permalink
Merge branch 'RESTAPI-950-remove-home-error' into 'master'
Browse files Browse the repository at this point in the history
Remove home not mounted error

See merge request firecrest/firecrest!318
  • Loading branch information
Juan Pablo Dorsch committed Sep 11, 2024
2 parents e96570f + 2d529f2 commit 1fb8775
Show file tree
Hide file tree
Showing 6 changed files with 20 additions and 12 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Support for multiple JWT signature algorithms
- Added option to follow symbolic links in the `POST /utilities/compress` and `POST /storage/xfer-internal/compress` endpoints
- Added new "general" section to status/parameters describing `FIRECREST_VERSION` and `FIRECREST_BUILD` timestamp
- Environment variable `F7T_HOME_ENABLED`: set to `False` if `$HOME` is not mounted on the systems executing FirecREST commands

### Changed

Expand All @@ -27,6 +28,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fix parsing in `GET /utilities/ls` endpoint.
- The job fields `job_data_out` and `job_file_err` from `GET /compute/jobs` will be empty for jobs that are still pending (so that there is no confusion with older output/error files).
- Added retry on task creation workflow
- Error message when `$HOME` is not mounted

## [1.16.0]

Expand Down
1 change: 1 addition & 0 deletions deploy/demo/common/common.env
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ F7T_SYSTEMS_INTERNAL_ADDR='192.168.220.12:22;192.168.220.12:22'
#F7T_SYSTEMS_INTERNAL_COMPUTE_ADDR='192.168.220.12:22;192.168.220.12:22'
#F7T_SYSTEMS_INTERNAL_STORAGE_ADDR='192.168.220.12:22;192.168.220.12:22'
#F7T_SYSTEMS_INTERNAL_UTILITIES_ADDR='192.168.220.12:22;192.168.220.12:22'
#F7T_HOME_ENABLED=True
#-------
# COMPUTE options
# Base filesystem where job submission files will be stored.
Expand Down
1 change: 1 addition & 0 deletions deploy/k8s/config/templates/cm.common.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ data:
F7T_SSL_ENABLED: {{ .Values.F7T_SSL_ENABLED | default "true" | quote }}
F7T_SSL_CRT: {{ .Values.F7T_SSL_CRT | default "" | quote }}
F7T_SSL_KEY: {{ .Values.F7T_SSL_KEY | default "" | quote }}
F7T_HOME_ENABLED: {{ .Values.F7T_HOME_ENABLED | default "True" | quote }}
F7T_SYSTEMS_INTERNAL_ADDR: {{ .Values.F7T_SYSTEMS_INTERNAL_ADDR | quote }}
F7T_SYSTEMS_INTERNAL_STATUS_ADDR: {{ .Values.F7T_SYSTEMS_INTERNAL_STATUS_ADDR | default .Values.F7T_SYSTEMS_INTERNAL_ADDR | quote }}
F7T_SYSTEMS_INTERNAL_COMPUTE_ADDR: {{ .Values.F7T_SYSTEMS_INTERNAL_COMPUTE_ADDR | default .Values.F7T_SYSTEMS_INTERNAL_ADDR | quote }}
Expand Down
1 change: 1 addition & 0 deletions doc/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ The most complete way of installing is to setup 3 hosts:
|`F7T_UTILITIES_TIMEOUT` | NO | `5` | Value in **seconds** for timing out a login node command using `/utilities` | `Backend` |
|`F7T_PERSIST_HOST` | NO | `'127.0.0.1'` | Hostname or IP of the redis database used in `taskpersistence` container | `Backend` | Replaces `F7T_PERSISTENCE_IP` |
|`F7T_PERSIST_PORT` | NO | `'6379'` | Port number of the redis database used in `taskpersistence` container | `Backend` |
|`F7T_HOME_ENABLED` | NO | `True` | Set to `True` if the `$HOME` directory is mounted on the systems interfacing with FirecREST | `Backend` |
|`F7T_SPANK_PLUGIN_ENABLED` | NO | `False` | Set to `True` if the system scheduler uses a [spank](https://slurm.schedmd.com/spank.html) when submitting jobs. If there is more than one system configured, there should be a semicolon separated list in relative order to `F7T_SYSTEMS_PUBLIC_NAME` values | `Backend`| Replaces `F7T_USE_SPANK_PLUGIN` |
|`F7T_SPANK_PLUGIN_OPTION` | only if `F7T_SPANK_PLUGIN_ENABLED=True` | `--nohome`| Name of the option to use in the workload manager command. If there is more than one system configured, there should be a semicolon separated list in relative order to `F7T_SYSTEMS_PUBLIC_NAME` values | `Backend`|
|`F7T_COMPUTE_SCHEDULER` | NO | `'Slurm'`| Set to the name of the Workload Manager scheduler adapter class. By default it can be found in `/src/common/schedulers` | `Backend`|
Expand Down
13 changes: 8 additions & 5 deletions src/common/cscs_api_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,8 @@ def get_null_var(var):
OPA_URL = os.environ.get("F7T_OPA_URL","http://localhost:8181").strip('\'"')
OPA_POLICY_PATH = os.environ.get("F7T_OPA_POLICY_PATH","v1/data/f7t/authz").strip('\'"')


HOME_ENABLED = get_boolean_var(
os.environ.get("F7T_HOME_ENABLED", True))

### SSH key paths
PUB_USER_KEY_PATH = os.environ.get("F7T_PUB_USER_KEY_PATH", "/user-key.pub")
Expand Down Expand Up @@ -363,7 +364,7 @@ def create_certificate(headers, cluster_name, cluster_addr, command=None, option


# execute remote commands with Paramiko:
def exec_remote_command(headers, system_name, system_addr, action, file_transfer=None, file_content=None, no_home=False):
def exec_remote_command(headers, system_name, system_addr, action, file_transfer=None, file_content=None):

import paramiko, socket

Expand Down Expand Up @@ -530,14 +531,16 @@ def exec_remote_command(headers, system_name, system_addr, action, file_transfer
else:
result = {"error": 0, "msg": outlines}
elif stderr_errno > 0:
# Solving when stderr_errno = 1 and no_home plugin used (F7T_SPANK_PLUGIN_ENABLED)
# Solving when stderr_errno = 1 and $HOME is not mounted:
# stderr_errno = 1
# stderr_errda = "Could not chdir to home directory /users/eirinik: No such file or directory
# ERROR: you must specify a project account (-A <account>)sbatch: error: cli_filter plugin terminated with error"
if no_home and in_str(stderr_errda,"Could not chdir to home directory"):
if not HOME_ENABLED and in_str(stderr_errda, "Could not chdir to home directory"):
# checking for 2nd 'directory' string (first is at index 33)
# 2nd comes after username
idx = stderr_errda.index("directory",33)
logging.info(f"$HOME directory is not enabled"
f" (F7T_HOME_ENABLED={HOME_ENABLED})")
idx = stderr_errda.index("directory", 33)
# len(directory) = 9
result = {"error": stderr_errno, "msg": stderr_errda[idx+9:]}

Expand Down
14 changes: 7 additions & 7 deletions src/compute/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def submit_job_task(headers, system_name, system_addr, job_file, job_dir, accoun
ID = headers.get(TRACER_HEADER, '')
# create tmpdir for sbatch file
action = f"ID={ID} timeout {UTILITIES_TIMEOUT} mkdir -p -- '{job_dir}'"
retval = exec_remote_command(headers, system_name, system_addr, action, no_home=use_plugin)
retval = exec_remote_command(headers, system_name, system_addr, action)

if retval["error"] != 0:
app.logger.error(f"(Error creating directory: {retval['msg']}")
Expand All @@ -158,7 +158,7 @@ def submit_job_task(headers, system_name, system_addr, job_file, job_dir, accoun
# save the sbatch file in the target cluster FS
if job_file['content']:
action = f"ID={ID} cat > '{job_dir}/{job_file['filename']}'"
retval = exec_remote_command(headers, system_name, system_addr, action, file_transfer="upload", file_content=job_file['content'], no_home=use_plugin)
retval = exec_remote_command(headers, system_name, system_addr, action, file_transfer="upload", file_content=job_file['content'])
if retval["error"] != 0:
app.logger.error(f"(Error uploading file: {retval['msg']}")
update_task(task_id, headers, async_task.ERROR, "Failed to upload file")
Expand All @@ -171,7 +171,7 @@ def submit_job_task(headers, system_name, system_addr, job_file, job_dir, accoun
action = f"ID={ID} {scheduler_command}"
app.logger.info(action)

retval = exec_remote_command(headers, system_name, system_addr, action, no_home=use_plugin)
retval = exec_remote_command(headers, system_name, system_addr, action)

if retval["error"] != 0:
app.logger.error(f"(Error: {retval['msg']}")
Expand Down Expand Up @@ -231,7 +231,7 @@ def get_job_files(headers, system_name, system_addr, job_info, output=False, use

for n_try in range(n_tries):

resp = exec_remote_command(headers, system_name, system_addr, action, no_home=use_plugin)
resp = exec_remote_command(headers, system_name, system_addr, action)

# if there was an error, the result will be SUCCESS but outputs will not be available
if resp["error"] == 0:
Expand Down Expand Up @@ -265,12 +265,12 @@ def get_job_files(headers, system_name, system_addr, job_info, output=False, use
# tail -c {number_of_bytes} --> 1000B = 1KB

action = f"ID={ID} timeout {UTILITIES_TIMEOUT} tail -c {TAIL_BYTES} -- '{control_info['job_file_out']}'"
resp = exec_remote_command(headers, system_name, system_addr, action, no_home=use_plugin)
resp = exec_remote_command(headers, system_name, system_addr, action)
if resp["error"] == 0:
control_info["job_data_out"] = resp["msg"]

action = f"ID={ID} timeout {UTILITIES_TIMEOUT} tail -c {TAIL_BYTES} -- '{control_info['job_file_err']}'"
resp = exec_remote_command(headers, system_name, system_addr, action, no_home=use_plugin)
resp = exec_remote_command(headers, system_name, system_addr, action)
if resp["error"] == 0:
control_info["job_data_err"] = resp["msg"]

Expand All @@ -287,7 +287,7 @@ def submit_job_path_task(headers, system_name, system_addr, fileName, job_dir, a
action=f"ID={ID} {scheduler_command}"
app.logger.info(action)

resp = exec_remote_command(headers, system_name, system_addr, action, no_home=use_plugin)
resp = exec_remote_command(headers, system_name, system_addr, action)

# in case of error:
if resp["error"] != 0:
Expand Down

0 comments on commit 1fb8775

Please sign in to comment.