From 366ab9b2cd284eb933a3758cdefaab8bfbe675db Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Thu, 24 Jul 2025 15:54:24 -0700 Subject: [PATCH 01/10] Add a script to run a job on an arclight ecs container [infra] --- infrastructure/cinco/scripts/arclight_job.py | 151 +++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 infrastructure/cinco/scripts/arclight_job.py diff --git a/infrastructure/cinco/scripts/arclight_job.py b/infrastructure/cinco/scripts/arclight_job.py new file mode 100644 index 00000000..5e7d5f6c --- /dev/null +++ b/infrastructure/cinco/scripts/arclight_job.py @@ -0,0 +1,151 @@ +import argparse +# import time + +import boto3 + + +def task_template(): + return { + "capacityProviderStrategy": [ + {"capacityProvider": "FARGATE", "weight": 1, "base": 1}, + ], + "count": 1, + "platformVersion": "LATEST", + "enableECSManagedTags": True, + "enableExecuteCommand": True, + } + + +def get_service(cluster, env): + ecs_client = boto3.client("ecs", region_name="us-west-2") + arclight_service = ecs_client.describe_services( + cluster=cluster, + services=[f"cinco-arclight-{env}-service"], + ) + if arclight_service["failures"]: + print("Error retrieving ECS service for Arclight:") + print(arclight_service["failures"]) + raise ValueError(f"No service found for {env} environment.") + return arclight_service["services"][0] + + +def get_task_definition( + cluster, env, task_definition_revision=None, latest: bool = False +): + # if the task definition's revision number is explicitly specified, use that + # if --latest flag is specified, run using the latest ACTIVE task_definition + # otherwise, run this task using the same task definition as the arclight + # service. 
+ + # In services, when a task definition is updated (to deploy a new image + # version, for example), the service returns that new task definition + # revision, even if the deployment is still rolling out, so calling this + # script directly after a service update will still use the service's + # most recently specified task definition revision. + + td_family = "cinco-arclight-prd" if env == "prd" else "cinco-arclight-stage" + + if task_definition_revision is not None: + task_definition = f"{td_family}:{task_definition_revision}" + return task_definition + elif latest: + return td_family + else: + arclight_service = get_service(cluster, env) + task_definition = arclight_service["taskDefinition"] + return task_definition.split("/")[-1] + + +def get_service_network_config(cluster, env): + arclight_service = get_service(cluster, env) + network_config = arclight_service["networkConfiguration"] + return network_config + + +def main( + env: str, + command: list[str], + task_definition_revision: int = None, + latest: bool = False, +): + cluster = "cinco-prd" if env == "prd" else "cinco-stage" + task_definition = get_task_definition( + cluster, env, task_definition_revision, latest + ) + network_configuration = get_service_network_config(cluster, env) + + print(f"Running `{' '.join(command)}` on Cinco Arclight {env} in ECS") + + ecs_client = boto3.client("ecs", region_name="us-west-2") + resp = ecs_client.run_task( + **task_template(), + cluster=cluster, + taskDefinition=task_definition, + networkConfiguration=network_configuration, + overrides={ + "containerOverrides": [ + { + "name": f"cinco-arclight-{env}-container", + "command": [*command], + "memory": 2048, + } + ] + }, + ) + task_arn = [task["taskArn"] for task in resp["tasks"]][0] + print(f"Started task: {task_arn}") + + task_id = task_arn.split("/")[-1] + log_group_name = f"/ecs/cinco-arclight-{stack}" + log_stream_name = f"ecs/cinco-arclight-{stack}-container/{task_id}" + print( + "Tail logs with command:\n" + 
f"aws logs tail {log_group_name} --log-stream-name-prefix " + f"{log_stream_name} --region us-west-2" + ) + + print( + f"Session to the machine with command: (assuming you have a cdl-pad-prd profile)\n" + f"aws ecs execute-command --profile cdl-pad-prd " + f"--cluster arn:aws:ecs:us-west-2:777968769372:cluster/cinco-{env} " + f"--task {task_arn} " + f"--container cinco-arclight-{env}-container " + "--command /bin/bash --interactive" + ) + + +# python arclight_job.py rake +# python arclight_job.py bin/generate_static_pages +# python arclight_job.py sh -c tail -f /dev/null +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Run commands on an Arclight ECS instance" + ) + parser.add_argument( + "--prd", action="store_true", help="Use the production environment" + ) + parser.add_argument( + "--latest", action="store_true", help="Force using the latest task definition" + ) + parser.add_argument( + "--task-definition", + type=int, + default=None, + help="Task definition revision to use (default: same as running service)", + ) + parser.add_argument( + "command", nargs=argparse.REMAINDER, help="Command to pass to manage.py" + ) + + args = parser.parse_args() + + stack = "prd" if args.prd else "stage" + latest = True if args.latest else False + + if not args.command: + parser.error("You must provide a command to run.") + + main(stack, args.command, args.task_definition, latest) + # tail_logs = main(stack, args.command, args.task_definition) + # print(f"{bcolors.OKCYAN}[ECS_MANAGE]: {tail_logs}{bcolors.ENDC}") + # os.system(tail_logs) From 793e94a7f066ec766abc351115b96e267c0fafda Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Thu, 24 Jul 2025 18:08:45 -0700 Subject: [PATCH 02/10] Add arclight_job docs, build-static-findaids-sketch.md [infra, arclight] --- arclight/bin/build-static-findaids-sketch.md | 45 +++++++++++ infrastructure/cinco/scripts/arclight_job.py | 82 ++++++++++++++++---- 2 files changed, 113 insertions(+), 14 deletions(-) create 
mode 100644 arclight/bin/build-static-findaids-sketch.md diff --git a/arclight/bin/build-static-findaids-sketch.md b/arclight/bin/build-static-findaids-sketch.md new file mode 100644 index 00000000..65b0e10b --- /dev/null +++ b/arclight/bin/build-static-findaids-sketch.md @@ -0,0 +1,45 @@ + + +# 1. Query for a list of big finding aids using curl + +We have a $SOLR_URL in the container, already defined. + +Here's the query: +``` +$SOLR_URL/select? + fq=total_component_count_is%3A[4501%20TO%20*]& + indent=true& + q.op=OR& + q=level_ssim%3A%22Collection%22& + sort=total_component_count_is%20desc& + rows=300& + fl=total_component_count_is,id +``` + +- add last indexed date to returned fields for use in s3 metadata + +```curl $SOLR_URL...``` + +# 2. Start the rails server + +It's not currently running, since we issued a command override to start +this container, so: + +```/rails/bin/docker-entrypoint ./bin/rails server &``` + +# 3. Get the currently running application version + +This isn't currently anywhere in our image, the codebase isn't even a straight git checkout, and git isn't installed either, so we can't even run a git command. TODO: add this to our image (and our footer, while we're at it) + +# 3. For each item in our solr search results set + +- get the last indexed date +- get the ark + +``` +curl http://0.0.0.0:3000/findaid/static/$ARK -o /tmp/static.html +aws s3 cp /tmp/static.html s3://$S3_BUCKET/static_findaids/$ARK --metadata ArclightVersion=VERSION,LastIndexed=$LAST_INDEXED_DATE +``` + +- throttle requests so we don't overload solr +- stash in $S3_BUCKET/static_findaids/ (/static/ is Django's static files! don't overwrite!) 
diff --git a/infrastructure/cinco/scripts/arclight_job.py b/infrastructure/cinco/scripts/arclight_job.py index 5e7d5f6c..92935502 100644 --- a/infrastructure/cinco/scripts/arclight_job.py +++ b/infrastructure/cinco/scripts/arclight_job.py @@ -1,9 +1,61 @@ -import argparse -# import time +""" +Requires a python3 environment w/ boto3 installed, cdl-pad-prd profile +configured (as per cdl-ssm-util), and AWS_* environment variables. + +Runs an ephemeral arclight worker container (outside the ALB) in the +cinco-stage cluster (by default). Pass --prd to run an arclight worker +container in the cinco-prd cluster. + +Prints the Task ARN, an aws cli command to tail the logs, an aws cli +command to start an interactive session on the container, and an aws cli +command to stop the container. It does take ~ a minute for the task to +become accessible via `ecs execute-command` (or via the `session` +utility). + +By default, the arclight worker container is defined by the same task +definition as the currently running service definition (ie: if the +cinco-arclight-stage-service is running task definition version 6, then +this script will run a task using task definition version 6). + +You can also explicitly specify a task definition revision by passing +--task-definition REVISION, or use the --latest flag to run the +most recent active task definition revision. (Explicitly specifying +revision number is useful when chained together with task definition +updates in the build pipeline, for example.) + +You must specify a command to run on the ephemeral container. + +usage examples: + +to run a rake command in cinco-stage cluster: + python arclight_job.py rake ... 
+ +to manually index from s3 in cinco-prd cluster: + python arclight_job.py --prd bin/index-from-s3 [--preview] + +to build static finding aids in cinco-prd cluster: + python arclight_job.py --prd bin/build-static-findaids +to examine and explore the filesystem for an older arclight version: + python arclight_job.py --task-definition 4 sh -c "tail -f /dev/null" +""" + +import argparse import boto3 +class bcolors: + HEADER = "\033[95m" + OKBLUE = "\033[94m" + OKCYAN = "\033[96m" + OKGREEN = "\033[92m" + WARNING = "\033[93m" + FAIL = "\033[91m" + ENDC = "\033[0m" + BOLD = "\033[1m" + UNDERLINE = "\033[4m" + + def task_template(): return { "capacityProviderStrategy": [ @@ -13,6 +65,7 @@ def task_template(): "platformVersion": "LATEST", "enableECSManagedTags": True, "enableExecuteCommand": True, + "startedBy": "arclight_job.py", } @@ -74,7 +127,7 @@ def main( ) network_configuration = get_service_network_config(cluster, env) - print(f"Running `{' '.join(command)}` on Cinco Arclight {env} in ECS") + print(f"Running `{' '.join(command)}` on Cinco Arclight {env} in ECS\n") ecs_client = boto3.client("ecs", region_name="us-west-2") resp = ecs_client.run_task( @@ -93,30 +146,34 @@ def main( }, ) task_arn = [task["taskArn"] for task in resp["tasks"]][0] - print(f"Started task: {task_arn}") + print( + f"{bcolors.HEADER}Started task:{bcolors.ENDC} {bcolors.BOLD}{task_arn}{bcolors.ENDC}\n" + ) task_id = task_arn.split("/")[-1] log_group_name = f"/ecs/cinco-arclight-{stack}" log_stream_name = f"ecs/cinco-arclight-{stack}-container/{task_id}" print( - "Tail logs with command:\n" + f"{bcolors.HEADER}Tail logs with command:{bcolors.ENDC}\n" f"aws logs tail {log_group_name} --log-stream-name-prefix " - f"{log_stream_name} --region us-west-2" + f"{log_stream_name}\n" ) print( - f"Session to the machine with command: (assuming you have a cdl-pad-prd profile)\n" + f"{bcolors.HEADER}Session to the machine with command: (assuming you have a cdl-pad-prd profile){bcolors.ENDC}\n" f"aws ecs 
execute-command --profile cdl-pad-prd " f"--cluster arn:aws:ecs:us-west-2:777968769372:cluster/cinco-{env} " f"--task {task_arn} " f"--container cinco-arclight-{env}-container " - "--command /bin/bash --interactive" + f"--command /bin/bash --interactive\n" + ) + + print( + f"{bcolors.HEADER}Stop container with command:{bcolors.ENDC}\n" + f"aws ecs stop-task --cluster cinco-{env} --task {task_arn}\n" ) -# python arclight_job.py rake -# python arclight_job.py bin/generate_static_pages -# python arclight_job.py sh -c tail -f /dev/null if __name__ == "__main__": parser = argparse.ArgumentParser( description="Run commands on an Arclight ECS instance" @@ -146,6 +203,3 @@ def main( parser.error("You must provide a command to run.") main(stack, args.command, args.task_definition, latest) - # tail_logs = main(stack, args.command, args.task_definition) - # print(f"{bcolors.OKCYAN}[ECS_MANAGE]: {tail_logs}{bcolors.ENDC}") - # os.system(tail_logs) From e0ad426d117357f81ddcb92657c4f0c9efc2506d Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Fri, 25 Jul 2025 11:50:05 -0700 Subject: [PATCH 03/10] Add CINCO_VERSION.txt to arclight & cincoctrl images [infra] [manually deployed, not sceptre deployed] --- infrastructure/cinco/config/prd/arclight/build.yaml | 1 + infrastructure/cinco/config/prd/cincoctrl/build.yaml | 1 + infrastructure/cinco/config/stage/arclight/build.yaml | 1 + infrastructure/cinco/config/stage/cincoctrl/build.yaml | 1 + 4 files changed, 4 insertions(+) diff --git a/infrastructure/cinco/config/prd/arclight/build.yaml b/infrastructure/cinco/config/prd/arclight/build.yaml index fc0d495c..d8340289 100644 --- a/infrastructure/cinco/config/prd/arclight/build.yaml +++ b/infrastructure/cinco/config/prd/arclight/build.yaml @@ -29,6 +29,7 @@ sceptre_user_data: - TAG=`git describe --tags --abbrev=0` - REPO="$AWS_ACCOUNT_ID.dkr.ecr.us-west-2.amazonaws.com" - NEW_IMAGE="$REPO/cinco-arclight:$TAG" + - cp ./VERSION.txt arclight/CINCO_VERSION.txt - aws ecr get-login-password 
--region us-west-2 | docker login --username AWS --password-stdin $REPO - docker build -t cinco-arclight:$TAG arclight --file arclight/Dockerfile - docker tag cinco-arclight:$TAG $NEW_IMAGE diff --git a/infrastructure/cinco/config/prd/cincoctrl/build.yaml b/infrastructure/cinco/config/prd/cincoctrl/build.yaml index 7e1b9517..04be82e5 100644 --- a/infrastructure/cinco/config/prd/cincoctrl/build.yaml +++ b/infrastructure/cinco/config/prd/cincoctrl/build.yaml @@ -29,6 +29,7 @@ sceptre_user_data: - TAG=`git describe --tags --abbrev=0` - REPO="$AWS_ACCOUNT_ID.dkr.ecr.us-west-2.amazonaws.com" - NEW_IMAGE="$REPO/cinco-ctrl:$TAG" + - cp ./VERSION.txt cincoctrl/CINCO_VERSION.txt - aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin $REPO - docker build -t cinco-ctrl:$TAG cincoctrl --file cincoctrl/compose/production/django/Dockerfile - docker tag cinco-ctrl:$TAG $NEW_IMAGE diff --git a/infrastructure/cinco/config/stage/arclight/build.yaml b/infrastructure/cinco/config/stage/arclight/build.yaml index 67952932..36955cc8 100644 --- a/infrastructure/cinco/config/stage/arclight/build.yaml +++ b/infrastructure/cinco/config/stage/arclight/build.yaml @@ -26,6 +26,7 @@ sceptre_user_data: phases: build: commands: + - git rev-parse --short HEAD > arclight/CINCO_VERSION.txt - aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin ${AWS::AccountId}.dkr.ecr.us-west-2.amazonaws.com - docker build -t cinco-arclight arclight --file arclight/Dockerfile - docker tag cinco-arclight:latest ${AWS::AccountId}.dkr.ecr.us-west-2.amazonaws.com/cinco-arclight:latest diff --git a/infrastructure/cinco/config/stage/cincoctrl/build.yaml b/infrastructure/cinco/config/stage/cincoctrl/build.yaml index ba588cc8..17e4bcf9 100644 --- a/infrastructure/cinco/config/stage/cincoctrl/build.yaml +++ b/infrastructure/cinco/config/stage/cincoctrl/build.yaml @@ -29,6 +29,7 @@ sceptre_user_data: python: 3.12 build: commands: + - git rev-parse 
--short HEAD > cincoctrl/CINCO_VERSION.txt - aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin ${AWS::AccountId}.dkr.ecr.us-west-2.amazonaws.com - docker build -t cinco-ctrl cincoctrl --file cincoctrl/compose/production/django/Dockerfile - docker tag cinco-ctrl:latest ${AWS::AccountId}.dkr.ecr.us-west-2.amazonaws.com/cinco-ctrl:latest From 9da7bd5ba94dcba4260096527c77c5b291c79e30 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Fri, 25 Jul 2025 12:10:11 -0700 Subject: [PATCH 04/10] Assume SOLR_URL = solr leader url for arclight_job [infra] --- infrastructure/cinco/scripts/arclight_job.py | 27 ++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/infrastructure/cinco/scripts/arclight_job.py b/infrastructure/cinco/scripts/arclight_job.py index 92935502..70b9272a 100644 --- a/infrastructure/cinco/scripts/arclight_job.py +++ b/infrastructure/cinco/scripts/arclight_job.py @@ -56,6 +56,27 @@ class bcolors: UNDERLINE = "\033[4m" +def get_stack_outputs(stack_name): + """ + get the outputs of a cloudformation stack + """ + client = boto3.client("cloudformation", region_name="us-west-2") + cf_outputs = ( + client.describe_stacks(StackName=stack_name) + .get("Stacks", [{}])[0] + .get("Outputs", []) + ) + return {output["OutputKey"]: output["OutputValue"] for output in cf_outputs} + + +def get_solr_leader_url(env="stage"): + """ + get the url of the solr leader from the cloudformation stack + """ + outputs = get_stack_outputs(f"cinco-{env}-solr-solr") + return f"http://{outputs['LoadBalancerDNS']}/solr/arclight" + + def task_template(): return { "capacityProviderStrategy": [ @@ -140,6 +161,12 @@ def main( { "name": f"cinco-arclight-{env}-container", "command": [*command], + "environment": [ + { + "name": "SOLR_URL", + "value": get_solr_leader_url(env), + } + ], "memory": 2048, } ] From b46ac2950cdd795f4415d5636746714d2ad8c5e8 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Fri, 25 Jul 2025 13:53:44 -0700 Subject: 
[PATCH 05/10] Add build-static-findaids bash script [arclight] --- arclight/bin/build-static-findaids | 89 ++++++++++++++++++++ arclight/bin/build-static-findaids-sketch.md | 4 +- 2 files changed, 91 insertions(+), 2 deletions(-) create mode 100644 arclight/bin/build-static-findaids diff --git a/arclight/bin/build-static-findaids b/arclight/bin/build-static-findaids new file mode 100644 index 00000000..8ea95ee5 --- /dev/null +++ b/arclight/bin/build-static-findaids @@ -0,0 +1,89 @@ +#!/bin/bash +# filepath: /Users/awieliczka/Projects/cinco/arclight/bin/build-static-findaids-sketch.sh + +set -euo pipefail + +# Requires jq +echo -e "\nInstalling jq...\n" +apt-get update && apt-get install jq -y + +# 1. Query for a list of big finding aids using curl +# Requires $SOLR_URL and $S3_BUCKET to be set in the environment +SOLR_URL="${SOLR_URL:?SOLR_URL must be set}" +S3_BUCKET="${S3_BUCKET:?S3_BUCKET must be set}" +PAGE_SIZE=5 + +echo -e "\nQuerying Solr for large finding aids..." +results=$(curl -s --get "$SOLR_URL/select" \ + --data-urlencode 'fq=total_component_count_is:[4501 TO *]' \ + --data-urlencode 'indent=true' \ + --data-urlencode 'q.op=OR' \ + --data-urlencode 'q=level_ssim:"Collection"' \ + --data-urlencode 'sort=total_component_count_is desc' \ + --data-urlencode "rows=$PAGE_SIZE" \ + --data-urlencode 'fl=total_component_count_is,id,timestamp') + +# 2. Start the rails server (if not already running) +if ! pgrep -f "rails server" > /dev/null; then + echo -e "\nStarting Rails server..." + /rails/bin/docker-entrypoint ./bin/rails server > /tmp/rails-server.log 2>&1 & + sleep 10 # Give the server time to start +fi + +# 3. Get the currently running application version +if [[ -f CINCO_VERSION.txt ]]; then + VERSION=$(cat CINCO_VERSION.txt) +else + VERSION="unknown" +fi + +# 4. 
For each ark/last-indexed-date in our solr search results set +NUM_FOUND=$(echo "$results" | jq '.response.numFound') +PAGE_SIZE=$(echo "$results" | jq '.response.docs | length') +echo -e "\nProcessing Solr results... ($NUM_FOUND total results, $PAGE_SIZE per page)" + +START=0 + +while (( START < NUM_FOUND )); do + echo -e "\n===== Fetching Solr results page starting at $START... =====\n" + page_results=$(curl -s --get "$SOLR_URL/select" \ + --data-urlencode "fq=total_component_count_is:[50 TO 51]" \ + --data-urlencode 'indent=true' \ + --data-urlencode 'q.op=OR' \ + --data-urlencode 'q=level_ssim:"Collection"' \ + --data-urlencode 'sort=total_component_count_is desc' \ + --data-urlencode "rows=$PAGE_SIZE" \ + --data-urlencode "start=$START" \ + --data-urlencode 'fl=total_component_count_is,id,timestamp') + + echo "$page_results" | jq -c '.response.docs[]' | while read -r doc; do + ARK=$(echo "$doc" | jq -r '.id') + LAST_INDEXED_DATE=$(echo "$doc" | jq -r '.timestamp // empty') + if [[ -z "$ARK" ]]; then continue; fi + + echo "Fetching static HTML for $ARK..." + ENCODED_ARK=$(printf '%s' "$ARK" | jq -sRr @uri) + curl -s "http://0.0.0.0:3000/findaid/static/$ENCODED_ARK" -o /tmp/static_findaid.html + + if [[ ! -s /tmp/static_findaid.html ]]; then + echo -e "\033[31mFailed to fetch static HTML for $ARK, skipping upload.\033[0m\n" + rm -f /tmp/static_findaid.html + continue + fi + + echo -e "Uploading to S3: $S3_BUCKET/static_findaids/$ARK" + aws s3 cp /tmp/static_findaid.html "s3://$S3_BUCKET/static_findaids/$ARK" \ + --metadata "ArclightVersion=$VERSION,LastIndexed=$LAST_INDEXED_DATE" + + rm -f /tmp/static_findaid.html + + # Throttle requests to avoid overloading Solr + sleep 1 + done + + START=$(( START + PAGE_SIZE )) +done + +echo -e "\n\n===== Rails server log output =====" +cat /tmp/rails-server.log +echo "Done." 
diff --git a/arclight/bin/build-static-findaids-sketch.md b/arclight/bin/build-static-findaids-sketch.md index 65b0e10b..9392d3bc 100644 --- a/arclight/bin/build-static-findaids-sketch.md +++ b/arclight/bin/build-static-findaids-sketch.md @@ -29,9 +29,9 @@ this container, so: # 3. Get the currently running application version -This isn't currently anywhere in our image, the codebase isn't even a straight git checkout, and git isn't installed either, so we can't even run a git command. TODO: add this to our image (and our footer, while we're at it) +set VERSION = cat CINCO_VERSION.txt -# 3. For each item in our solr search results set +# 4. For each ark/last-indexed-date in our solr search results set - get the last indexed date - get the ark From 559a4ee91b8beab50f4e558ee15da84578dff1e2 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Fri, 25 Jul 2025 14:01:14 -0700 Subject: [PATCH 06/10] Build biggest 300 by default, but provide some params [arclight] --- arclight/bin/build-static-findaids | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/arclight/bin/build-static-findaids b/arclight/bin/build-static-findaids index 8ea95ee5..5f93c38b 100644 --- a/arclight/bin/build-static-findaids +++ b/arclight/bin/build-static-findaids @@ -3,6 +3,18 @@ set -euo pipefail +# Default values +PAGE_SIZE="${1:-300}" # Default to 100 if not provided +COMPONENT_COUNT_RANGE="${2:-4501 TO *}" + +# Usage info +if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then + echo "Usage: $0 [PAGE_SIZE] [COMPONENT_COUNT_RANGE]" + echo " PAGE_SIZE: Number of results per Solr page (default: 5)" + echo " COMPONENT_COUNT_RANGE: Solr range query for total_component_count_is (default: '4501 TO *')" + exit 0 +fi + # Requires jq echo -e "\nInstalling jq...\n" apt-get update && apt-get install jq -y @@ -11,11 +23,10 @@ apt-get update && apt-get install jq -y # Requires $SOLR_URL and $S3_BUCKET to be set in the environment SOLR_URL="${SOLR_URL:?SOLR_URL must be set}" 
S3_BUCKET="${S3_BUCKET:?S3_BUCKET must be set}" -PAGE_SIZE=5 echo -e "\nQuerying Solr for large finding aids..." results=$(curl -s --get "$SOLR_URL/select" \ - --data-urlencode 'fq=total_component_count_is:[4501 TO *]' \ + --data-urlencode "fq=total_component_count_is:[${COMPONENT_COUNT_RANGE}]" \ --data-urlencode 'indent=true' \ --data-urlencode 'q.op=OR' \ --data-urlencode 'q=level_ssim:"Collection"' \ @@ -39,7 +50,6 @@ fi # 4. For each ark/last-indexed-date in our solr search results set NUM_FOUND=$(echo "$results" | jq '.response.numFound') -PAGE_SIZE=$(echo "$results" | jq '.response.docs | length') echo -e "\nProcessing Solr results... ($NUM_FOUND total results, $PAGE_SIZE per page)" START=0 @@ -47,7 +57,7 @@ START=0 while (( START < NUM_FOUND )); do echo -e "\n===== Fetching Solr results page starting at $START... =====\n" page_results=$(curl -s --get "$SOLR_URL/select" \ - --data-urlencode "fq=total_component_count_is:[50 TO 51]" \ + --data-urlencode "fq=total_component_count_is:[${COMPONENT_COUNT_RANGE}]" \ --data-urlencode 'indent=true' \ --data-urlencode 'q.op=OR' \ --data-urlencode 'q=level_ssim:"Collection"' \ From ed653afc26ae14036c14f7a89b2f125a17b3dc64 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Fri, 25 Jul 2025 14:03:37 -0700 Subject: [PATCH 07/10] Add --memory and --cpu to arclight_job [infra, arclight] --- infrastructure/cinco/scripts/arclight_job.py | 22 ++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/infrastructure/cinco/scripts/arclight_job.py b/infrastructure/cinco/scripts/arclight_job.py index 70b9272a..3cee64c5 100644 --- a/infrastructure/cinco/scripts/arclight_job.py +++ b/infrastructure/cinco/scripts/arclight_job.py @@ -141,6 +141,8 @@ def main( command: list[str], task_definition_revision: int = None, latest: bool = False, + memory: int = 3072, + cpu: int = 1024, ): cluster = "cinco-prd" if env == "prd" else "cinco-stage" task_definition = get_task_definition( @@ -167,7 +169,8 @@ def main( 
"value": get_solr_leader_url(env), } ], - "memory": 2048, + "cpu": cpu, + "memory": memory, } ] }, @@ -203,7 +206,7 @@ def main( if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Run commands on an Arclight ECS instance" + description="Run commands on an Arclight ECS instance and Solr Leader URL." ) parser.add_argument( "--prd", action="store_true", help="Use the production environment" @@ -217,10 +220,21 @@ def main( default=None, help="Task definition revision to use (default: same as running service)", ) + parser.add_argument( + "--memory", + type=int, + default=3072, + help="Memory in MiB to allocate for the task (default: 3072 MiB)", + ) + parser.add_argument( + "--cpu", + type=int, + default=1024, + help="CPU units to allocate for the task (default: 1024)", + ) parser.add_argument( "command", nargs=argparse.REMAINDER, help="Command to pass to manage.py" ) - args = parser.parse_args() stack = "prd" if args.prd else "stage" @@ -229,4 +243,4 @@ def main( if not args.command: parser.error("You must provide a command to run.") - main(stack, args.command, args.task_definition, latest) + main(stack, args.command, args.task_definition, latest, args.memory, args.cpu) From ad1a5e5db249342294ddb1d357f18a1db7b228f1 Mon Sep 17 00:00:00 2001 From: Chad Nelson Date: Fri, 25 Jul 2025 14:21:11 -0700 Subject: [PATCH 08/10] Redirect large static guides to the arclight guide page. 
Lower the threshold for "large" --- arclight/app/controllers/static_finding_aid_controller.rb | 5 ++++- arclight/config/initializers/static_finding_aid.rb | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arclight/app/controllers/static_finding_aid_controller.rb b/arclight/app/controllers/static_finding_aid_controller.rb index 9ec16384..44a09671 100644 --- a/arclight/app/controllers/static_finding_aid_controller.rb +++ b/arclight/app/controllers/static_finding_aid_controller.rb @@ -227,7 +227,10 @@ class StaticFindingAidController < ApplicationController end def show + @document = search_service.fetch(::RSolr.solr_escape(params[:id])) + if !helpers.show_static_finding_aid_link?(@document) + redirect_to solr_document_path(@document), status: 302 + end @doc_tree = Oac::FindingAidTreeNode.new(self, params[:id]) - @document = @doc_tree.document end end diff --git a/arclight/config/initializers/static_finding_aid.rb b/arclight/config/initializers/static_finding_aid.rb index 2f996194..fb65d1c0 100644 --- a/arclight/config/initializers/static_finding_aid.rb +++ b/arclight/config/initializers/static_finding_aid.rb @@ -2,7 +2,7 @@ # too many to display a static finding aid. 
# # -Rails.application.config.child_component_limit = ENV["CHILD_COMPONENT_LIMIT"] || 4500 +Rails.application.config.child_component_limit = ENV["CHILD_COMPONENT_LIMIT"] || 1700 # Rails.application.config.disallowed_static_guides = [ From d1fe4b18c4cc6d2e4297ef1802aa5f2b62e290cb Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Fri, 25 Jul 2025 16:01:50 -0700 Subject: [PATCH 09/10] Add option to turn off static finding aid page caching [arclight] --- arclight/bin/build-static-findaids | 9 +++++++-- arclight/config/initializers/static_finding_aid.rb | 7 +++++++ arclight/lib/oac/finding_aid_tree_node.rb | 6 +++++- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/arclight/bin/build-static-findaids b/arclight/bin/build-static-findaids index 5f93c38b..2d78c87c 100644 --- a/arclight/bin/build-static-findaids +++ b/arclight/bin/build-static-findaids @@ -4,8 +4,8 @@ set -euo pipefail # Default values -PAGE_SIZE="${1:-300}" # Default to 100 if not provided -COMPONENT_COUNT_RANGE="${2:-4501 TO *}" +PAGE_SIZE="${1:-5}" # Default to 5 if not provided +COMPONENT_COUNT_RANGE="${2:-10001 TO 15000}" # Usage info if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then @@ -24,6 +24,11 @@ apt-get update && apt-get install jq -y SOLR_URL="${SOLR_URL:?SOLR_URL must be set}" S3_BUCKET="${S3_BUCKET:?S3_BUCKET must be set}" +# overwrite the CHILD_COMPONENT_LIMIT, set to 1700 by default +echo -e "\nSetting CHILD_COMPONENT_LIMIT to 1700..." +export CHILD_COMPONENT_LIMIT=15000 +export ENABLE_STATIC_GUIDE_CACHE=false + echo -e "\nQuerying Solr for large finding aids..." 
results=$(curl -s --get "$SOLR_URL/select" \ --data-urlencode "fq=total_component_count_is:[${COMPONENT_COUNT_RANGE}]" \ diff --git a/arclight/config/initializers/static_finding_aid.rb b/arclight/config/initializers/static_finding_aid.rb index fb65d1c0..171a2fef 100644 --- a/arclight/config/initializers/static_finding_aid.rb +++ b/arclight/config/initializers/static_finding_aid.rb @@ -5,6 +5,13 @@ Rails.application.config.child_component_limit = ENV["CHILD_COMPONENT_LIMIT"] || 1700 # +enable_static_guide_cache = ENV["ENABLE_STATIC_GUIDE_CACHE"] || true +if enable_static_guide_cache == "false" + Rails.application.config.enable_static_guide_cache = false +else + Rails.application.config.enable_static_guide_cache = true +end + Rails.application.config.disallowed_static_guides = [ "ark:/13030/c8tt4pp0" ] diff --git a/arclight/lib/oac/finding_aid_tree_node.rb b/arclight/lib/oac/finding_aid_tree_node.rb index dde2dd2f..56366e98 100644 --- a/arclight/lib/oac/finding_aid_tree_node.rb +++ b/arclight/lib/oac/finding_aid_tree_node.rb @@ -13,7 +13,11 @@ def initialize(controller, id, has_children: true) end def children - Rails.cache.fetch("#{@document.id}/children") do + if Rails.config.enable_static_guide_cache + Rails.cache.fetch("#{@document.id}/children") do + _get_children + end + else _get_children end end From 0da1c94304348ac5012a8f04e3cb264f7e90ffa0 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Fri, 25 Jul 2025 16:04:19 -0700 Subject: [PATCH 10/10] Should serve static findaids from s3 if found [arclight] --- .../static_finding_aid_controller.rb | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arclight/app/controllers/static_finding_aid_controller.rb b/arclight/app/controllers/static_finding_aid_controller.rb index 44a09671..cc320007 100644 --- a/arclight/app/controllers/static_finding_aid_controller.rb +++ b/arclight/app/controllers/static_finding_aid_controller.rb @@ -228,6 +228,28 @@ class StaticFindingAidController < ApplicationController 
def show @document = search_service.fetch(::RSolr.solr_escape(params[:id])) + + # get document's last indexed date and id + doc_id = @document.id + last_indexed = @document["timestamp"] + s3_key = "static_findaids/#{doc_id}" + + s3 = Aws::S3::Client.new(region: "us-west-2") + bucket = ENV.fetch("S3_BUCKET") + + begin + head = s3.head_object(bucket: bucket, key: s3_key) + s3_last_indexed = head.metadata["lastindexed"] + + if s3_last_indexed == last_indexed + html_obj = s3.get_object(bucket: bucket, key: s3_key) + send_data html_obj.body.read, type: "text/html", disposition: "inline" + return + end + rescue Aws::S3::Errors::NotFound + # object does not exist, continue as normal + end + if !helpers.show_static_finding_aid_link?(@document) redirect_to solr_document_path(@document), status: 302 end