From dafad60e976ff8716d97e6085630c76a902e916a Mon Sep 17 00:00:00 2001 From: 1597463007 Date: Mon, 25 Aug 2025 16:56:35 -0400 Subject: [PATCH 1/4] Add Sphinx documentation and GitHub Pages workflow Introduces Sphinx-based documentation for the project, including configuration, requirements, and initial content for adapter and protocol documentation. Adds a GitHub Actions workflow to build and publish the documentation to GitHub Pages on release or manual trigger. Also includes .gitignore updates to exclude build artifacts and IDE files. --- .github/workflows/documentation.yml | 46 +++++++++++++++ .gitignore | 3 + docs/Makefile | 20 +++++++ docs/make.bat | 35 ++++++++++++ docs/requirements_docs.txt | 3 + docs/source/_static/schema.yaml | 86 +++++++++++++++++++++++++++++ docs/source/adapter/scaling.rst | 43 +++++++++++++++ docs/source/adapter/webhook.rst | 40 ++++++++++++++ docs/source/conf.py | 26 +++++++++ docs/source/index.rst | 35 ++++++++++++ docs/source/protocol/scheduler.rst | 4 ++ 11 files changed, 341 insertions(+) create mode 100644 .github/workflows/documentation.yml create mode 100644 .gitignore create mode 100644 docs/Makefile create mode 100644 docs/make.bat create mode 100644 docs/requirements_docs.txt create mode 100644 docs/source/_static/schema.yaml create mode 100644 docs/source/adapter/scaling.rst create mode 100644 docs/source/adapter/webhook.rst create mode 100644 docs/source/conf.py create mode 100644 docs/source/index.rst create mode 100644 docs/source/protocol/scheduler.rst diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml new file mode 100644 index 0000000..e6b0c98 --- /dev/null +++ b/.github/workflows/documentation.yml @@ -0,0 +1,46 @@ +name: Publish documentation to GitHub Pages + +on: + release: + types: [ created ] + workflow_dispatch: + +permissions: + contents: read + pages: write + id-token: write + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 + with: + egress-policy: audit + + - name: Checkout code + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Set up Python 3.10 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: "3.10" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install uv + uv pip install --system -r docs/requirements_docs.txt + + - name: Build documentation + run: cd docs && make html + + - name: Upload Pages artifact + uses: actions/upload-pages-artifact@56afc609e74202658d3ffba0e8f6dda462b719fa # v3.0.1 + with: + path: './docs/build/html' + + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # v4.0.5 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..64e0691 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.idea/ + +docs/build/ diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..dc1312a --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/requirements_docs.txt b/docs/requirements_docs.txt new file mode 100644 index 0000000..30fd621 --- /dev/null +++ b/docs/requirements_docs.txt @@ -0,0 +1,3 @@ +sphinx +sphinx_rtd_theme +sphinxcontrib-openapi diff --git a/docs/source/_static/schema.yaml b/docs/source/_static/schema.yaml new file mode 100644 index 0000000..4d41654 --- /dev/null +++ b/docs/source/_static/schema.yaml @@ -0,0 +1,86 @@ +swagger: "2.0" +info: + title: OpenGRIS Adapter Webhook OpenAPI Schema + version: "1.0.0" +paths: + /: + post: + summary: Webhook endpoint for OpenGRIS Adapter + parameters: + - in: body + name: body + required: true + schema: + type: object + properties: + action: + description: Action to perform. Valid values are 'start_worker' and 'shutdown_worker'. + type: string + enum: + - start_worker + - shutdown_worker + worker_id: + description: | + Unique identifier for the worker instance. **Note**: This ID might be ignored when performing the + 'start_worker` action, as the server may generate its own ID instead. + type: string + metadata: + description: | + Optional metadata provided by the scheduler's scaling policy. May include details like instance type, + region, image tags, etc. + type: object + auth_token: + description: | + Optional authentication token for securing the webhook from unauthorized requests. Must match the + server's expected token if provided. + type: string + required: + - action + - worker_id + responses: + 200: + description: Successful response + schema: + type: object + properties: + status: + description: Status of the requested action. + type: string + worker_id: + description: | + Unique identifier for the worker instance. **Note**: This ID may not match the one provided in the + request if the server generates its own ID when starting a new worker. + type: string + metadata: + description: | + Optional metadata about the worker instance. May include details like instance type, + region, image tags, etc. + type: object + properties: { } + required: + - status + - worker_id + 400: + description: Bad Request, e.g., missing required fields or invalid action + schema: + $ref: '#/definitions/Error' + 401: + description: Unauthorized, e.g., invalid or missing auth_token + schema: + $ref: '#/definitions/Error' + 429: + description: Too Many Requests, e.g. rate limiting, no available resources + schema: + $ref: '#/definitions/Error' + 500: + description: Internal Server Error, e.g., server-side or infrastructure related issues + schema: + $ref: '#/definitions/Error' +definitions: + Error: + type: object + properties: + error: + type: string + required: + - error diff --git a/docs/source/adapter/scaling.rst b/docs/source/adapter/scaling.rst new file mode 100644 index 0000000..2bc13de --- /dev/null +++ b/docs/source/adapter/scaling.rst @@ -0,0 +1,43 @@ +Scaling Controller +================== + +The scaling controller determines when to start and stop workers based on task and other state updates emitted by +the scheduler. The scaling controller implements a scaling policy that can suit specific needs such as: + +- Cost optimized: Start and stop workers based on task queue length and worker idle time. +- Performance optimized: Start workers as soon as tasks are available. +- Time constrained: Start workers based on a schedule. +- Resource constrained: Start and stop workers based on available budget or resource quotas. + +To perform a scaling action, the scaling controller sends a webhook request to the adapter. The adapter then starts or +stops workers as requested. For more details on the adapter, see the `Adapter Webhook API `_. + +Scaling Controller Interface +---------------------------- + +.. code-block:: python + + class ScalingController(Reporter, abc.ABC): + async def on_state_client(self, state_client: StateClient): + pass + + async def on_state_object(self, state_object: StateObject): + pass + + async def on_state_balance_advice(self, state_balance_advice: StateBalanceAdvice): + pass + + async def on_state_scheduler(self, state_scheduler: StateScheduler): + pass + + async def on_state_worker(self, state_worker: StateWorker): + pass + + async def on_state_task(self, state_task: StateTask): + pass + + async def on_state_graph_task(self, state_graph_task: StateGraphTask): + pass + +In most cases, the scaling controller will only need to implement the ``on_state_task`` and ``on_state_worker`` +handlers. The other handlers are provided for completeness and future use cases. diff --git a/docs/source/adapter/webhook.rst b/docs/source/adapter/webhook.rst new file mode 100644 index 0000000..0d18943 --- /dev/null +++ b/docs/source/adapter/webhook.rst @@ -0,0 +1,40 @@ +Adapter Webhook API +=================== + +The Adapter Webhook API provides a single unified endpoint for the scheduler to request the start and stop of workers. + +Here is a high-level diagram illustrating how the Adapter Webhook API fits into the overall architecture: + +.. code-block:: + + Start ┌────────┐ + ┌──────────► Worker │ + │ └────────┘ + │ + ┌───────────┐ Webhook Requests ┌───────┴─┐ Start ┌────────┐ + │ Scaling ├───────────────────► Adapter ├────────► Worker │ + │ Policy │ └───────┬─┘ └────────┘ + └─────▲─────┘ │ + │ │ Start ┌────────┐ + │ Task Updates └──────────► Worker │ + │ └────────┘ + ┌─────┴─────┐ + │ Scheduler │ + └───────────┘ + +Webhook Specification +--------------------- + +.. openapi:: ../_static/schema.yaml + +Scaling Controller Interaction +------------------------------ + +Interactions with the scaling controller can be complex given the possbility of dependency failures, network issues, +delays, etc. Here are some guidelines to ensure the scaling controller can communicate with the adapter reliably: + +- **Return the correct error code**: The scaling controller relies on the adapter to return the correct HTTP status code + to indicate success or failure of the request and whether the request should be retried. +- **Return when the action is finished**: The scaling controller relies on the adapter to return a response only when + the requested action is finished. This ensures that the scaling controller has an accurate view of the current + state of workers. diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..8777a64 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,26 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "OpenGRIS" +copyright = "2025, Citi" +author = "Citi" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = ["sphinxcontrib.openapi"] + +templates_path = ["_templates"] +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "sphinx_rtd_theme" +html_static_path = ["_static"] diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..4281a38 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,35 @@ +OpenGRIS: Open Standard for Grid Resource Scheduling +==================================================== + +OpenGRIS is an open standard for grid resource scheduling. It provides a standardized way to prepare, distribute, and +execute tasks across elastic and heterogeneous computing environments. + +OpenGRIS contains the following components: + +- Scheduler: Manages tasks using a stable and language-agnostic communication protocol. + - Reference scheduler implementation: `OpenGRIS Scaler `_. +- Worker: Executes tasks assigned by the scheduler. + - Reference worker implementation is part of OpenGRIS Scaler. + - IBM Spectrum Symphony worker implementation: + `Worker Code `_. +- Adapter: Interfaces with the scheduler to start and stop workers. + - Reference adapter implementation for VMs: + `VM Adapter Pull Request `_. + - Documentation: `Adapter Webhook API `_. +- Client Libraries: Helper libraries for implicit parallelization. + - Python client library is part of OpenGRIS Scaler. + - Graph parallelization library: `OpenGRIS Pargraph `_. + - Map-reduce parallelization library: `OpenGRIS Parfun `_. +- Object Storage: For sharing data objects between clients, schedulers, and workers. + - Reference object storage implementation: + `OpenGRIS Object Storage `_. + +Contents +======== + +.. toctree:: + :maxdepth: 2 + + protocol/scheduler + adapter/scaling + adapter/webhook diff --git a/docs/source/protocol/scheduler.rst b/docs/source/protocol/scheduler.rst new file mode 100644 index 0000000..4251df3 --- /dev/null +++ b/docs/source/protocol/scheduler.rst @@ -0,0 +1,4 @@ +Scheduler Communication Protocol +================================ + +TODO \ No newline at end of file From ecff34c21184e7a8c5623a4547c2914686bc0d6a Mon Sep 17 00:00:00 2001 From: 1597463007 Date: Mon, 25 Aug 2025 17:09:30 -0400 Subject: [PATCH 2/4] Add missing imports to scaling controller interface --- docs/source/adapter/scaling.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/source/adapter/scaling.rst b/docs/source/adapter/scaling.rst index 2bc13de..9b5adff 100644 --- a/docs/source/adapter/scaling.rst +++ b/docs/source/adapter/scaling.rst @@ -17,6 +17,19 @@ Scaling Controller Interface .. code-block:: python + import abc + + from scaler.protocol.python.message import ( + StateBalanceAdvice, + StateClient, + StateGraphTask, + StateObject, + StateScheduler, + StateTask, + StateWorker + ) + from scaler.utility.mixins import Reporter + class ScalingController(Reporter, abc.ABC): async def on_state_client(self, state_client: StateClient): pass From af08fa61a9e2eafed2ed76f50d3ffdeb2ad02c0f Mon Sep 17 00:00:00 2001 From: 1597463007 Date: Fri, 29 Aug 2025 09:34:36 -0400 Subject: [PATCH 3/4] Expand scaling and webhook adapter documentation --- docs/source/adapter/scaling.rst | 22 +++++- docs/source/adapter/webhook.rst | 127 +++++++++++++++++++++++++------- docs/source/index.rst | 23 ++++++ 3 files changed, 145 insertions(+), 27 deletions(-) diff --git a/docs/source/adapter/scaling.rst b/docs/source/adapter/scaling.rst index 9b5adff..96d99f3 100644 --- a/docs/source/adapter/scaling.rst +++ b/docs/source/adapter/scaling.rst @@ -10,7 +10,7 @@ the scheduler. The scaling controller implements a scaling policy that can suit - Resource constrained: Start and stop workers based on available budget or resource quotas. To perform a scaling action, the scaling controller sends a webhook request to the adapter. The adapter then starts or -stops workers as requested. For more details on the adapter, see the `Adapter Webhook API `_. +stops workers as requested. Scaling Controller Interface ---------------------------- @@ -54,3 +54,23 @@ Scaling Controller Interface In most cases, the scaling controller will only need to implement the ``on_state_task`` and ``on_state_worker`` handlers. The other handlers are provided for completeness and future use cases. + +Implementing a Scaling Controller +--------------------------------- + +Some pointers to implement a robust Scaling Controller: + +- Receive scheduler state updates via the provided on_state_* handlers and keep a small internal model of: + - current running workers and their states + - pending tasks and backlog size + - recent failures or rate-limit signals from the adapter +- Decide when to scale up: + - if backlog > threshold OR average task wait time > latency_target, issue a start_worker webhook + - include suggested instance type, region, and any labels in metadata +- Decide when to scale down: + - if backlog is small or task wait time is low, issue a shutdown_worker webhook for specific worker_id +- Validate webhook responses: only treat action as completed when adapter returns 200 and the returned worker_id is recorded +- Use exponential backoff for retries when receiving 429 or 5xx responses from the adapter +- Idempotency: you may include a request identifier in metadata if your adapter supports it, or detect repeated requests + and avoid duplicate actions. +- Observability: emit metrics for actions issued, successful starts/stops, failures, and retries diff --git a/docs/source/adapter/webhook.rst b/docs/source/adapter/webhook.rst index 0d18943..2eb8180 100644 --- a/docs/source/adapter/webhook.rst +++ b/docs/source/adapter/webhook.rst @@ -3,38 +3,113 @@ Adapter Webhook API The Adapter Webhook API provides a single unified endpoint for the scheduler to request the start and stop of workers. -Here is a high-level diagram illustrating how the Adapter Webhook API fits into the overall architecture: - -.. code-block:: - - Start ┌────────┐ - ┌──────────► Worker │ - │ └────────┘ - │ - ┌───────────┐ Webhook Requests ┌───────┴─┐ Start ┌────────┐ - │ Scaling ├───────────────────► Adapter ├────────► Worker │ - │ Policy │ └───────┬─┘ └────────┘ - └─────▲─────┘ │ - │ │ Start ┌────────┐ - │ Task Updates └──────────► Worker │ - │ └────────┘ - ┌─────┴─────┐ - │ Scheduler │ - └───────────┘ - Webhook Specification --------------------- .. openapi:: ../_static/schema.yaml +Webhook Request Examples +------------------------ + +Example: start_worker (minimal) + +.. code-block:: json + + { + "action": "start_worker", + "worker_id": "" + } + +Example: start_worker (with suggested worker_id and auth) + +.. code-block:: json + + { + "action": "start_worker", + "worker_id": "worker-12345", + "metadata": { + "instance_type": "m8gd.16xlarge", + "region": "us-east-1", + "image_tag": "python:3.11-alpine", + "tags": ["python"] + }, + "auth_token": "" + } + +Example: shutdown_worker + +.. code-block:: json + + { + "action": "shutdown_worker", + "worker_id": "worker-12345" + } + +Webhook Reponse Examples +------------------------ + +Success (200) + +.. code-block:: json + + { + "status": "ok", + "worker_id": "worker-12345", + "metadata": { + "instance_type": "m8gd.16xlarge", + "region": "us-east-1" + } + } + +Client error (400) + +.. code-block:: json + + { + "error": "missing required field 'action'" + } + +Unauthorized (401) + +.. code-block:: json + + { + "error": "invalid auth_token" + } + +Rate limit / resource unavailable (429) + +.. code-block:: json + + { + "error": "no available capacity, retry later" + } + +Server error (500) + +.. code-block:: json + + { + "error": "internal provisioning error" + } + Scaling Controller Interaction ------------------------------ -Interactions with the scaling controller can be complex given the possbility of dependency failures, network issues, -delays, etc. Here are some guidelines to ensure the scaling controller can communicate with the adapter reliably: +Interactions with the scaling controller can be complex given the possibility of dependency failures, network issues, +delays, etc. Here are some guidelines to ensure reliable communication between the scaling controller and the adapter: -- **Return the correct error code**: The scaling controller relies on the adapter to return the correct HTTP status code - to indicate success or failure of the request and whether the request should be retried. -- **Return when the action is finished**: The scaling controller relies on the adapter to return a response only when - the requested action is finished. This ensures that the scaling controller has an accurate view of the current - state of workers. +- **Return the correct error code**: Return an appropriate HTTP status to indicate how the controller should proceed: + - 200: action completed successfully (include resulting worker_id) + - 400: client error — fix the request (do not retry) + - 401: unauthorized — reject immediately + - 429: retryable (rate limiting or temporary lack of capacity) — controller should back off and retry + - 5xx: server error — controller should retry with backoff +- **Blocking vs asynchronous actions**: The adapter SHOULD complete the requested action before returning 200. If the + adapter cannot complete the action synchronously, return 202 Accepted and provide a status endpoint or include a + request_id in the response so the controller can poll for completion. +- **Idempotency**: Support idempotent requests. If a controller retries the same start/stop request, the adapter should + detect duplicates (eg. via a request id in metadata) and avoid double-provisioning. +- **Validation**: Validate incoming payloads and return 400 for malformed requests. Include a helpful error message in the + body following the Error schema. +- **Security**: Require an auth_token or use mutual TLS. Reject invalid auth with 401. diff --git a/docs/source/index.rst b/docs/source/index.rst index 4281a38..388909d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -4,9 +4,32 @@ OpenGRIS: Open Standard for Grid Resource Scheduling OpenGRIS is an open standard for grid resource scheduling. It provides a standardized way to prepare, distribute, and execute tasks across elastic and heterogeneous computing environments. +.. https://asciiflow.com/#/share/eJytVLFuwjAQ%2FRXkmaVVh8JW8QEdaMWSxQQLKK6NHFOBEFKFOnZgiGiGjhk7VZ0qviZf0tgVJol9TlphWcLBd%2B%2B9e3fJGjH8SFCXLShtI4pXRKAuWgfoiYhoylmAupftAC3z387VRX5aqX%2Bu1UmSpcwfAtSqrix%2BzeJn795ZSfX5KicImCN82w8xnbLx70NrQIYTzmeRB3X%2FnQcOuJgRkcXvhgGA73EmBae0FAzWBZe%2F0%2FJ0SOy6TzSbW4QzoZJsgo8idZk3IzyXJ%2Bmp32IjwNMFQ7T%2FtM3yqk3Kd%2B42pLmAAnvao1PCZNn6XYnQltDMP2O4ToKDCqDqdIejWet%2BPsKSRABhSc%2BJALTUff8Feg9U6e9q0WOXqQfTk344IaOFHvhtldQ%2F37UvWdrUbuv%2BAxr4Qqhn4htNgL9BsPU1M789R3sSrwBAyv%2BbYOD2By%2Bdr%2FyyvVYOPEgN1xG89utobe3L7fCBhFKhZG8vf4Y4z05c33slqS%2B5wGNS1%2B3iAqDgyWwMjDZo8wNBgNAj) + +.. code-block:: + + ┌──────────┐ ┌──────┐ + │Scaling │ Webhooks ┌───►│Worker├──┐ + │Controller├─────────┐ ┌───────┐ │ └──────┘ │ + └──────────┘ ├──►│Adapter├─┤ ┌──────┐ │ + ┌──────┐ ▲ │ └───────┘ └───►│Worker├──┤ + ┌──┤Client├───┐ │ │ └──────┘ │ + │ └──────┘ │ │ Task Updates │ │ + │ ┌──────┐ │ ┌────┴────┐ │ ┌──────┐ │ + ├──┤Client├───┼─►│Scheduler│ │ ┌───────┐ ┌───►│Worker├──┤ + │ └──────┘ │ └────┬────┘ └──►│Adapter├─┤ └──────┘ │ + │ ┌──────┐ │ │ └───────┘ │ ┌──────┐ │ + ├──┤Client├───┘ │ └───►│Worker├──┤ + │ └──────┘ ▼ └──────┘ │ + │ ┌───────┐ │ + └────────────────►│Object │◄───────────────────────────────────────┘ + │Storage│ + └───────┘ + OpenGRIS contains the following components: - Scheduler: Manages tasks using a stable and language-agnostic communication protocol. + - Includes the Scaling Controller which issues webhook requests to the Adapters. - Reference scheduler implementation: `OpenGRIS Scaler `_. - Worker: Executes tasks assigned by the scheduler. - Reference worker implementation is part of OpenGRIS Scaler. From ce152a1a8bc2b01be3efb00515dc5115fa2a7dfa Mon Sep 17 00:00:00 2001 From: 1597463007 Date: Fri, 12 Sep 2025 13:06:20 -0400 Subject: [PATCH 4/4] Update scaling docs to use worker groups Replaces references to individual workers with worker groups in schema.yaml and documentation. Updates webhook request and response examples, handler method names, and scaling controller implementation notes to reflect the new worker group model. --- docs/source/_static/schema.yaml | 26 +++++++++++--------- docs/source/adapter/scaling.rst | 42 +++++---------------------------- docs/source/adapter/webhook.rst | 18 +++++++------- 3 files changed, 30 insertions(+), 56 deletions(-) diff --git a/docs/source/_static/schema.yaml b/docs/source/_static/schema.yaml index 4d41654..685c124 100644 --- a/docs/source/_static/schema.yaml +++ b/docs/source/_static/schema.yaml @@ -14,15 +14,15 @@ paths: type: object properties: action: - description: Action to perform. Valid values are 'start_worker' and 'shutdown_worker'. + description: Action to perform. Valid values are 'start_worker_group' and 'shutdown_worker_group'. type: string enum: - - start_worker - - shutdown_worker - worker_id: + - start_worker_group + - shutdown_worker_group + worker_group_id: description: | - Unique identifier for the worker instance. **Note**: This ID might be ignored when performing the - 'start_worker` action, as the server may generate its own ID instead. + Unique identifier for the worker group. **Note**: This ID might be ignored when performing + the 'start_worker_group` action, as the server may generate its own ID instead. type: string metadata: description: | @@ -36,7 +36,6 @@ paths: type: string required: - action - - worker_id responses: 200: description: Successful response @@ -46,11 +45,17 @@ paths: status: description: Status of the requested action. type: string - worker_id: + worker_group_id: description: | - Unique identifier for the worker instance. **Note**: This ID may not match the one provided in the - request if the server generates its own ID when starting a new worker. + Unique identifier for the worker group. **Note**: This ID may not match the one provided in + the request if the server generates its own ID when starting a new worker. type: string + worker_ids: + description: | + List of unique identifiers for the worker instances within the group. + type: array + items: + type: string metadata: description: | Optional metadata about the worker instance. May include details like instance type, @@ -59,7 +64,6 @@ paths: properties: { } required: - status - - worker_id 400: description: Bad Request, e.g., missing required fields or invalid action schema: diff --git a/docs/source/adapter/scaling.rst b/docs/source/adapter/scaling.rst index 96d99f3..246c084 100644 --- a/docs/source/adapter/scaling.rst +++ b/docs/source/adapter/scaling.rst @@ -19,57 +19,27 @@ Scaling Controller Interface import abc - from scaler.protocol.python.message import ( - StateBalanceAdvice, - StateClient, - StateGraphTask, - StateObject, - StateScheduler, - StateTask, - StateWorker - ) + from scaler.protocol.python.message import InformationSnapshot from scaler.utility.mixins import Reporter class ScalingController(Reporter, abc.ABC): - async def on_state_client(self, state_client: StateClient): + def get_status(self): pass - async def on_state_object(self, state_object: StateObject): + async def on_snapshot(self, information_snapshot: InformationSnapshot): pass - - async def on_state_balance_advice(self, state_balance_advice: StateBalanceAdvice): - pass - - async def on_state_scheduler(self, state_scheduler: StateScheduler): - pass - - async def on_state_worker(self, state_worker: StateWorker): - pass - - async def on_state_task(self, state_task: StateTask): - pass - - async def on_state_graph_task(self, state_graph_task: StateGraphTask): - pass - -In most cases, the scaling controller will only need to implement the ``on_state_task`` and ``on_state_worker`` -handlers. The other handlers are provided for completeness and future use cases. Implementing a Scaling Controller --------------------------------- Some pointers to implement a robust Scaling Controller: -- Receive scheduler state updates via the provided on_state_* handlers and keep a small internal model of: - - current running workers and their states - - pending tasks and backlog size - - recent failures or rate-limit signals from the adapter - Decide when to scale up: - - if backlog > threshold OR average task wait time > latency_target, issue a start_worker webhook + - if backlog > threshold OR average task wait time > latency_target, issue a start_worker_group webhook - include suggested instance type, region, and any labels in metadata - Decide when to scale down: - - if backlog is small or task wait time is low, issue a shutdown_worker webhook for specific worker_id -- Validate webhook responses: only treat action as completed when adapter returns 200 and the returned worker_id is recorded + - if backlog is small or task wait time is low, issue a shutdown_worker_group webhook for specific worker_group_id +- Validate webhook responses: only treat action as completed when adapter returns 200 - Use exponential backoff for retries when receiving 429 or 5xx responses from the adapter - Idempotency: you may include a request identifier in metadata if your adapter supports it, or detect repeated requests and avoid duplicate actions. diff --git a/docs/source/adapter/webhook.rst b/docs/source/adapter/webhook.rst index 2eb8180..c4a854e 100644 --- a/docs/source/adapter/webhook.rst +++ b/docs/source/adapter/webhook.rst @@ -11,22 +11,21 @@ Webhook Specification Webhook Request Examples ------------------------ -Example: start_worker (minimal) +Example: start_worker_group (minimal) .. code-block:: json { - "action": "start_worker", - "worker_id": "" + "action": "start_worker_group" } -Example: start_worker (with suggested worker_id and auth) +Example: start_worker_group (with suggested worker_group_id and auth) .. code-block:: json { - "action": "start_worker", - "worker_id": "worker-12345", + "action": "start_worker_group", + "worker_group_id": "worker-group-12345", "metadata": { "instance_type": "m8gd.16xlarge", "region": "us-east-1", @@ -41,8 +40,8 @@ Example: shutdown_worker .. code-block:: json { - "action": "shutdown_worker", - "worker_id": "worker-12345" + "action": "shutdown_worker_group", + "worker_group_id": "worker-group-12345" } Webhook Reponse Examples @@ -54,7 +53,8 @@ Success (200) { "status": "ok", - "worker_id": "worker-12345", + "worker_group_id": "worker-group-12345", + "worker_ids": ["worker-1", "worker-2", "worker-3"], "metadata": { "instance_type": "m8gd.16xlarge", "region": "us-east-1"