diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml new file mode 100644 index 0000000..e6b0c98 --- /dev/null +++ b/.github/workflows/documentation.yml @@ -0,0 +1,46 @@ +name: Publish documentation to GitHub Pages + +on: + release: + types: [ created ] + workflow_dispatch: + +permissions: + contents: read + pages: write + id-token: write + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 + with: + egress-policy: audit + + - name: Checkout code + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Set up Python 3.10 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: "3.10" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install uv + uv pip install --system -r docs/requirements_docs.txt + + - name: Build documentation + run: cd docs && make html + + - name: Upload Pages artifact + uses: actions/upload-pages-artifact@56afc609e74202658d3ffba0e8f6dda462b719fa # v3.0.1 + with: + path: './docs/build/html' + + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # v4.0.5 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..64e0691 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.idea/ + +docs/build/ diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..dc1312a --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/requirements_docs.txt b/docs/requirements_docs.txt new file mode 100644 index 0000000..30fd621 --- /dev/null +++ b/docs/requirements_docs.txt @@ -0,0 +1,3 @@ +sphinx +sphinx_rtd_theme +sphinxcontrib-openapi diff --git a/docs/source/_static/schema.yaml b/docs/source/_static/schema.yaml new file mode 100644 index 0000000..685c124 --- /dev/null +++ b/docs/source/_static/schema.yaml @@ -0,0 +1,90 @@ +swagger: "2.0" +info: + title: OpenGRIS Adapter Webhook OpenAPI Schema + version: "1.0.0" +paths: + /: + post: + summary: Webhook endpoint for OpenGRIS Adapter + parameters: + - in: body + name: body + required: true + schema: + type: object + properties: + action: + description: Action to perform. Valid values are 'start_worker_group' and 'shutdown_worker_group'. + type: string + enum: + - start_worker_group + - shutdown_worker_group + worker_group_id: + description: | + Unique identifier for the worker group. **Note**: This ID might be ignored when performing + the 'start_worker_group` action, as the server may generate its own ID instead. + type: string + metadata: + description: | + Optional metadata provided by the scheduler's scaling policy. May include details like instance type, + region, image tags, etc. + type: object + auth_token: + description: | + Optional authentication token for securing the webhook from unauthorized requests. Must match the + server's expected token if provided. + type: string + required: + - action + responses: + 200: + description: Successful response + schema: + type: object + properties: + status: + description: Status of the requested action. + type: string + worker_group_id: + description: | + Unique identifier for the worker group. **Note**: This ID may not match the one provided in + the request if the server generates its own ID when starting a new worker. + type: string + worker_ids: + description: | + List of unique identifiers for the worker instances within the group. + type: array + items: + type: string + metadata: + description: | + Optional metadata about the worker instance. May include details like instance type, + region, image tags, etc. + type: object + properties: { } + required: + - status + 400: + description: Bad Request, e.g., missing required fields or invalid action + schema: + $ref: '#/definitions/Error' + 401: + description: Unauthorized, e.g., invalid or missing auth_token + schema: + $ref: '#/definitions/Error' + 429: + description: Too Many Requests, e.g. rate limiting, no available resources + schema: + $ref: '#/definitions/Error' + 500: + description: Internal Server Error, e.g., server-side or infrastructure related issues + schema: + $ref: '#/definitions/Error' +definitions: + Error: + type: object + properties: + error: + type: string + required: + - error diff --git a/docs/source/adapter/scaling.rst b/docs/source/adapter/scaling.rst new file mode 100644 index 0000000..246c084 --- /dev/null +++ b/docs/source/adapter/scaling.rst @@ -0,0 +1,46 @@ +Scaling Controller +================== + +The scaling controller determines when to start and stop workers based on task and other state updates emitted by +the scheduler. The scaling controller implements a scaling policy that can suit specific needs such as: + +- Cost optimized: Start and stop workers based on task queue length and worker idle time. +- Performance optimized: Start workers as soon as tasks are available. +- Time constrained: Start workers based on a schedule. +- Resource constrained: Start and stop workers based on available budget or resource quotas. + +To perform a scaling action, the scaling controller sends a webhook request to the adapter. The adapter then starts or +stops workers as requested. + +Scaling Controller Interface +---------------------------- + +.. code-block:: python + + import abc + + from scaler.protocol.python.message import InformationSnapshot + from scaler.utility.mixins import Reporter + + class ScalingController(Reporter, abc.ABC): + def get_status(self): + pass + + async def on_snapshot(self, information_snapshot: InformationSnapshot): + pass + +Implementing a Scaling Controller +--------------------------------- + +Some pointers to implement a robust Scaling Controller: + +- Decide when to scale up: + - if backlog > threshold OR average task wait time > latency_target, issue a start_worker_group webhook + - include suggested instance type, region, and any labels in metadata +- Decide when to scale down: + - if backlog is small or task wait time is low, issue a shutdown_worker_group webhook for specific worker_group_id +- Validate webhook responses: only treat action as completed when adapter returns 200 +- Use exponential backoff for retries when receiving 429 or 5xx responses from the adapter +- Idempotency: you may include a request identifier in metadata if your adapter supports it, or detect repeated requests + and avoid duplicate actions. +- Observability: emit metrics for actions issued, successful starts/stops, failures, and retries diff --git a/docs/source/adapter/webhook.rst b/docs/source/adapter/webhook.rst new file mode 100644 index 0000000..c4a854e --- /dev/null +++ b/docs/source/adapter/webhook.rst @@ -0,0 +1,115 @@ +Adapter Webhook API +=================== + +The Adapter Webhook API provides a single unified endpoint for the scheduler to request the start and stop of workers. + +Webhook Specification +--------------------- + +.. openapi:: ../_static/schema.yaml + +Webhook Request Examples +------------------------ + +Example: start_worker_group (minimal) + +.. code-block:: json + + { + "action": "start_worker_group" + } + +Example: start_worker_group (with suggested worker_group_id and auth) + +.. code-block:: json + + { + "action": "start_worker_group", + "worker_group_id": "worker-group-12345", + "metadata": { + "instance_type": "m8gd.16xlarge", + "region": "us-east-1", + "image_tag": "python:3.11-alpine", + "tags": ["python"] + }, + "auth_token": "" + } + +Example: shutdown_worker + +.. code-block:: json + + { + "action": "shutdown_worker_group", + "worker_group_id": "worker-group-12345" + } + +Webhook Reponse Examples +------------------------ + +Success (200) + +.. code-block:: json + + { + "status": "ok", + "worker_group_id": "worker-group-12345", + "worker_ids": ["worker-1", "worker-2", "worker-3"], + "metadata": { + "instance_type": "m8gd.16xlarge", + "region": "us-east-1" + } + } + +Client error (400) + +.. code-block:: json + + { + "error": "missing required field 'action'" + } + +Unauthorized (401) + +.. code-block:: json + + { + "error": "invalid auth_token" + } + +Rate limit / resource unavailable (429) + +.. code-block:: json + + { + "error": "no available capacity, retry later" + } + +Server error (500) + +.. code-block:: json + + { + "error": "internal provisioning error" + } + +Scaling Controller Interaction +------------------------------ + +Interactions with the scaling controller can be complex given the possibility of dependency failures, network issues, +delays, etc. Here are some guidelines to ensure reliable communication between the scaling controller and the adapter: + +- **Return the correct error code**: Return an appropriate HTTP status to indicate how the controller should proceed: + - 200: action completed successfully (include resulting worker_id) + - 400: client error — fix the request (do not retry) + - 401: unauthorized — reject immediately + - 429: retryable (rate limiting or temporary lack of capacity) — controller should back off and retry + - 5xx: server error — controller should retry with backoff +- **Blocking vs asynchronous actions**: The adapter SHOULD complete the requested action before returning 200. If the + adapter cannot complete the action synchronously, return 202 Accepted and provide a status endpoint or include a + request_id in the response so the controller can poll for completion. +- **Idempotency**: Support idempotent requests. If a controller retries the same start/stop request, the adapter should + detect duplicates (eg. via a request id in metadata) and avoid double-provisioning. +- **Validation**: Validate incoming payloads and return 400 for malformed requests. Include a helpful error message in the + body following the Error schema. +- **Security**: Require an auth_token or use mutual TLS. Reject invalid auth with 401. diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..8777a64 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,26 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "OpenGRIS" +copyright = "2025, Citi" +author = "Citi" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = ["sphinxcontrib.openapi"] + +templates_path = ["_templates"] +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "sphinx_rtd_theme" +html_static_path = ["_static"] diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..388909d --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,58 @@ +OpenGRIS: Open Standard for Grid Resource Scheduling +==================================================== + +OpenGRIS is an open standard for grid resource scheduling. It provides a standardized way to prepare, distribute, and +execute tasks across elastic and heterogeneous computing environments. + +.. https://asciiflow.com/#/share/eJytVLFuwjAQ%2FRXkmaVVh8JW8QEdaMWSxQQLKK6NHFOBEFKFOnZgiGiGjhk7VZ0qviZf0tgVJol9TlphWcLBd%2B%2B9e3fJGjH8SFCXLShtI4pXRKAuWgfoiYhoylmAupftAC3z387VRX5aqX%2Bu1UmSpcwfAtSqrix%2BzeJn795ZSfX5KicImCN82w8xnbLx70NrQIYTzmeRB3X%2FnQcOuJgRkcXvhgGA73EmBae0FAzWBZe%2F0%2FJ0SOy6TzSbW4QzoZJsgo8idZk3IzyXJ%2Bmp32IjwNMFQ7T%2FtM3yqk3Kd%2B42pLmAAnvao1PCZNn6XYnQltDMP2O4ToKDCqDqdIejWet%2BPsKSRABhSc%2BJALTUff8Feg9U6e9q0WOXqQfTk344IaOFHvhtldQ%2F37UvWdrUbuv%2BAxr4Qqhn4htNgL9BsPU1M789R3sSrwBAyv%2BbYOD2By%2Bdr%2FyyvVYOPEgN1xG89utobe3L7fCBhFKhZG8vf4Y4z05c33slqS%2B5wGNS1%2B3iAqDgyWwMjDZo8wNBgNAj) + +.. code-block:: + + ┌──────────┐ ┌──────┐ + │Scaling │ Webhooks ┌───►│Worker├──┐ + │Controller├─────────┐ ┌───────┐ │ └──────┘ │ + └──────────┘ ├──►│Adapter├─┤ ┌──────┐ │ + ┌──────┐ ▲ │ └───────┘ └───►│Worker├──┤ + ┌──┤Client├───┐ │ │ └──────┘ │ + │ └──────┘ │ │ Task Updates │ │ + │ ┌──────┐ │ ┌────┴────┐ │ ┌──────┐ │ + ├──┤Client├───┼─►│Scheduler│ │ ┌───────┐ ┌───►│Worker├──┤ + │ └──────┘ │ └────┬────┘ └──►│Adapter├─┤ └──────┘ │ + │ ┌──────┐ │ │ └───────┘ │ ┌──────┐ │ + ├──┤Client├───┘ │ └───►│Worker├──┤ + │ └──────┘ ▼ └──────┘ │ + │ ┌───────┐ │ + └────────────────►│Object │◄───────────────────────────────────────┘ + │Storage│ + └───────┘ + +OpenGRIS contains the following components: + +- Scheduler: Manages tasks using a stable and language-agnostic communication protocol. + - Includes the Scaling Controller which issues webhook requests to the Adapters. + - Reference scheduler implementation: `OpenGRIS Scaler `_. +- Worker: Executes tasks assigned by the scheduler. + - Reference worker implementation is part of OpenGRIS Scaler. + - IBM Spectrum Symphony worker implementation: + `Worker Code `_. +- Adapter: Interfaces with the scheduler to start and stop workers. + - Reference adapter implementation for VMs: + `VM Adapter Pull Request `_. + - Documentation: `Adapter Webhook API `_. +- Client Libraries: Helper libraries for implicit parallelization. + - Python client library is part of OpenGRIS Scaler. + - Graph parallelization library: `OpenGRIS Pargraph `_. + - Map-reduce parallelization library: `OpenGRIS Parfun `_. +- Object Storage: For sharing data objects between clients, schedulers, and workers. + - Reference object storage implementation: + `OpenGRIS Object Storage `_. + +Contents +======== + +.. toctree:: + :maxdepth: 2 + + protocol/scheduler + adapter/scaling + adapter/webhook diff --git a/docs/source/protocol/scheduler.rst b/docs/source/protocol/scheduler.rst new file mode 100644 index 0000000..4251df3 --- /dev/null +++ b/docs/source/protocol/scheduler.rst @@ -0,0 +1,4 @@ +Scheduler Communication Protocol +================================ + +TODO \ No newline at end of file