From dafae9d03cfcf44adb85cbf1b71f5e1388a446c6 Mon Sep 17 00:00:00 2001 From: Roja Reddy Sareddy Date: Tue, 29 Jul 2025 19:28:05 -0700 Subject: [PATCH 1/8] Documentation Fixes --- doc/cli_commands_examples.md | 24 ++ doc/cli_commands_reference.md | 504 ++++++++++++++++++++++++++++++++++ doc/cli_install_configure.md | 60 ++++ doc/cli_reference.md | 48 ++++ doc/index.md | 40 ++- doc/installation.md | 25 +- 6 files changed, 697 insertions(+), 4 deletions(-) create mode 100644 doc/cli_commands_examples.md create mode 100644 doc/cli_commands_reference.md create mode 100644 doc/cli_install_configure.md create mode 100644 doc/cli_reference.md diff --git a/doc/cli_commands_examples.md b/doc/cli_commands_examples.md new file mode 100644 index 00000000..929c99c5 --- /dev/null +++ b/doc/cli_commands_examples.md @@ -0,0 +1,24 @@ +(cli_commands_examples)= + +# CLI Commands Examples + +Practical usage examples and common CLI workflows for training and inference tasks. + +## Training Example Notebooks + +For comprehensive end-to-end examples of CLI training commands, including practical workflows and best practices, see: + +- CLI Training Example + +This notebook demonstrates complete training workflows using the `hyp create hyp-pytorch-job` command with real-world scenarios, configuration options, and troubleshooting guidance. + + +## Inference Example Notebooks + +For comprehensive end-to-end examples of the CLI inference commands shown above, explore these interactive notebooks that demonstrate complete workflows with different model storage options: + +- CLI JumpStart Inference Example - Deploy pre-trained models from SageMaker JumpStart +- CLI S3 Model Inference Example - Deploy custom models stored in Amazon S3 +- CLI FSX Model Inference Example - Deploy models using Amazon FSx for high-performance storage + +These notebooks provide practical guidance on endpoint configuration, model deployment strategies, testing procedures, and troubleshooting common issues. 
\ No newline at end of file diff --git a/doc/cli_commands_reference.md b/doc/cli_commands_reference.md new file mode 100644 index 00000000..501f95eb --- /dev/null +++ b/doc/cli_commands_reference.md @@ -0,0 +1,504 @@ +(cli_commands_reference)= + +# CLI Commands Reference + +Complete reference for all SageMaker HyperPod CLI commands, options, and parameters. + +## Table of Contents + +- [Create Commands](#create-commands) +- [Cluster Commands](#cluster-commands) +- [Training Job Management Commands](#training-job-management-commands) +- [Inference Endpoint Management Commands](#inference-endpoint-management-commands) + +## Create Commands + +The `hyp create` command family is used to create various resources in your HyperPod cluster. + +### hyp create hyp-pytorch-job + +Create distributed PyTorch training jobs. + +#### Syntax + +```bash +hyp create hyp-pytorch-job [OPTIONS] +``` + +#### Required Parameters + +- `--job-name TEXT`: Unique name for the training job (minimum 1 character) +- `--image TEXT`: Docker image URI containing your training code + +#### Optional Parameters + +- `--namespace TEXT`: Kubernetes namespace +- `--command ARRAY`: Command to run in the container (array of strings) +- `--args ARRAY`: Arguments for the entry script (array of strings) +- `--environment OBJECT`: Environment variables as key-value pairs +- `--pull-policy TEXT`: Image pull policy (Always, Never, IfNotPresent) +- `--instance-type TEXT`: Instance type for training +- `--node-count INTEGER`: Number of nodes (minimum: 1) +- `--tasks-per-node INTEGER`: Number of tasks per node (minimum: 1) +- `--label-selector OBJECT`: Node label selector as key-value pairs +- `--deep-health-check-passed-nodes-only BOOLEAN`: Schedule pods only on nodes that passed deep health check +- `--scheduler-type TEXT`: Scheduler type +- `--queue-name TEXT`: Queue name for job scheduling +- `--priority TEXT`: Priority class for job scheduling +- `--max-retry INTEGER`: Maximum number of job retries (minimum: 
0) +- `--volumes ARRAY`: List of volumes to mount +- `--persistent-volume-claims ARRAY`: List of persistent volume claims +- `--service-account-name TEXT`: Service account name + +### hyp create hyp-jumpstart-endpoint + +Deploy pre-trained models from SageMaker JumpStart. + +#### Syntax + +```bash +hyp create hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--model-id TEXT`: JumpStart model identifier (1-63 characters, alphanumeric with hyphens) +- `--instance-type TEXT`: EC2 instance type for inference (must start with "ml.") + +#### Optional Parameters + +- `--accept-eula BOOLEAN`: Whether model terms of use have been accepted (default: false) +- `--model-version TEXT`: Semantic version of the model (e.g., "1.0.0", 5-14 characters) +- `--endpoint-name TEXT`: Name of SageMaker endpoint (1-63 characters, alphanumeric with hyphens) +- `--tls-certificate-output-s3-uri TEXT`: S3 URI to write the TLS certificate (optional) + +### hyp create hyp-custom-endpoint + +Deploy custom models with your own inference code. 
+ +#### Syntax + +```bash +hyp create hyp-custom-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--instance-type TEXT`: EC2 instance type for inference (must start with "ml.") +- `--model-name TEXT`: Name of model to create on SageMaker (1-63 characters, alphanumeric with hyphens) +- `--model-source-type TEXT`: Model source type ("s3" or "fsx") +- `--image-uri TEXT`: Docker image URI for inference +- `--container-port INTEGER`: Port on which model server listens (1-65535) +- `--model-volume-mount-name TEXT`: Name of the model volume mount + +#### Optional Parameters + +- `--endpoint-name TEXT`: Name of SageMaker endpoint (1-63 characters, alphanumeric with hyphens) +- `--env OBJECT`: Environment variables as key-value pairs +- `--metrics-enabled BOOLEAN`: Enable metrics collection (default: false) +- `--model-version TEXT`: Version of the model (semantic version format) +- `--model-location TEXT`: Specific model data location +- `--prefetch-enabled BOOLEAN`: Whether to pre-fetch model data (default: false) +- `--tls-certificate-output-s3-uri TEXT`: S3 URI for TLS certificate output +- `--fsx-dns-name TEXT`: FSx File System DNS Name +- `--fsx-file-system-id TEXT`: FSx File System ID +- `--fsx-mount-name TEXT`: FSx File System Mount Name +- `--s3-bucket-name TEXT`: S3 bucket location +- `--s3-region TEXT`: S3 bucket region +- `--model-volume-mount-path TEXT`: Path inside container for model volume (default: "/opt/ml/model") +- `--resources-limits OBJECT`: Resource limits for the worker +- `--resources-requests OBJECT`: Resource requests for the worker +- `--dimensions OBJECT`: CloudWatch Metric dimensions as key-value pairs +- `--metric-collection-period INTEGER`: Period for CloudWatch query (default: 300) +- `--metric-collection-start-time INTEGER`: StartTime for CloudWatch query (default: 300) +- `--metric-name TEXT`: Metric name to query for CloudWatch trigger +- `--metric-stat TEXT`: Statistics metric for CloudWatch (default: "Average") +- 
`--metric-type TEXT`: Type of metric for HPA ("Value" or "Average", default: "Average") +- `--min-value NUMBER`: Minimum metric value for empty CloudWatch response (default: 0) +- `--cloud-watch-trigger-name TEXT`: Name for the CloudWatch trigger +- `--cloud-watch-trigger-namespace TEXT`: AWS CloudWatch namespace for the metric +- `--target-value NUMBER`: Target value for the CloudWatch metric +- `--use-cached-metrics BOOLEAN`: Enable caching of metric values (default: true) +- `--invocation-endpoint TEXT`: Invocation endpoint path (default: "invocations") + +## Cluster Commands + +Commands for managing cluster connections and contexts. + +### hyp list-cluster + +List all available SageMaker HyperPod clusters in your account. + +#### Syntax + +```bash +hyp list-cluster [OPTIONS] +``` + +#### Optional Parameters + +- `--region TEXT`: AWS region to list clusters from +- `--namespace TEXT`: Kubernetes namespace +- `--output TEXT`: Output format (json, table) + +### hyp set-cluster-context + +Configure your local kubectl environment to interact with a specific SageMaker HyperPod cluster. + +#### Syntax + +```bash +hyp set-cluster-context [OPTIONS] +``` + +#### Required Parameters + +- `--cluster-name TEXT`: Name of the cluster to connect to + +### hyp get-cluster-context + +View information about the currently configured cluster context. + +#### Syntax + +```bash +hyp get-cluster-context +``` + +## Training Job Management Commands + +Commands for managing PyTorch training jobs. + +### hyp list hyp-pytorch-job + +List all HyperPod PyTorch jobs in a namespace. + +#### Syntax + +```bash +hyp list hyp-pytorch-job [OPTIONS] +``` + +#### Optional Parameters + +- `--namespace, -n TEXT`: Namespace to list jobs from (default: "default") + +### hyp describe hyp-pytorch-job + +Describe a specific HyperPod PyTorch job. 
+ +#### Syntax + +```bash +hyp describe hyp-pytorch-job [OPTIONS] +``` + +#### Required Parameters + +- `--job-name TEXT`: Name of the job to describe + +#### Optional Parameters + +- `--namespace, -n TEXT`: Namespace of the job (default: "default") + +### hyp delete hyp-pytorch-job + +Delete a HyperPod PyTorch job. + +#### Syntax + +```bash +hyp delete hyp-pytorch-job [OPTIONS] +``` + +#### Required Parameters + +- `--job-name TEXT`: Name of the job to delete + +#### Optional Parameters + +- `--namespace, -n TEXT`: Namespace of the job (default: "default") + +### hyp list-pods hyp-pytorch-job + +List all pods associated with a PyTorch job. + +#### Syntax + +```bash +hyp list-pods hyp-pytorch-job [OPTIONS] +``` + +#### Required Parameters + +- `--job-name TEXT`: Name of the job to list pods for + +#### Optional Parameters + +- `--namespace, -n TEXT`: Namespace of the job (default: "default") + +### hyp get-logs hyp-pytorch-job + +Get logs from a specific pod in a PyTorch job. + +#### Syntax + +```bash +hyp get-logs hyp-pytorch-job [OPTIONS] +``` + +#### Required Parameters + +- `--job-name TEXT`: Name of the job +- `--pod-name TEXT`: Name of the pod to get logs from + +#### Optional Parameters + +- `--namespace, -n TEXT`: Namespace of the job (default: "default") + +## Inference Endpoint Management Commands + +Commands for managing inference endpoints. + +### hyp list hyp-jumpstart-endpoint + +List JumpStart model endpoints. + +#### Syntax + +```bash +hyp list hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Optional Parameters + +- `--namespace TEXT`: Namespace to list endpoints from (default: "default") + +### hyp list hyp-custom-endpoint + +List custom model endpoints. + +#### Syntax + +```bash +hyp list hyp-custom-endpoint [OPTIONS] +``` + +#### Optional Parameters + +- `--namespace TEXT`: Namespace to list endpoints from (default: "default") + +### hyp describe hyp-jumpstart-endpoint + +Describe a JumpStart model endpoint. 
+ +#### Syntax + +```bash +hyp describe hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--name TEXT`: Name of the endpoint to describe + +#### Optional Parameters + +- `--namespace TEXT`: Namespace of the endpoint (default: "default") +- `--full`: Display full JSON output + +### hyp describe hyp-custom-endpoint + +Describe a custom model endpoint. + +#### Syntax + +```bash +hyp describe hyp-custom-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--name TEXT`: Name of the endpoint to describe + +#### Optional Parameters + +- `--namespace TEXT`: Namespace of the endpoint (default: "default") +- `--full`: Display full JSON output + +### hyp invoke hyp-jumpstart-endpoint + +Invoke a JumpStart model endpoint. + +#### Syntax + +```bash +hyp invoke hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--endpoint-name TEXT`: Name of the endpoint to invoke +- `--body TEXT`: Request body (JSON format) + +#### Optional Parameters + +- `--content-type TEXT`: Content type of the request (default: "application/json") + +### hyp invoke hyp-custom-endpoint + +Invoke a custom model endpoint. + +#### Syntax + +```bash +hyp invoke hyp-custom-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--endpoint-name TEXT`: Name of the endpoint to invoke +- `--body TEXT`: Request body (JSON format) + +#### Optional Parameters + +- `--content-type TEXT`: Content type of the request (default: "application/json") + +### hyp delete hyp-jumpstart-endpoint + +Delete a JumpStart model endpoint. + +#### Syntax + +```bash +hyp delete hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--name TEXT`: Name of the endpoint to delete + +#### Optional Parameters + +- `--namespace TEXT`: Namespace of the endpoint (default: "default") + +### hyp delete hyp-custom-endpoint + +Delete a custom model endpoint. 
+ +#### Syntax + +```bash +hyp delete hyp-custom-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--name TEXT`: Name of the endpoint to delete + +#### Optional Parameters + +- `--namespace TEXT`: Namespace of the endpoint (default: "default") + +### hyp list-pods hyp-jumpstart-endpoint + +List pods for JumpStart endpoints. + +#### Syntax + +```bash +hyp list-pods hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Optional Parameters + +- `--namespace TEXT`: Namespace to list pods from (default: "default") + +### hyp list-pods hyp-custom-endpoint + +List pods for custom endpoints. + +#### Syntax + +```bash +hyp list-pods hyp-custom-endpoint [OPTIONS] +``` + +#### Optional Parameters + +- `--namespace TEXT`: Namespace to list pods from (default: "default") + +### hyp get-logs hyp-jumpstart-endpoint + +Get logs from JumpStart endpoint pods. + +#### Syntax + +```bash +hyp get-logs hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--pod-name TEXT`: Name of the pod to get logs from + +#### Optional Parameters + +- `--container TEXT`: Container name to get logs from +- `--namespace TEXT`: Namespace of the pod (default: "default") + +### hyp get-logs hyp-custom-endpoint + +Get logs from custom endpoint pods. + +#### Syntax + +```bash +hyp get-logs hyp-custom-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--pod-name TEXT`: Name of the pod to get logs from + +#### Optional Parameters + +- `--container TEXT`: Container name to get logs from +- `--namespace TEXT`: Namespace of the pod (default: "default") + +### hyp get-operator-logs hyp-jumpstart-endpoint + +Get operator logs for JumpStart endpoints. + +#### Syntax + +```bash +hyp get-operator-logs hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--since-hours FLOAT`: Time frame to get logs for (in hours) + +### hyp get-operator-logs hyp-custom-endpoint + +Get operator logs for custom endpoints. 
+ +#### Syntax + +```bash +hyp get-operator-logs hyp-custom-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--since-hours FLOAT`: Time frame to get logs for (in hours) + +## Parameter Reference + +### Common Parameters Across Commands + +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `--namespace` | TEXT | Kubernetes namespace | Current context | +| `--help` | FLAG | Show command help | - | \ No newline at end of file diff --git a/doc/cli_install_configure.md b/doc/cli_install_configure.md new file mode 100644 index 00000000..64109e91 --- /dev/null +++ b/doc/cli_install_configure.md @@ -0,0 +1,60 @@ +(cli_install_configure)= + +# Install and Configure CLI + +This guide provides installation instructions for the SageMaker HyperPod CLI and SDK. + +## Installation Options + +### Install from PyPI + +It's recommended to install the SageMaker HyperPod CLI and SDK in a Python virtual environment to avoid conflicts with other packages: + +```bash +# Create a virtual environment +python -m venv {venv-name} + +# Activate the virtual environment +source {venv-name}/bin/activate +``` + +```{note} +Remember to activate your virtual environment (source {venv-name}/bin/activate) each time you want to use the HyperPod CLI and SDK if you chose the virtual environment installation method. +``` + +You can install the SageMaker HyperPod CLI and SDK directly using `pip`: + +```bash +# Install from PyPI +pip install sagemaker-hyperpod +``` + +To verify that the installation was successful, run: + +```bash +# Verify CLI installation +hyp --help +``` + +### Install from GitHub + +For the latest development version or to contribute to the project, you can install directly from the GitHub repository: + +**Clone the SageMaker HyperPod CLI package from GitHub:** +```bash +git clone https://github.com/aws/sagemaker-hyperpod-cli.git +``` + +**Install the SageMaker HyperPod CLI:** +```bash +cd sagemaker-hyperpod-cli && pip install . 
+``` + +**Test if the SageMaker HyperPod CLI is successfully installed by running the following command:** +```bash +hyp --help +``` + +```{note} +The GitHub installation provides access to the latest features and bug fixes that may not yet be available in the PyPI release. However, it may be less stable than the official PyPI release. +``` diff --git a/doc/cli_reference.md b/doc/cli_reference.md new file mode 100644 index 00000000..cb879c31 --- /dev/null +++ b/doc/cli_reference.md @@ -0,0 +1,48 @@ +(cli_reference)= + +# CLI Reference + +```{toctree} +:hidden: +:maxdepth: 2 + +cli_install_configure +cli_commands_reference +cli_commands_examples +``` + +Complete reference for the SageMaker HyperPod Command Line Interface. + +::::{container} +::::{grid} 1 1 3 3 +:gutter: 3 + +:::{grid-item-card} Install and Configure CLI +:link: cli_install_configure +:link-type: ref +:class-card: sd-border-primary + +**Setup and Configuration** +Get started with CLI installation, authentication, and initial configuration for your HyperPod environment. +::: + +:::{grid-item-card} CLI Commands Reference +:link: cli_commands_reference +:link-type: ref +:class-card: sd-border-secondary + +**Complete Command Documentation** +Comprehensive reference for all available CLI commands, options, and parameters. +::: + +:::{grid-item-card} CLI Commands Examples +:link: cli_commands_examples +:link-type: ref +:class-card: sd-border-success + +**Practical Usage Examples** +Real-world examples and common CLI workflows for training and inference tasks. 
+::: + +:::: +:::: \ No newline at end of file diff --git a/doc/index.md b/doc/index.md index e04a4471..c9c5f348 100644 --- a/doc/index.md +++ b/doc/index.md @@ -9,7 +9,7 @@ keywords: (hpcli_docs_mainpage)= -# SageMaker HyperPod CLI & SDK +# Overview ```{toctree} :hidden: @@ -19,13 +19,47 @@ Installation Getting Started Training Inference +CLI Reference Example Notebooks API reference <_apidoc/modules> ``` -**Manage distributed Machine Learning workloads on Kubernetes clusters without the complexity.** +Amazon SageMaker HyperPod CLI and SDK are developer tools designed to simplify the management of distributed training workloads on dedicated, high-performance computing clusters. These tools enable ML practitioners to efficiently orchestrate large-scale training operations while abstracting the underlying cluster management complexities. -The SageMaker HyperPod Command Line Interface and SDK simplify distributed training and inference on EKS-orchestrated clusters. +### What is SageMaker HyperPod CLI and SDK? + +The **SageMaker HyperPod CLI** is a command-line interface that enables you to create and manage distributed training clusters and workloads through simple commands. It provides direct control over cluster resources while handling the infrastructure management automatically. + +The **SageMaker HyperPod SDK** is a Python library that allows programmatic access to HyperPod functionality for seamless incorporation into your ML workflows and training scripts. + +Both tools are built on top of [Amazon SageMaker HyperPod](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod.html), a managed service that provides dedicated, persistent clusters optimized for distributed ML training workloads. 
+ +### Key Use Cases + +**Distributed Training** +- Scale PyTorch training jobs across multiple nodes and GPUs +- Manage complex distributed training configurations with simple commands +- Handle fault tolerance and job recovery automatically + +**Model Inference** +- Deploy pre-trained models from SageMaker JumpStart with minimal configuration +- Host custom inference endpoints with auto-scaling capabilities +- Manage model serving infrastructure with built-in monitoring + +**Cluster Operations** +- Connect to and manage multiple HyperPod clusters +- Monitor resource utilization and job status +- Streamline DevOps workflows for ML teams + +### Why Choose HyperPod CLI & SDK? + +- **Simplified Management**: Focus on ML code while HyperPod handles infrastructure orchestration +- **AWS Integration**: Native integration with SageMaker features and AWS services +- **Production Ready**: Built-in fault tolerance, auto-scaling, and enterprise security features +- **Development Flexibility**: Choose between CLI for direct control or SDK for programmatic access +- **Cost Management**: Optimize spending with cluster sharing and resource monitoring + +For comprehensive information about the underlying infrastructure and advanced configuration options, see the [Amazon SageMaker HyperPod documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod.html). ## Quick Start diff --git a/doc/installation.md b/doc/installation.md index 041fae08..f9dbf85c 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -1,5 +1,5 @@ (installation)= -# Installation +# Get Started This guide provides installation instructions for the SageMaker HyperPod CLI and SDK. 
## System Requirements @@ -60,3 +60,26 @@ To verify that the installation was successful, run: # Verify CLI installation hyp --help ``` + +### Install from GitHub + +For the latest development version or to contribute to the project, you can install directly from the GitHub repository: + +**Clone the SageMaker HyperPod CLI package from GitHub:** +```bash +git clone https://github.com/aws/sagemaker-hyperpod-cli.git +``` + +**Install the SageMaker HyperPod CLI:** +```bash +cd sagemaker-hyperpod-cli && pip install . +``` + +**Test if the SageMaker HyperPod CLI is successfully installed by running the following command:** +```bash +hyp --help +``` + +```{note} +The GitHub installation provides access to the latest features and bug fixes that may not yet be available in the PyPI release. However, it may be less stable than the official PyPI release. +``` \ No newline at end of file From 299e63b873c9cdb7574dd96ac6f76222fd78817b Mon Sep 17 00:00:00 2001 From: Roja Reddy Sareddy Date: Wed, 30 Jul 2025 20:00:35 -0700 Subject: [PATCH 2/8] Documentation Fixes --- doc/Makefile | 2 +- doc/cli_reference.md | 10 --- doc/conf.py | 5 +- doc/index.md | 187 ++++++++++++++----------------------------- 4 files changed, 66 insertions(+), 138 deletions(-) diff --git a/doc/Makefile b/doc/Makefile index d33a0a7e..c8d71c96 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -2,7 +2,7 @@ # # You can set these variables from the command line. -SPHINXOPTS = -W +SPHINXOPTS = SPHINXBUILD = python3 -msphinx SPHINXPROJ = sagemaker SOURCEDIR = . diff --git a/doc/cli_reference.md b/doc/cli_reference.md index cb879c31..be0c96c5 100644 --- a/doc/cli_reference.md +++ b/doc/cli_reference.md @@ -6,7 +6,6 @@ :hidden: :maxdepth: 2 -cli_install_configure cli_commands_reference cli_commands_examples ``` @@ -17,15 +16,6 @@ Complete reference for the SageMaker HyperPod Command Line Interface. 
::::{grid} 1 1 3 3 :gutter: 3 -:::{grid-item-card} Install and Configure CLI -:link: cli_install_configure -:link-type: ref -:class-card: sd-border-primary - -**Setup and Configuration** -Get started with CLI installation, authentication, and initial configuration for your HyperPod environment. -::: - :::{grid-item-card} CLI Commands Reference :link: cli_commands_reference :link-type: ref diff --git a/doc/conf.py b/doc/conf.py index dfe7ab3c..6d75ac16 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -131,7 +131,10 @@ def get_version(): "show_navbar_depth": 2, "use_fullscreen_button": False, "use_download_button": False, - "home_page_in_toc": True + "home_page_in_toc": True, + # Configuration to disable right-side table of contents + "secondary_sidebar_items": [], # Remove all content from right sidebar + "show_toc_level": 0, # Disable automatic TOC generation } author = "Amazon Web Services" diff --git a/doc/index.md b/doc/index.md index c9c5f348..3967741f 100644 --- a/doc/index.md +++ b/doc/index.md @@ -23,47 +23,58 @@ CLI Reference Example Notebooks API reference <_apidoc/modules> ``` +:::::{grid} 2 +:gutter: 3 +:margin: 0 -Amazon SageMaker HyperPod CLI and SDK are developer tools designed to simplify the management of distributed training workloads on dedicated, high-performance computing clusters. These tools enable ML practitioners to efficiently orchestrate large-scale training operations while abstracting the underlying cluster management complexities. - -### What is SageMaker HyperPod CLI and SDK? - -The **SageMaker HyperPod CLI** is a command-line interface that enables you to create and manage distributed training clusters and workloads through simple commands. It provides direct control over cluster resources while handling the infrastructure management automatically. 
+::::{grid-item} +:columns: 8 -The **SageMaker HyperPod SDK** is a Python library that allows programmatic access to HyperPod functionality for seamless incorporation into your ML workflows and training scripts. +Amazon SageMaker HyperPod helps you provision and manage resilient clusters optimized for large-scale machine learning (ML) workloads, including large language models (LLMs), diffusion models, and foundation models (FMs). +To get started with HyperPod, visit the [AWS Documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod.html). -Both tools are built on top of [Amazon SageMaker HyperPod](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod.html), a managed service that provides dedicated, persistent clusters optimized for distributed ML training workloads. +### What is SageMaker HyperPod CLI and SDK? -### Key Use Cases +Amazon SageMaker HyperPod CLI and SDK are developer tools designed to simplify the management of distributed training workloads on dedicated, high-performance computing clusters. +:::: +::::{grid-item} +:columns: 4 +```{note} +Version Info - you’re viewing the latest documentation for SageMaker HyperPod CLI and SDK v3.0.0. +``` +:::: +::::: +:::::{grid} 2 +:gutter: 3 +:margin: 0 -**Distributed Training** -- Scale PyTorch training jobs across multiple nodes and GPUs -- Manage complex distributed training configurations with simple commands -- Handle fault tolerance and job recovery automatically +::::{grid-item} +:columns: 8 +### Why Choose HyperPod CLI & SDK? -**Model Inference** -- Deploy pre-trained models from SageMaker JumpStart with minimal configuration -- Host custom inference endpoints with auto-scaling capabilities -- Manage model serving infrastructure with built-in monitoring +Transform your AI/ML development process with Amazon SageMaker HyperPod CLI and SDK. These tools handle infrastructure management complexities, allowing you to focus on model development and innovation. 
Whether it's scaling your PyTorch training jobs across thousands of GPUs, deploying production-grade inference endpoints, or managing multiple clusters efficiently, the intuitive command-line interface and programmatic control enable you to: +- Accelerate development cycles and reduce operational overhead +- Automate ML workflows while maintaining operational visibility +- Optimize computing resources across your AI/ML projects -**Cluster Operations** -- Connect to and manage multiple HyperPod clusters -- Monitor resource utilization and job status -- Streamline DevOps workflows for ML teams +:::: +::::{grid-item} +:columns: 4 +```{admonition} What's New +:class: important -### Why Choose HyperPod CLI & SDK? +**🚀 CLI and SDK!!** -- **Simplified Management**: Focus on ML code while HyperPod handles infrastructure orchestration -- **AWS Integration**: Native integration with SageMaker features and AWS services -- **Production Ready**: Built-in fault tolerance, auto-scaling, and enterprise security features -- **Development Flexibility**: Choose between CLI for direct control or SDK for programmatic access -- **Cost Management**: Optimize spending with cluster sharing and resource monitoring +Streamlined interfaces for Training, +Inference, and Cluster Monitoring. -For comprehensive information about the underlying infrastructure and advanced configuration options, see the [Amazon SageMaker HyperPod documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod.html). +``` +:::: +::::: ## Quick Start -::::{container} + ::::{grid} 1 2 2 2 :gutter: 3 @@ -83,119 +94,43 @@ For comprehensive information about the underlying infrastructure and advanced c **Ready to explore?** Connect to your cluster before running ML workflows. 
::: -:::: -:::: - -## What You Can Do - -::::{container} -::::{grid} 1 1 2 2 -:gutter: 3 - -:::{grid-item-card} Training Workloads -:class-card: sd-border-success - -**Distributed Training** -- HyperPodPytorchJob distributed training -- Multi-node, multi-GPU support -- Built-in monitoring and logging - -```{dropdown} Learn More About Training -:color: success -:icon: chevron-down +:::{grid-item-card} Training +:link: training +:link-type: ref +:class-card: sd-border-secondary -- [Training Guide](training.md) - Complete training workflows -- [Example Notebooks](examples.md) - Hands-on training examples -- Supported frameworks: PyTorch -``` +**Scale Your ML Models!** Get started with training ::: -:::{grid-item-card} Inference Endpoints -:class-card: sd-border-info - -**Model Serving** -- Deploy models as scalable endpoints -- JumpStart model integration -- Real-time and batch inference - -```{dropdown} Learn More About Inference -:color: info -:icon: chevron-down +:::{grid-item-card} Inference +:link: inference +:link-type: ref +:class-card: sd-border-secondary -- [Inference Guide](inference.md) - Complete inference workflows -- [Example Notebooks](examples.md) - Hands-on inference examples -- Supported models: JumpStart models, Custom models -``` +**Deploy Your ML Model!** Get started with inference ::: -:::: :::: -## Choose Your Interface +## Advanced Resources -::::{container} -::::{grid} 1 1 2 2 +::::{grid} 1 2 2 2 :gutter: 3 -:::{grid-item-card} Command Line Interface -:class-card: sd-border-warning - -**For DevOps & Quick Tasks** -```bash -# Launch a training job -hyp create hyp-pytorch-job \ - --job-name my-training \ - --image pytorch/pytorch:latest \ -``` - -```{dropdown} CLI Features -:color: warning -:icon: terminal +:::{grid-item-card} API reference +:link: _apidoc/modules +:link-type: ref +:class-card: sd-border-primary -- Interactive job management -- Built-in status monitoring -``` +**Explore APIs** - Checkout API Documentation ::: 
-:::{grid-item-card} Python SDK -:class-card: sd-border-danger - -**For Programmatic Control** -```python -from sagemaker.hyperpod.training import HyperPodPytorchJob -from sagemaker.hyperpod.common.config import Metadata - -pytorch_job = HyperPodPytorchJob( - metadata=Metadata(name="demo"), - nproc_per_node="1", - replica_specs=replica_specs, - run_policy=run_policy, -) - -pytorch_job.create() -``` - -```{dropdown} SDK Features -:color: danger -:icon: code +:::{grid-item-card} Github +:link: examples +:link-type: ref +:class-card: sd-border-secondary -- Pythonic API design -- Jupyter notebook integration -- Programmatic job orchestration -``` +**Example Notebooks** - Ready-to-use implementation guides ::: :::: -:::: - -## Advanced Resources - -```{dropdown} Complete Documentation -:color: primary -:icon: book - -- [API Reference](_apidoc/modules.rst) - Complete SDK documentation -- [Training Guide](training.md) - In-depth training workflows -- [Inference Guide](inference.md) - Comprehensive inference setup -- [Example Notebooks](examples.md) - End-to-end examples -``` \ No newline at end of file From df24d8ac0d9a51cad86f168a68ccb296ed47cec0 Mon Sep 17 00:00:00 2001 From: Roja Reddy Sareddy Date: Thu, 31 Jul 2025 10:36:24 -0700 Subject: [PATCH 3/8] Documentation Fixes --- doc/cli_install_configure.md | 60 ------------------------------------ doc/index.md | 3 +- 2 files changed, 1 insertion(+), 62 deletions(-) delete mode 100644 doc/cli_install_configure.md diff --git a/doc/cli_install_configure.md b/doc/cli_install_configure.md deleted file mode 100644 index 64109e91..00000000 --- a/doc/cli_install_configure.md +++ /dev/null @@ -1,60 +0,0 @@ -(cli_install_configure)= - -# Install and Configure CLI - -This guide provides installation instructions for the SageMaker HyperPod CLI and SDK. 
- -## Installation Options - -### Install from PyPI - -It's recommended to install the SageMaker HyperPod CLI and SDK in a Python virtual environment to avoid conflicts with other packages: - -```bash -# Create a virtual environment -python -m venv {venv-name} - -# Activate the virtual environment -source {venv-name}/bin/activate -``` - -```{note} -Remember to activate your virtual environment (source {venv-name}/bin/activate) each time you want to use the HyperPod CLI and SDK if you chose the virtual environment installation method. -``` - -You can install the SageMaker HyperPod CLI and SDK directly using `pip`: - -```bash -# Install from PyPI -pip install sagemaker-hyperpod -``` - -To verify that the installation was successful, run: - -```bash -# Verify CLI installation -hyp --help -``` - -### Install from GitHub - -For the latest development version or to contribute to the project, you can install directly from the GitHub repository: - -**Clone the SageMaker HyperPod CLI package from GitHub:** -```bash -git clone https://github.com/aws/sagemaker-hyperpod-cli.git -``` - -**Install the SageMaker HyperPod CLI:** -```bash -cd sagemaker-hyperpod-cli && pip install . -``` - -**Test if the SageMaker HyperPod CLI is successfully installed by running the following command:** -```bash -hyp --help -``` - -```{note} -The GitHub installation provides access to the latest features and bug fixes that may not yet be available in the PyPI release. However, it may be less stable than the official PyPI release. -``` diff --git a/doc/index.md b/doc/index.md index 3967741f..d53ea5f3 100644 --- a/doc/index.md +++ b/doc/index.md @@ -118,8 +118,7 @@ Inference, and Cluster Monitoring. 
:gutter: 3 :::{grid-item-card} API reference -:link: _apidoc/modules -:link-type: ref +:link: _apidoc/modules.html :class-card: sd-border-primary **Explore APIs** - Checkout API Documentation From cc15eef108b02a598e2afa6b1b5198b98f01f061 Mon Sep 17 00:00:00 2001 From: Roja Reddy Sareddy Date: Fri, 1 Aug 2025 13:45:28 -0700 Subject: [PATCH 4/8] Documentation Fixes --- doc/cli_commands_examples.md | 24 --- ...commands_reference.md => cli_inference.md} | 191 +----------------- doc/cli_reference.md | 20 +- doc/cli_training.md | 163 +++++++++++++++ doc/conf.py | 5 +- doc/examples.md | 45 ++++- doc/index.md | 33 +-- 7 files changed, 226 insertions(+), 255 deletions(-) delete mode 100644 doc/cli_commands_examples.md rename doc/{cli_commands_reference.md => cli_inference.md} (64%) create mode 100644 doc/cli_training.md diff --git a/doc/cli_commands_examples.md b/doc/cli_commands_examples.md deleted file mode 100644 index 929c99c5..00000000 --- a/doc/cli_commands_examples.md +++ /dev/null @@ -1,24 +0,0 @@ -(cli_commands_examples)= - -# CLI Commands Examples - -Practical usage examples and common CLI workflows for training and inference tasks. - -## Training Example Notebooks - -For comprehensive end-to-end examples of CLI training commands, including practical workflows and best practices, see: - -- CLI Training Example - -This notebook demonstrates complete training workflows using the `hyp create hyp-pytorch-job` command with real-world scenarios, configuration options, and troubleshooting guidance. 
- - -## Inference Example Notebooks - -For comprehensive end-to-end examples of the CLI inference commands shown above, explore these interactive notebooks that demonstrate complete workflows with different model storage options: - -- CLI JumpStart Inference Example - Deploy pre-trained models from SageMaker JumpStart -- CLI S3 Model Inference Example - Deploy custom models stored in Amazon S3 -- CLI FSX Model Inference Example - Deploy models using Amazon FSx for high-performance storage - -These notebooks provide practical guidance on endpoint configuration, model deployment strategies, testing procedures, and troubleshooting common issues. \ No newline at end of file diff --git a/doc/cli_commands_reference.md b/doc/cli_inference.md similarity index 64% rename from doc/cli_commands_reference.md rename to doc/cli_inference.md index 501f95eb..0917f2d3 100644 --- a/doc/cli_commands_reference.md +++ b/doc/cli_inference.md @@ -1,56 +1,10 @@ -(cli_commands_reference)= +(cli_inference)= -# CLI Commands Reference +# Inference CLI -Complete reference for all SageMaker HyperPod CLI commands, options, and parameters. +Complete reference for SageMaker HyperPod PyTorch training job parameters and configuration options. -## Table of Contents - -- [Create Commands](#create-commands) -- [Cluster Commands](#cluster-commands) -- [Training Job Management Commands](#training-job-management-commands) -- [Inference Endpoint Management Commands](#inference-endpoint-management-commands) - -## Create Commands - -The `hyp create` command family is used to create various resources in your HyperPod cluster. - -### hyp create hyp-pytorch-job - -Create distributed PyTorch training jobs. 
- -#### Syntax - -```bash -hyp create hyp-pytorch-job [OPTIONS] -``` - -#### Required Parameters - -- `--job-name TEXT`: Unique name for the training job (minimum 1 character) -- `--image TEXT`: Docker image URI containing your training code - -#### Optional Parameters - -- `--namespace TEXT`: Kubernetes namespace -- `--command ARRAY`: Command to run in the container (array of strings) -- `--args ARRAY`: Arguments for the entry script (array of strings) -- `--environment OBJECT`: Environment variables as key-value pairs -- `--pull-policy TEXT`: Image pull policy (Always, Never, IfNotPresent) -- `--instance-type TEXT`: Instance type for training -- `--node-count INTEGER`: Number of nodes (minimum: 1) -- `--tasks-per-node INTEGER`: Number of tasks per node (minimum: 1) -- `--label-selector OBJECT`: Node label selector as key-value pairs -- `--deep-health-check-passed-nodes-only BOOLEAN`: Schedule pods only on nodes that passed deep health check -- `--scheduler-type TEXT`: Scheduler type -- `--queue-name TEXT`: Queue name for job scheduling -- `--priority TEXT`: Priority class for job scheduling -- `--max-retry INTEGER`: Maximum number of job retries (minimum: 0) -- `--volumes ARRAY`: List of volumes to mount -- `--persistent-volume-claims ARRAY`: List of persistent volume claims -- `--service-account-name TEXT`: Service account name - -### hyp create hyp-jumpstart-endpoint +## hyp create hyp-jumpstart-endpoint Deploy pre-trained models from SageMaker JumpStart. @@ -121,141 +75,6 @@ hyp create hyp-custom-endpoint [OPTIONS] - `--use-cached-metrics BOOLEAN`: Enable caching of metric values (default: true) - `--invocation-endpoint TEXT`: Invocation endpoint path (default: "invocations") -## Cluster Commands - -Commands for managing cluster connections and contexts. - -### hyp list-cluster - -List all available SageMaker HyperPod clusters in your account. 
- -#### Syntax - -```bash -hyp list-cluster [OPTIONS] -``` - -#### Optional Parameters - -- `--region TEXT`: AWS region to list clusters from -- `--namespace TEXT`: Kubernetes namespace -- `--output TEXT`: Output format (json, table) - -### hyp set-cluster-context - -Configure your local kubectl environment to interact with a specific SageMaker HyperPod cluster. - -#### Syntax - -```bash -hyp set-cluster-context [OPTIONS] -``` - -#### Required Parameters - -- `--cluster-name TEXT`: Name of the cluster to connect to - -### hyp get-cluster-context - -View information about the currently configured cluster context. - -#### Syntax - -```bash -hyp get-cluster-context -``` - -## Training Job Management Commands - -Commands for managing PyTorch training jobs. - -### hyp list hyp-pytorch-job - -List all HyperPod PyTorch jobs in a namespace. - -#### Syntax - -```bash -hyp list hyp-pytorch-job [OPTIONS] -``` - -#### Optional Parameters - -- `--namespace, -n TEXT`: Namespace to list jobs from (default: "default") - -### hyp describe hyp-pytorch-job - -Describe a specific HyperPod PyTorch job. - -#### Syntax - -```bash -hyp describe hyp-pytorch-job [OPTIONS] -``` - -#### Required Parameters - -- `--job-name TEXT`: Name of the job to describe - -#### Optional Parameters - -- `--namespace, -n TEXT`: Namespace of the job (default: "default") - -### hyp delete hyp-pytorch-job - -Delete a HyperPod PyTorch job. - -#### Syntax - -```bash -hyp delete hyp-pytorch-job [OPTIONS] -``` - -#### Required Parameters - -- `--job-name TEXT`: Name of the job to delete - -#### Optional Parameters - -- `--namespace, -n TEXT`: Namespace of the job (default: "default") - -### hyp list-pods hyp-pytorch-job - -List all pods associated with a PyTorch job. 
- -#### Syntax - -```bash -hyp list-pods hyp-pytorch-job [OPTIONS] -``` - -#### Required Parameters - -- `--job-name TEXT`: Name of the job to list pods for - -#### Optional Parameters - -- `--namespace, -n TEXT`: Namespace of the job (default: "default") - -### hyp get-logs hyp-pytorch-job - -Get logs from a specific pod in a PyTorch job. - -#### Syntax - -```bash -hyp get-logs hyp-pytorch-job [OPTIONS] -``` - -#### Required Parameters - -- `--job-name TEXT`: Name of the job -- `--pod-name TEXT`: Name of the pod to get logs from - -#### Optional Parameters - -- `--namespace, -n TEXT`: Namespace of the job (default: "default") - ## Inference Endpoint Management Commands Commands for managing inference endpoints. @@ -501,4 +320,4 @@ hyp get-operator-logs hyp-custom-endpoint [OPTIONS] | Parameter | Type | Description | Default | |-----------|------|-------------|---------| | `--namespace` | TEXT | Kubernetes namespace | Current context | -| `--help` | FLAG | Show command help | - | \ No newline at end of file +| `--help` | FLAG | Show command help | - | diff --git a/doc/cli_reference.md b/doc/cli_reference.md index be0c96c5..744ab4ed 100644 --- a/doc/cli_reference.md +++ b/doc/cli_reference.md @@ -6,8 +6,8 @@ :hidden: :maxdepth: 2 -cli_commands_reference -cli_commands_examples +cli_training +cli_inference ``` Complete reference for the SageMaker HyperPod Command Line Interface. @@ -16,22 +16,20 @@ Complete reference for the SageMaker HyperPod Command Line Interface. ::::{grid} 1 1 3 3 :gutter: 3 -:::{grid-item-card} CLI Commands Reference -:link: cli_commands_reference +:::{grid-item-card} Training CLI +:link: cli_training :link-type: ref :class-card: sd-border-secondary -**Complete Command Documentation** -Comprehensive reference for all available CLI commands, options, and parameters. +Training CLI commands, options and parameters. 
::: -:::{grid-item-card} CLI Commands Examples -:link: cli_commands_examples +:::{grid-item-card} Inference CLI +:link: cli_inference :link-type: ref -:class-card: sd-border-success +:class-card: sd-border-secondary -**Practical Usage Examples** -Real-world examples and common CLI workflows for training and inference tasks. +Inference CLI commands, options and parameters. ::: :::: diff --git a/doc/cli_training.md b/doc/cli_training.md new file mode 100644 index 00000000..70b4bdc0 --- /dev/null +++ b/doc/cli_training.md @@ -0,0 +1,163 @@ +(cli_training)= + +# Training CLI + +Complete reference for SageMaker HyperPod PyTorch training job parameters and configuration options. + +## hyp create hyp-pytorch-job + +Create distributed PyTorch training jobs on SageMaker HyperPod clusters. + +### Syntax + +```bash +hyp create hyp-pytorch-job [OPTIONS] +``` + +### Required Parameters + +- `--job-name TEXT`: Unique name for the training job (1-63 characters, alphanumeric with hyphens) +- `--image TEXT`: Docker image URI containing your training code + +### Optional Parameters + +- `--namespace TEXT`: Kubernetes namespace +- `--command ARRAY`: Command to run in the container (array of strings) +- `--args ARRAY`: Arguments for the entry script (array of strings) +- `--environment OBJECT`: Environment variables as key-value pairs +- `--pull-policy TEXT`: Image pull policy (Always, Never, IfNotPresent) +- `--instance-type TEXT`: Instance type for training +- `--node-count INTEGER`: Number of nodes (minimum: 1) +- `--tasks-per-node INTEGER`: Number of tasks per node (minimum: 1) +- `--label-selector OBJECT`: Node label selector as key-value pairs +- `--deep-health-check-passed-nodes-only BOOLEAN`: Schedule pods only on nodes that passed deep health check (default: false) +- `--scheduler-type TEXT`: Scheduler type +- `--queue-name TEXT`: Queue name for job scheduling (1-63 characters, alphanumeric with hyphens) +- `--priority TEXT`: Priority class for job scheduling +- `--max-retry 
INTEGER`: Maximum number of job retries (minimum: 0) +- `--volume ARRAY`: List of volume configurations (Refer to [Volume Configuration](#volume-configuration) for detailed parameter info) +- `--service-account-name TEXT`: Service account name + +### Volume Configuration + +The `--volume` parameter supports mounting different types of storage to your training containers. + +### Volume Syntax + +```bash +--volume name=<volume_name>,type=<volume_type>,mount_path=<mount_path>[,additional_options] +``` + +### Volume Types + +**hostPath Volume** +```bash +--volume name=model-data,type=hostPath,mount_path=/data,path=/host/data +``` + +**Persistent Volume Claim (PVC)** +```bash +--volume name=training-output,type=pvc,mount_path=/output,claim_name=training-pvc,read_only=false +``` + +### Volume Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `name` | TEXT | Yes | Volume name | +| `type` | TEXT | Yes | Volume type (`hostPath` or `pvc`) | +| `mount_path` | TEXT | Yes | Mount path in container | +| `path` | TEXT | For hostPath | Host path for hostPath volumes | +| `claim_name` | TEXT | For pvc | PVC claim name for pvc volumes | +| `read_only` | BOOLEAN | No | Read-only flag for pvc volumes | + +## Training Job Management Commands + +Commands for managing PyTorch training jobs. + +### hyp list hyp-pytorch-job + +List all HyperPod PyTorch jobs in a namespace. + +#### Syntax + +```bash +hyp list hyp-pytorch-job [OPTIONS] +``` + +#### Optional Parameters + +- `--namespace, -n TEXT`: Namespace to list jobs from (default: "default") + +### hyp describe hyp-pytorch-job + +Describe a specific HyperPod PyTorch job. + +#### Syntax + +```bash +hyp describe hyp-pytorch-job [OPTIONS] +``` + +#### Required Parameters + +- `--job-name TEXT`: Name of the job to describe + +#### Optional Parameters + +- `--namespace, -n TEXT`: Namespace of the job (default: "default") + +### hyp delete hyp-pytorch-job + +Delete a HyperPod PyTorch job. 
+ +#### Syntax + +```bash +hyp delete hyp-pytorch-job [OPTIONS] +``` + +#### Required Parameters + +- `--job-name TEXT`: Name of the job to delete + +#### Optional Parameters + +- `--namespace, -n TEXT`: Namespace of the job (default: "default") + +### hyp list-pods hyp-pytorch-job + +List all pods associated with a PyTorch job. + +#### Syntax + +```bash +hyp list-pods hyp-pytorch-job [OPTIONS] +``` + +#### Required Parameters + +- `--job-name TEXT`: Name of the job to list pods for + +#### Optional Parameters + +- `--namespace, -n TEXT`: Namespace of the job (default: "default") + +### hyp get-logs hyp-pytorch-job + +Get logs from a specific pod in a PyTorch job. + +#### Syntax + +```bash +hyp get-logs hyp-pytorch-job [OPTIONS] +``` + +#### Required Parameters + +- `--job-name TEXT`: Name of the job +- `--pod-name TEXT`: Name of the pod to get logs from + +#### Optional Parameters + +- `--namespace, -n TEXT`: Namespace of the job (default: "default") diff --git a/doc/conf.py b/doc/conf.py index 6d75ac16..c1398ba7 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -17,7 +17,10 @@ import shutil import sys import re +import json from pathlib import Path +from typing import Dict, List, Any, Optional + def run_apidoc(app): """Generate doc stubs using sphinx-apidoc.""" @@ -51,7 +54,7 @@ def run_apidoc(app): def setup(app): - """Register our sphinx-apidoc hook.""" + """Register our sphinx hooks.""" app.connect("builder-inited", run_apidoc) diff --git a/doc/examples.md b/doc/examples.md index e2ac9161..afda4a66 100644 --- a/doc/examples.md +++ b/doc/examples.md @@ -6,16 +6,45 @@ For detailed examples of training with HyperPod, see: -- CLI Training Example -- SDK Training Example +::::{grid} 1 2 2 2 +:gutter: 3 + +:::{grid-item-card} CLI Training Example +:link: https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/training/CLI/training-e2e-cli.ipynb +:class-card: sd-border-primary + +**Training Examples** Refer to the Training Example. 
+::: + +:::{grid-item-card} SDK Training Example +:link: https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/training/SDK/training_sdk_example.ipynb +:class-card: sd-border-primary + +**Training Examples** Refer to the Training SDK Example. +::: + +:::: + ## Inference Example Notebooks For detailed examples of inference with HyperPod, see: -- CLI Inference FSX Model Example -- CLI Inference JumpStart Model Example -- CLI Inference S3 Model Example -- SDK Inference FSX Model Example -- SDK Inference JumpStart Model Example -- SDK Inference S3 Model Example +::::{grid} 1 2 2 2 +:gutter: 3 + +:::{grid-item-card} CLI Inference Examples +- CLI Inference JumpStart Model Example +- CLI Inference FSX Model Example +- CLI Inference S3 Model Example + +::: + +:::{grid-item-card} SDK Inference Examples +- SDK Inference JumpStart Model Example +- SDK Inference FSX Model Example +- SDK Inference S3 Model Example + +::: + +:::: diff --git a/doc/index.md b/doc/index.md index d53ea5f3..57f38c65 100644 --- a/doc/index.md +++ b/doc/index.md @@ -23,33 +23,16 @@ CLI Reference Example Notebooks API reference <_apidoc/modules> ``` -:::::{grid} 2 -:gutter: 3 -:margin: 0 -::::{grid-item} -:columns: 8 Amazon Hyperpod helps you provision and manage resilient clusters optimized for large-scale machine learning (ML) workloads, including large language models (LLMs), diffusion models, and foundation models (FMs). To get started with Hyperpod, visit the [AWS Documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/hyperpod.html). -### What is SageMaker HyperPod CLI and SDK? -Amazon SageMaker HyperPod CLI and SDK are developer tools designed to simplify the management of distributed training workloads on dedicated, high-performance computing clusters. -:::: -::::{grid-item} -:columns: 4 ```{note} Version Info - you’re viewing latest documentation for SageMaker Hyperpod CLI and SDK v3.0.0. 
``` -:::: -::::: -:::::{grid} 2 -:gutter: 3 -:margin: 0 -::::{grid-item} -:columns: 8 ### Why Choose HyperPod CLI & SDK? Transform your AI/ML development process with Amazon SageMaker HyperPod CLI and SDK. These tools handle infrastructure management complexities, allowing you to focus on model development and innovation. Weather it's scaling your PyTorch training jobs across thousands of GPUs, deploying production-grade inference endpoints or managing multiple clusters efficiently; the intuitive command-line interface and programmatic control enable you to: @@ -57,20 +40,20 @@ Transform your AI/ML development process with Amazon SageMaker HyperPod CLI and - Automate ML workflows while maintaining operational visibility - Optimize computing resources across your AI/ML projects -:::: -::::{grid-item} -:columns: 4 + ```{admonition} What's New :class: important -**🚀 CLI and SDK!!** +🚀 We are excited to announce general availability of Amazon SageMaker HyperPod CLI and SDK! -Streamlined interfaces for Training, -Inference, and Cluster Monitoring. + +**Major Updates**: +- **Distributed Training**: Scale PyTorch jobs across multiple nodes and GPUs with simplified management and automatic fault tolerance. +- **Model Inference**: Deploy pre-trained models from SageMaker JumpStart and host custom auto-scaling inference endpoints. +- **Observability**: Connect to and manage multiple HyperPod clusters with enhanced monitoring capabilities. 
+- **Usability Improvements**: Intuitive CLI for quick experimentation and cluster management, granular SDK control over workload configurations and easy access to system logs and observability dashboards for efficient debugging ``` -:::: -::::: ## Quick Start From 238855bd6b02e50399438349e402f50220ce1c98 Mon Sep 17 00:00:00 2001 From: Roja Reddy Sareddy Date: Mon, 4 Aug 2025 01:15:44 -0700 Subject: [PATCH 5/8] Documentation Fixes --- doc/conf.py | 2 +- doc/index.md | 4 ++-- doc/installation.md | 23 ----------------------- 3 files changed, 3 insertions(+), 26 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index c1398ba7..6e2c4ba7 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -35,7 +35,7 @@ def run_apidoc(app): cmd = [ "--separate", "--module-first", - "--doc-project=API Reference", + "--doc-project=SDK API Reference", "-o", output_dir, module_dir, diff --git a/doc/index.md b/doc/index.md index 57f38c65..bcea943a 100644 --- a/doc/index.md +++ b/doc/index.md @@ -21,7 +21,7 @@ Training Inference CLI Reference Example Notebooks -API reference <_apidoc/modules> +SDK reference <_apidoc/modules> ``` @@ -35,7 +35,7 @@ Version Info - you’re viewing latest documentation for SageMaker Hyperpod CLI ### Why Choose HyperPod CLI & SDK? -Transform your AI/ML development process with Amazon SageMaker HyperPod CLI and SDK. These tools handle infrastructure management complexities, allowing you to focus on model development and innovation. Weather it's scaling your PyTorch training jobs across thousands of GPUs, deploying production-grade inference endpoints or managing multiple clusters efficiently; the intuitive command-line interface and programmatic control enable you to: +Transform your AI/ML development process with Amazon SageMaker HyperPod CLI and SDK. These tools handle infrastructure management complexities, allowing you to focus on model development and innovation. 
Whether it's scaling your PyTorch training jobs across thousands of GPUs, deploying production-grade inference endpoints or managing multiple clusters efficiently; the intuitive command-line interface and programmatic control enable you to: - Accelerate development cycles and reduce operational overhead - Automate ML workflows while maintaining operational visibility - Optimize computing resources across your AI/ML projects diff --git a/doc/installation.md b/doc/installation.md index f9dbf85c..77992f9a 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -60,26 +60,3 @@ To verify that the installation was successful, run: # Verify CLI installation hyp --help ``` - -### Install from GitHub - -For the latest development version or to contribute to the project, you can install directly from the GitHub repository: - -**Clone the SageMaker HyperPod CLI package from GitHub:** -```bash -git clone https://github.com/aws/sagemaker-hyperpod-cli.git -``` - -**Install the SageMaker HyperPod CLI:** -```bash -cd sagemaker-hyperpod-cli && pip install . -``` - -**Test if the SageMaker HyperPod CLI is successfully installed by running the following command:** -```bash -hyp --help -``` - -```{note} -The GitHub installation provides access to the latest features and bug fixes that may not yet be available in the PyPI release. However, it may be less stable than the official PyPI release. 
-``` \ No newline at end of file From 90205f5ee96d8a02eaab137cd78e46855304c977 Mon Sep 17 00:00:00 2001 From: Roja Reddy Sareddy Date: Mon, 4 Aug 2025 23:39:39 -0700 Subject: [PATCH 6/8] Documentation Fixes --- doc/advanced_resources.md | 52 +++++++++++++++++++++++++++++++++++++++ doc/api/api_index.rst | 12 +++++++++ doc/api/inference_api.rst | 15 +++++++++++ doc/api/training_api.rst | 17 +++++++++++++ doc/cli_inference.md | 2 +- doc/cli_training.md | 2 +- doc/conf.py | 26 +++++++++++++++----- doc/getting_started.md | 9 +++++++ doc/index.md | 44 +++++++++++++++++++++------------ 9 files changed, 155 insertions(+), 24 deletions(-) create mode 100644 doc/advanced_resources.md create mode 100644 doc/api/api_index.rst create mode 100644 doc/api/inference_api.rst create mode 100644 doc/api/training_api.rst diff --git a/doc/advanced_resources.md b/doc/advanced_resources.md new file mode 100644 index 00000000..c463fb0a --- /dev/null +++ b/doc/advanced_resources.md @@ -0,0 +1,52 @@ +(advanced_resources)= + +# Advanced Resources + +```{toctree} +:hidden: +:maxdepth: 2 + +examples +AWS SageMaker HyperPod Docs <https://docs.aws.amazon.com/sagemaker/latest/dg/hyperpod.html> +HyperPod Developer Guide <https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US> +SageMaker HyperPod Workshop <https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US> + +``` + +## Advanced Resources + +::::{grid} 1 2 2 2 +:gutter: 3 + +:::{grid-item-card} Github +:link: examples +:link-type: ref +:class-card: sd-border-secondary + +**Example Notebooks** - Ready-to-use implementation guides +::: + +:::{grid-item-card} AWS SageMaker HyperPod Docs +:link: https://docs.aws.amazon.com/sagemaker/latest/dg/hyperpod.html +:link-type: url +:class-card: sd-border-secondary + +**HyperPod Documentation** - Know more about HyperPod +::: + +:::{grid-item-card} HyperPod Developer Guide +:link: https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US +:class-card: sd-border-secondary + +**Developer 
Guide** - Refer to this practical development guide +::: + +:::{grid-item-card} SageMaker HyperPod Workshop +:link: https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US +:class-card: 
sd-border-secondary + +**Practical Guide** - Refer to the workshop for detailed follow-through steps +::: + + +:::: diff --git a/doc/api/api_index.rst b/doc/api/api_index.rst new file mode 100644 index 00000000..a5991817 --- /dev/null +++ b/doc/api/api_index.rst @@ -0,0 +1,12 @@ +############# +SDK Reference +############# + +.. toctree:: + :maxdepth: 1 + + training_api + inference_api + + + diff --git a/doc/api/inference_api.rst b/doc/api/inference_api.rst new file mode 100644 index 00000000..367d5958 --- /dev/null +++ b/doc/api/inference_api.rst @@ -0,0 +1,15 @@ +Inference +------------ + +.. automodule:: sagemaker.hyperpod.inference.hp_endpoint + :members: + :undoc-members: + :show-inheritance: + +HPEndpoint Configs +------------ + +.. automodule:: sagemaker.hyperpod.inference.config.hp_endpoint_config + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/api/training_api.rst b/doc/api/training_api.rst new file mode 100644 index 00000000..1e1580ef --- /dev/null +++ b/doc/api/training_api.rst @@ -0,0 +1,17 @@ +Training +------------ + +.. automodule:: sagemaker.hyperpod.training.hyperpod_pytorch_job + :members: + :undoc-members: + :show-inheritance: + +HyperPodPytorchJob Configs +------------ + +.. automodule:: sagemaker.hyperpod.training.config.hyperpod_pytorch_job_unified_config + :members: + :undoc-members: + :show-inheritance: + + diff --git a/doc/cli_inference.md b/doc/cli_inference.md index 0917f2d3..77be2626 100644 --- a/doc/cli_inference.md +++ b/doc/cli_inference.md @@ -1,6 +1,6 @@ (cli_inference)= -# Inference CLI +# Inference Complete reference for SageMaker HyperPod PyTorch training job parameters and configuration options. diff --git a/doc/cli_training.md b/doc/cli_training.md index 70b4bdc0..f34123a8 100644 --- a/doc/cli_training.md +++ b/doc/cli_training.md @@ -1,6 +1,6 @@ (cli_training)= -# Training CLI +# Training Complete reference for SageMaker HyperPod PyTorch training job parameters and configuration options. 
diff --git a/doc/conf.py b/doc/conf.py index 6e2c4ba7..be0dbe34 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -55,7 +55,7 @@ def run_apidoc(app): def setup(app): """Register our sphinx hooks.""" - app.connect("builder-inited", run_apidoc) + #app.connect("builder-inited", run_apidoc) # Get version from setup.py @@ -99,23 +99,27 @@ def get_version(): "myst_nb", "sphinx_design", "sphinx_tabs.tabs", - "sphinx_copybutton" + "sphinx_copybutton", + "sphinx.ext.autosummary", + "sphinx.ext.autosectionlabel", ] +''' # Mock modules that might not be available during documentation build autodoc_mock_imports = [ 'sagemaker.hyperpod.training.config.hyperpod_pytorch_job_config', - 'hyperpod_pytorch_job_template.registry' -] + 'hyperpod_pytorch_job_template.registry', +]''' +autodoc_mock_imports = ["pyspark", "feature_store_pyspark", "py4j"] source_suffix = { '.rst': 'restructuredtext', '.ipynb': 'myst-nb', '.md': 'myst-nb', } -master_doc = "index" -autoclass_content = "class" +autoclass_content = "both" +autodoc_default_flags = ["show-inheritance", "members", "undoc-members"] autodoc_member_order = "bysource" default_role = "py:obj" @@ -173,3 +177,13 @@ def get_version(): myst_substitutions = { "version": version, } + +# Automatically extract typehints when specified and place them in +# descriptions of the relevant function/method. +autodoc_typehints = "description" + +# autosummary +autosummary_generate = True + +# autosectionlabel +autosectionlabel_prefix_document = True \ No newline at end of file diff --git a/doc/getting_started.md b/doc/getting_started.md index e0261870..a7b34103 100644 --- a/doc/getting_started.md +++ b/doc/getting_started.md @@ -2,6 +2,15 @@ # Getting Started +```{toctree} +:hidden: +:maxdepth: 1 + +Training +Inference + +``` + This guide will help you get started with the SageMaker HyperPod CLI and SDK to perform basic operations. 
## List Available Clusters diff --git a/doc/index.md b/doc/index.md index bcea943a..48f04c29 100644 --- a/doc/index.md +++ b/doc/index.md @@ -17,29 +17,19 @@ keywords: Installation Getting Started -Training -Inference CLI Reference -Example Notebooks -SDK reference <_apidoc/modules> +SDK reference +Advanced Resources ``` - -Amazon Hyperpod helps you provision and manage resilient clusters optimized for large-scale machine learning (ML) workloads, including large language models (LLMs), diffusion models, and foundation models (FMs). -To get started with Hyperpod, visit the [AWS Documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/hyperpod.html). - - -```{note} -Version Info - you’re viewing latest documentation for SageMaker Hyperpod CLI and SDK v3.0.0. -``` - -### Why Choose HyperPod CLI & SDK? - Transform your AI/ML development process with Amazon SageMaker HyperPod CLI and SDK. These tools handle infrastructure management complexities, allowing you to focus on model development and innovation. Whether it's scaling your PyTorch training jobs across thousands of GPUs, deploying production-grade inference endpoints or managing multiple clusters efficiently; the intuitive command-line interface and programmatic control enable you to: - Accelerate development cycles and reduce operational overhead - Automate ML workflows while maintaining operational visibility - Optimize computing resources across your AI/ML projects +```{note} +Version Info - you’re viewing latest documentation for SageMaker Hyperpod CLI and SDK v3.0.0. 
+``` ```{admonition} What's New :class: important @@ -101,7 +91,7 @@ Transform your AI/ML development process with Amazon SageMaker HyperPod CLI and :gutter: 3 :::{grid-item-card} API reference -:link: _apidoc/modules.html +:link: api/api_index.html :class-card: sd-border-primary **Explore APIs** - Checkout API Documentation @@ -115,4 +105,26 @@ Transform your AI/ML development process with Amazon SageMaker HyperPod CLI and **Example Notebooks** - Ready-to-use implementation guides ::: +:::{grid-item-card} AWS SageMaker HyperPod Docs +:link: https://docs.aws.amazon.com/sagemaker/latest/dg/hyperpod.html +:class-card: sd-border-secondary + +**HyperPod Documentation** - Know more about HyperPod +::: + +:::{grid-item-card} HyperPod Developer Guide +:link: https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US +:class-card: sd-border-secondary + +**Developer Guide** - Refer to this practical development guide +::: + +:::{grid-item-card} SageMaker HyperPod Workshop +:link: https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US +:class-card: sd-border-secondary + +**Practical Guide** - Refer to the workshop for detailed follow-through steps +::: + + :::: From 85446cdf658445d6a72162dcb126f0f3676bf07e Mon Sep 17 00:00:00 2001 From: Roja Reddy Sareddy Date: Tue, 5 Aug 2025 02:09:10 -0700 Subject: [PATCH 7/8] Documentation Fixes --- doc/api/api_index.rst | 27 +++++++++-- doc/api/inference/hp_endpoint.rst | 45 +++++++++++++++++++ doc/api/inference_api.rst | 15 ------- doc/api/metadata.rst | 7 +++ .../hyperpod_pytorch_job.rst} | 13 ++++-- doc/cli_inference.md | 22 ++++++++- doc/cli_training.md | 9 +++- doc/conf.py | 42 ++--------------- doc/index.md | 3 ++ doc/installation.md | 2 +- 10 files changed, 122 insertions(+), 63 deletions(-) create mode 100644 doc/api/inference/hp_endpoint.rst delete mode 100644 doc/api/inference_api.rst create mode 100644 doc/api/metadata.rst rename doc/api/{training_api.rst => training/hyperpod_pytorch_job.rst} (68%) diff --git 
a/doc/api/api_index.rst b/doc/api/api_index.rst index a5991817..b5d37197 100644 --- a/doc/api/api_index.rst +++ b/doc/api/api_index.rst @@ -3,10 +3,31 @@ SDK Reference ############# .. toctree:: - :maxdepth: 1 + :hidden: + :maxdepth: 2 - training_api - inference_api + training/hyperpod_pytorch_job + inference/hp_endpoint +Complete reference for the SageMaker HyperPod SDK. + +.. container:: + + .. grid:: 1 1 3 3 + :gutter: 3 + + .. grid-item-card:: Training SDK + :link: training/hyperpod_pytorch_job + :link-type: doc + :class-card: sd-border-secondary + + Training SDK classes, methods and parameters. + + .. grid-item-card:: Inference SDK + :link: inference/hp_endpoint + :link-type: doc + :class-card: sd-border-secondary + + Inference SDK classes, methods and parameters. diff --git a/doc/api/inference/hp_endpoint.rst b/doc/api/inference/hp_endpoint.rst new file mode 100644 index 00000000..53afbad0 --- /dev/null +++ b/doc/api/inference/hp_endpoint.rst @@ -0,0 +1,45 @@ +Inference +=========== + +* `HPEndpointBase`_ +* `HPEndpoint`_ +* `HPJumpStartEndpoint`_ +* `HPEndpoint Configs`_ + + +HPEndpointBase +------------------- + +.. automodule:: sagemaker.hyperpod.inference.hp_endpoint_base + :members: + :undoc-members: + :show-inheritance: + +HPEndpoint +------------------- + +.. automodule:: sagemaker.hyperpod.inference.hp_endpoint + :members: + :undoc-members: + :show-inheritance: + +HPJumpStartEndpoint +--------------------- + +.. automodule:: sagemaker.hyperpod.inference.hp_jumpstart_endpoint + :members: + :undoc-members: + :show-inheritance: + +HPEndpoint Configs +------------------- + +.. automodule:: sagemaker.hyperpod.inference.config.hp_endpoint_config + :members: + :undoc-members: + :show-inheritance: + +.. 
automodule:: sagemaker.hyperpod.inference.config.hp_jumpstart_endpoint_config + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/api/inference_api.rst b/doc/api/inference_api.rst deleted file mode 100644 index 367d5958..00000000 --- a/doc/api/inference_api.rst +++ /dev/null @@ -1,15 +0,0 @@ -Inference ------------- - -.. automodule:: sagemaker.hyperpod.inference.hp_endpoint - :members: - :undoc-members: - :show-inheritance: - -HPEndpoint Configs ------------- - -.. automodule:: sagemaker.hyperpod.inference.config.hp_endpoint_config - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/metadata.rst b/doc/api/metadata.rst new file mode 100644 index 00000000..6ae5472d --- /dev/null +++ b/doc/api/metadata.rst @@ -0,0 +1,7 @@ +Metadata +------------ + +.. automodule:: sagemaker.hyperpod.common.config.metadata + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/api/training_api.rst b/doc/api/training/hyperpod_pytorch_job.rst similarity index 68% rename from doc/api/training_api.rst rename to doc/api/training/hyperpod_pytorch_job.rst index 1e1580ef..6a33dddd 100644 --- a/doc/api/training_api.rst +++ b/doc/api/training/hyperpod_pytorch_job.rst @@ -1,17 +1,24 @@ Training ------------- +=========== + +* `HyperPodPytorchJob`_ +* `HyperPodPytorchJob Configs`_ + + +HyperPodPytorchJob +------------------- .. automodule:: sagemaker.hyperpod.training.hyperpod_pytorch_job :members: :undoc-members: :show-inheritance: + HyperPodPytorchJob Configs ------------- +--------------------------- .. automodule:: sagemaker.hyperpod.training.config.hyperpod_pytorch_job_unified_config :members: :undoc-members: :show-inheritance: - diff --git a/doc/cli_inference.md b/doc/cli_inference.md index 3316de7c..1c79a706 100644 --- a/doc/cli_inference.md +++ b/doc/cli_inference.md @@ -2,8 +2,28 @@ # Inference +Complete reference for SageMaker HyperPod inference parameters and configuration options. 
+ +* [Create JumpStart Endpoint](#hyp-create-hyp-jumpstart-endpoint) +* [Create Custom Endpoint](#hyp-create-hyp-custom-endpoint) + +* [List JumpStart Endpoints](#hyp-list-hyp-jumpstart-endpoint) +* [List Custom Endpoints](#hyp-list-hyp-custom-endpoint) +* [Describe JumpStart Endpoint](#hyp-describe-hyp-jumpstart-endpoint) +* [Describe Custom Endpoint](#hyp-describe-hyp-custom-endpoint) +* [Invoke JumpStart Endpoint](#hyp-invoke-hyp-jumpstart-endpoint) +* [Invoke Custom Endpoint](#hyp-invoke-hyp-custom-endpoint) +* [Delete JumpStart Endpoint](#hyp-delete-hyp-jumpstart-endpoint) +* [Delete Custom Endpoint](#hyp-delete-hyp-custom-endpoint) + +* [List JumpStart Pods](#hyp-list-pods-hyp-jumpstart-endpoint) +* [List Custom Pods](#hyp-list-pods-hyp-custom-endpoint) +* [Get JumpStart Logs](#hyp-get-logs-hyp-jumpstart-endpoint) +* [Get Custom Logs](#hyp-get-logs-hyp-custom-endpoint) +* [Get JumpStart Operator Logs](#hyp-get-operator-logs-hyp-jumpstart-endpoint) +* [Get Custom Operator Logs](#hyp-get-operator-logs-hyp-custom-endpoint) + -Complete reference for SageMaker HyperPod PyTorch training job parameters and configuration options. ## hyp create hyp-jumpstart-endpoint diff --git a/doc/cli_training.md b/doc/cli_training.md index 717022e7..1d4520b7 100644 --- a/doc/cli_training.md +++ b/doc/cli_training.md @@ -3,9 +3,16 @@ # Training - Complete reference for SageMaker HyperPod PyTorch training job parameters and configuration options. +* [Create PyTorch Job](#hyp-create-hyp-pytorch-job) +* [List Jobs](#hyp-list-hyp-pytorch-job) +* [Describe Job](#hyp-describe-hyp-pytorch-job) +* [Delete Job](#hyp-delete-hyp-pytorch-job) +* [List Pods](#hyp-list-pods-hyp-pytorch-job) +* [Get Logs](#hyp-get-logs-hyp-pytorch-job) + + ## hyp create hyp-pytorch-job Create distributed PyTorch training jobs on SageMaker HyperPod clusters. 
diff --git a/doc/conf.py b/doc/conf.py index 3074b70b..897101d6 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -22,42 +22,10 @@ from typing import Dict, List, Any, Optional -def run_apidoc(app): - """Generate doc stubs using sphinx-apidoc.""" - module_dir = os.path.join(app.srcdir, "../src/") - output_dir = os.path.join(app.srcdir, "_apidoc") - excludes = [] - - # Ensure that any stale apidoc files are cleaned up first. - if os.path.exists(output_dir): - shutil.rmtree(output_dir) - - cmd = [ - "--separate", - "--module-first", - "--doc-project=SDK API Reference", - "-o", - output_dir, - module_dir, - ] - cmd.extend(excludes) - - try: - from sphinx.ext import apidoc # Sphinx >= 1.7 - - apidoc.main(cmd) - except ImportError: - from sphinx import apidoc # Sphinx < 1.7 - - cmd.insert(0, apidoc.__file__) - apidoc.main(cmd) - def setup(app): """Register our sphinx hooks.""" - #app.connect("builder-inited", run_apidoc) - # Get version from setup.py def get_version(): @@ -105,12 +73,6 @@ def get_version(): "sphinx.ext.autosectionlabel", ] -''' -# Mock modules that might not be available during documentation build -autodoc_mock_imports = [ - 'sagemaker.hyperpod.training.config.hyperpod_pytorch_job_config', - 'hyperpod_pytorch_job_template.registry', -]''' autodoc_mock_imports = ["pyspark", "feature_store_pyspark", "py4j"] source_suffix = { @@ -150,7 +112,9 @@ def get_version(): htmlhelp_basename = "{}doc".format(project) html_static_path = ["_static"] -html_css_files = ["custom.css"] +html_css_files = ["custom.css", + "search_accessories.css", + ] napoleon_use_rtype = False # nbsphinx configuration diff --git a/doc/index.md b/doc/index.md index acac9065..8551d445 100644 --- a/doc/index.md +++ b/doc/index.md @@ -109,6 +109,7 @@ Version Info - you’re viewing latest documentation for SageMaker Hyperpod CLI :::{grid-item-card} AWS SageMaker HyperPod Docs :link: https://docs.aws.amazon.com/sagemaker/latest/dg/hyperpod.html +:link-type: url :class-card: sd-border-secondary 
**HyperPod Documentation** - Know more about HyperPod @@ -116,6 +117,7 @@ Version Info - you’re viewing latest documentation for SageMaker Hyperpod CLI :::{grid-item-card} HyperPod Developer Guide :link: https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US +:link-type: url :class-card: sd-border-secondary **Developer Guide** - Refer to this practical development guide @@ -123,6 +125,7 @@ Version Info - you’re viewing latest documentation for SageMaker Hyperpod CLI :::{grid-item-card} SageMaker HyperPod Workshop :link: https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US +:link-type: url :class-card: sd-border-secondary **Practical Guide** - Refer to the workshop for detailed follow-through steps diff --git a/doc/installation.md b/doc/installation.md index 77992f9a..041fae08 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -1,5 +1,5 @@ (installation)= -# Get Started +# Installation This guide provides installation instructions for the SageMaker HyperPod CLI and SDK. 
## System Requirements From 5b72d3ab235bb7b2e9d547b749dca0ab795bf6aa Mon Sep 17 00:00:00 2001 From: Roja Reddy Sareddy Date: Tue, 5 Aug 2025 16:22:52 -0700 Subject: [PATCH 8/8] Documentation branch conflict Fixes --- .../health-monitoring-agent/values.yaml | 32 ++++++++++++++++++- helm_chart/readme.md | 26 +++++++-------- 2 files changed, 44 insertions(+), 14 deletions(-) diff --git a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml index 6622f1cf..c54910e1 100644 --- a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml +++ b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml @@ -1,2 +1,32 @@ namespace: "aws-hyperpod" -hmaimage: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0" \ No newline at end of file + +# AWS region for the health monitoring agent ECR image +# The chart automatically detects the region from Kubernetes cluster context. +# Only specify this if you want to override the automatic detection. +# +# Automatic detection priority: +# 1. This explicit region setting (highest priority) +# 2. Global region setting (global.region) +# 3. Kubernetes cluster context detection: +# - EKS API server URL patterns +# - Node topology labels (topology.kubernetes.io/region) +# - AWS provider IDs in node specifications +# - Legacy region labels (failure-domain.beta.kubernetes.io/region) +# 4. 
Default fallback: us-west-2 +# +# Supported regions: us-east-1, us-west-2, us-east-2, us-west-1, eu-central-1, +# eu-north-1, eu-west-1, eu-west-2, ap-northeast-1, ap-south-1, ap-southeast-1, +# ap-southeast-2, sa-east-1 +region: "" + +# Image tag for the health monitoring agent +# If not specified, falls back to global.imageTag, then to the hardcoded default version +imageTag: "" + +# Override the health monitoring agent image URI +# If specified, this will override the automatic region-based URI selection +# Example: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0" +hmaimage: "" + +# Enable debug output for the region selection process +debug: true \ No newline at end of file diff --git a/helm_chart/readme.md b/helm_chart/readme.md index 2b6fe6e5..b6a47b48 100644 --- a/helm_chart/readme.md +++ b/helm_chart/readme.md @@ -171,19 +171,19 @@ helm upgrade dependencies helm_chart/HyperPodHelmChart --namespace kube-system - Training job auto resume is expected to work with Kubeflow training operator release v1.7.0, v1.8.0, v1.8.1 https://github.com/kubeflow/training-operator/releases - If you intend to use the Health Monitoring Agent container image from another region, please see below list to find relevant region's URI. 
``` - IAD 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - PDX 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - CMH 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - SFO 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - FRA 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - ARN 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - DUB 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - LHR 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - NRT 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - BOM 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - SIN 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - SYD 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - GRU 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + IAD 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 + PDX 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 + CMH 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 + SFO 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 + FRA 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 + ARN 
654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 + DUB 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 + LHR 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 + NRT 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 + BOM 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 + SIN 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 + SYD 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 + GRU 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 ``` ## 7. Troubleshooting