Merge remote-tracking branch 'origin/local-telemetry-agent' into obj-…

…store-otel-export
run-house · Sep 11, 2024 · aff0d6f · aff0d6f
2 parents 1e3e54a + 2f52ff7
commit aff0d6f
Show file tree

Hide file tree

Showing 73 changed files with 1,136 additions and 4,754 deletions.
diff --git a/.github/workflows/local_den_unit_tests.yaml b/.github/workflows/local_den_unit_tests.yaml
@@ -2,10 +2,6 @@ name: Local Den Unit Tests
 
 on: workflow_dispatch
 
-env:
-  TELEMETRY_COLLECTOR_ENDPOINT: localhost:4316
-  TELEMETRY_COLLECTOR_STATUS_URL: http://localhost:13134
-
 jobs:
   local-den-tests:
     runs-on: ubuntu-latest

diff --git a/.github/workflows/local_tests.yaml b/.github/workflows/local_tests.yaml
@@ -8,8 +8,6 @@ on:
 env:
   API_SERVER_URL: https://api.run.house
   RH_LOG_LEVEL: INFO
-  TELEMETRY_COLLECTOR_ENDPOINT: localhost:4316
-  TELEMETRY_COLLECTOR_STATUS_URL: http://localhost:13134
 
 jobs:
   # TODO: THESE ARE ONLY SEPARATE JOBS BECAUSE THERE ARE

diff --git a/.github/workflows/local_tests_den_dev.yaml b/.github/workflows/local_tests_den_dev.yaml
@@ -5,8 +5,6 @@ on:
 
 env:
   API_SERVER_URL: https://api-dev.run.house
-  TELEMETRY_COLLECTOR_ENDPOINT: localhost:4316
-  TELEMETRY_COLLECTOR_STATUS_URL: http://localhost:13134
 
 jobs:
   # TODO: THESE ARE ONLY SEPARATE JOBS BECAUSE THERE ARE

diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,2 +1 @@
 include runhouse/builtins/*
-include runhouse/resources/hardware/sagemaker/*
diff --git a/README.md b/README.md
@@ -110,7 +110,6 @@ Please reach out (first name at run.house) if you don't see your favorite comput
   - Amazon Web Services (AWS)
     - EC2 - **Supported**
     - EKS - **Supported**
-    - SageMaker - **Supported**
     - Lambda - **Alpha**
   - Google Cloud Platform (GCP)
     - GCE - **Supported**

diff --git a/docker/telemetry-collector/Dockerfile b/docker/telemetry-collector/Dockerfile
@@ -1,4 +1,4 @@
-FROM otel/opentelemetry-collector-contrib:latest
+FROM otel/opentelemetry-collector-contrib:0.108.0
 
 # Define an argument for the config file path
 ARG CONFIG_FILE=otel-collector-config.yaml

diff --git a/docs/api/python/cluster.rst b/docs/api/python/cluster.rst
@@ -1,9 +1,9 @@
 Cluster
 =======
 A Cluster is a Runhouse primitive used for abstracting a particular hardware configuration.
-This can be either an :ref:`on-demand cluster <OnDemandCluster Class>` (requires valid cloud credentials), a
-:ref:`BYO (bring-your-own) cluster <Cluster Class>` (requires IP address and ssh creds), or a
-:ref:`SageMaker cluster <SageMakerCluster Class>` (requires an ARN role).
+This can be either an :ref:`on-demand cluster <OnDemandCluster Class>` (requires valid cloud credentials or a
+local Kube config if launching on Kubernetes), or a
+:ref:`BYO (bring-your-own) cluster <Cluster Class>` (requires IP address and ssh creds).
 
 A cluster is assigned a name, through which it can be accessed and reused later on.
 
@@ -14,8 +14,6 @@ Cluster Factory Methods
 
 .. autofunction:: runhouse.ondemand_cluster
 
-.. autofunction:: runhouse.sagemaker_cluster
-
 Cluster Class
 ~~~~~~~~~~~~~
 
@@ -75,141 +73,6 @@ See the `SkyPilot docs <https://skypilot.readthedocs.io/en/latest/cloud-setup/cl
 for more details on configuring a VPC.
 
 
-SageMakerCluster Class
-~~~~~~~~~~~~~~~~~~~~~~
-.. note::
-
-    SageMaker support is an alpha and under active development. Please report any bugs or let us know of any
-    feature requests.
-
-A SageMakerCluster is a cluster that uses a SageMaker instance under the hood.
-
-Runhouse currently supports two core usage paths for SageMaker clusters:
-
-- **Compute backend**: You can use SageMaker as a compute backend, just as you would a
-  :ref:`BYO (bring-your-own) <Cluster Class>` or an :ref:`on-demand cluster <OnDemandCluster Class>`.
-  Runhouse will handle launching the SageMaker compute and creating the SSH connection
-  to the cluster.
-
-- **Dedicated training jobs**: You can use a SageMakerCluster class to run a training job on SageMaker compute.
-  To do so, you will need to provide an
-  `estimator <https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html>`__.
-
-.. note::
-
-    Runhouse requires an AWS IAM role (either name or full ARN) whose credentials have adequate permissions to
-    create create SageMaker endpoints and access AWS resources.
-
-    Please see :ref:`SageMaker Hardware Setup` for more specific instructions and
-    requirements for providing the role and setting up the cluster.
-
-.. autoclass:: runhouse.SageMakerCluster
-   :members:
-   :exclude-members:
-
-    .. automethod:: __init__
-
-SageMaker Hardware Setup
-------------------------
-
-IAM Role
-^^^^^^^^
-
-SageMaker clusters require `AWS CLI V2 <https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-welcome.html>`__ and
-configuring the SageMaker IAM role with the
-`AWS Systems Manager <https://docs.aws.amazon.com/systems-manager/latest/userguide/ssm-agent.html>`__.
-
-
-In order to launch a cluster, you must grant SageMaker the necessary permissions with an IAM role, which
-can be provided either by name or by full ARN. You can also specify a profile explicitly or
-with the :code:`AWS_PROFILE` environment variable.
-
-For example, let's say your local :code:`~/.aws/config` file contains:
-
-.. code-block:: ini
-
-    [profile sagemaker]
-    role_arn = arn:aws:iam::123456789:role/service-role/AmazonSageMaker-ExecutionRole-20230717T192142
-    region = us-east-1
-    source_profile = default
-
-There are several ways to provide the necessary credentials when :ref:`initializing the cluster <Cluster Factory Methods>`:
-
-- Providing the AWS profile name: :code:`profile="sagemaker"`
-- Providing the AWS Role ARN directly: :code:`role="arn:aws:iam::123456789:role/service-role/AmazonSageMaker-ExecutionRole-20230717T192142"`
-- Environment Variable: setting :code:`AWS_PROFILE` to :code:`"sagemaker"`
-
-.. note::
-
-    If no role or profile is provided, Runhouse will try using the :code:`default` profile. Note if this default AWS
-    identity is not a role, then you will need to provide the :code:`role` or :code:`profile` explicitly.
-
-.. tip::
-
-    If you are providing an estimator, you must provide the role ARN explicitly as part of the estimator object.
-    More info on estimators `here <https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html>`__.
-
-Please see the `AWS docs <https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html>`__ for further
-instructions on creating and configuring an ARN Role.
-
-
-AWS CLI V2
-^^^^^^^^^^
-
-The SageMaker SDK uses AWS CLI V2, which must be installed on your local machine. Doing so requires one of two steps:
-
-- `Migrate from V1 to V2 <https://docs.aws.amazon.com/cli/latest/userguide/cliv2-migration-instructions.html#cliv2-migration-instructions-migrate>`_
-
-- `Install V2 <https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html>`_
-
-
-To confirm the installation succeeded, run ``aws --version`` in the command line. You should see something like:
-
-.. code-block::
-
-    $ aws-cli/2.13.8 Python/3.11.4 Darwin/21.3.0 source/arm64 prompt/off
-
-If you are still seeing the V1 version, first try uninstalling V1 in case it is still present
-(e.g. ``pip uninstall awscli``).
-
-You may also need to add the V2 executable to the PATH of your python environment. For example, if you are using conda,
-it’s possible the conda env will try using its own version of the AWS CLI located at a different
-path (e.g. ``/opt/homebrew/anaconda3/bin/aws``), while the system wide installation of AWS CLI is located somewhere
-else (e.g. ``/opt/homebrew/bin/aws``).
-
-To find the global AWS CLI path:
-
-.. code-block::
-
-    $ which aws
-
-To ensure that the global AWS CLI version is used within your python environment, you’ll need to adjust the
-PATH environment variable so that it prioritizes the global AWS CLI path.
-
-.. code-block::
-
-    $ export PATH=/opt/homebrew/bin:$PATH
-
-
-SSM Setup
-^^^^^^^^^
-The AWS Systems Manager service is used to create SSH tunnels with the SageMaker cluster.
-
-To install the AWS Session Manager Plugin, please see the `AWS docs <https://docs.aws.amazon.com/systems-manager/latest/userguide/session-manager-working-with-install-plugin.html>`_
-or `SageMaker SSH Helper <https://github.com/aws-samples/sagemaker-ssh-helper#step-4-connect-over-ssm>`__. The SSH Helper package
-simplifies the process of creating SSH tunnels with SageMaker clusters. It is installed by default if
-you are installing Runhouse with the SageMaker dependency: :code:`pip install runhouse[sagemaker]`.
-
-You can also install the Session Manager by running the CLI command:
-
-.. code-block::
-
-    $ sm-local-configure
-
-To configure your SageMaker IAM role with the AWS Systems Manager, please
-refer to `these instructions <https://github.com/aws-samples/sagemaker-ssh-helper/blob/main/IAM_SSM_Setup.md>`__.
-
-
 Cluster Authentication & Verification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Runhouse provides a couple of options to manage the connection to the Runhouse API server running on a cluster.
@@ -228,10 +91,6 @@ be started on the cluster on port :code:`32300`.
 - ``none``: Does not use any port forwarding or enforce any authentication. Connects to the cluster with HTTP by
   default on port :code:`80`. This is useful when connecting to a cluster within a VPC, or creating a tunnel manually
   on the side with custom settings.
-- ``aws_ssm``: Uses the
-  `AWS Systems Manager <https://docs.aws.amazon.com/systems-manager/latest/userguide/what-is-systems-manager.html>`__ to
-  create an SSH tunnel to the cluster, by default on port :code:`32300`. *Note: this is currently only relevant
-  for SageMaker Clusters.*
 
 
 .. note::

diff --git a/docs/docker-setup.rst b/docs/docker-setup.rst
@@ -17,8 +17,7 @@ is automatically built and set up remotely on the cluster. The Runhouse
 server will start directly inside the remote container.
 
 **NOTE:** This guide details the setup and usage for on-demand clusters
-only. Docker container is also supported for Sagemaker clusters, and it
-is not yet supported for static clusters.
+only. Docker containers are not yet supported for static clusters.
 
 Cluster & Docker Setup
 ----------------------

diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -4,7 +4,6 @@ pint==0.20.1
 pyarrow==9.0.0
 pydata-sphinx-theme==0.13.3
 ray>=2.2.0
-sagemaker
 sentry-sdk==1.28.1
 sphinx-book-theme==1.0.1
 sphinx-click==4.3.0

diff --git a/docs/tutorials/api-clusters.rst b/docs/tutorials/api-clusters.rst
@@ -95,7 +95,7 @@ remotely on your AWS instance.
 
 
 On-Demand Clusters within Existing Cloud VPC
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 If you would like to launch on-demand clusters using existing VPCs,
 you can easily set it up by configuring SkyPilot. Without setting VPC,
 we launch in the default VPC in the region of the cluster. If you do

diff --git a/docs/tutorials/api-resources.rst b/docs/tutorials/api-resources.rst
@@ -276,16 +276,16 @@ to notify them.
 
     INFO | 2024-08-18 06:51:39.797150 | Saving config for aws-cpu-ssh-secret to Den
     INFO | 2024-08-18 06:51:39.972763 | Saving secrets for aws-cpu-ssh-secret to Vault
-    INFO | 2024-08-18 06:51:40.190996 | Saving config to RNS: {'name': '/jlewitt1/aws-cpu_default_env', 'resource_type': 'env', 'resource_subtype': 'Env', 'provenance': None, 'visibility': 'private', 'env_vars': {}, 'env_name': 'aws-cpu_default_env', 'compute': {}, 'reqs': ['ray==2.30.0'], 'working_dir': None}
-    INFO | 2024-08-18 06:51:40.368442 | Saving config to RNS: {'name': '/jlewitt1/aws-cpu', 'resource_type': 'cluster', 'resource_subtype': 'OnDemandCluster', 'provenance': None, 'visibility': 'private', 'ips': ['3.14.144.103'], 'server_port': 32300, 'server_connection_type': 'ssh', 'den_auth': False, 'ssh_port': 22, 'client_port': 32300, 'creds': '/jlewitt1/aws-cpu-ssh-secret', 'api_server_url': 'https://api.run.house', 'default_env': '/jlewitt1/aws-cpu_default_env', 'instance_type': 'CPU:2+', 'provider': 'aws', 'open_ports': [], 'use_spot': False, 'image_id': 'docker:nvcr.io/nvidia/pytorch:23.10-py3', 'region': 'us-east-2', 'stable_internal_external_ips': [('172.31.5.134', '3.14.144.103')], 'sky_kwargs': {'launch': {'retry_until_up': True}}, 'launched_properties': {'cloud': 'aws', 'instance_type': 'm6i.large', 'region': 'us-east-2', 'cost_per_hour': 0.096, 'docker_user': 'root'}, 'autostop_mins': -1}
+    INFO | 2024-08-18 06:51:40.190996 | Saving config to RNS: {'name': '/jlewitt1/aws-cpu_default_env', 'resource_type': 'env', 'resource_subtype': 'Env', 'visibility': 'private', 'env_vars': {}, 'env_name': 'aws-cpu_default_env', 'compute': {}, 'reqs': ['ray==2.30.0'], 'working_dir': None}
+    INFO | 2024-08-18 06:51:40.368442 | Saving config to RNS: {'name': '/jlewitt1/aws-cpu', 'resource_type': 'cluster', 'resource_subtype': 'OnDemandCluster', 'visibility': 'private', 'ips': ['3.14.144.103'], 'server_port': 32300, 'server_connection_type': 'ssh', 'den_auth': False, 'ssh_port': 22, 'client_port': 32300, 'creds': '/jlewitt1/aws-cpu-ssh-secret', 'api_server_url': 'https://api.run.house', 'default_env': '/jlewitt1/aws-cpu_default_env', 'instance_type': 'CPU:2+', 'provider': 'aws', 'open_ports': [], 'use_spot': False, 'image_id': 'docker:nvcr.io/nvidia/pytorch:23.10-py3', 'region': 'us-east-2', 'stable_internal_external_ips': [('172.31.5.134', '3.14.144.103')], 'sky_kwargs': {'launch': {'retry_until_up': True}}, 'launched_properties': {'cloud': 'aws', 'instance_type': 'm6i.large', 'region': 'us-east-2', 'cost_per_hour': 0.096, 'docker_user': 'root'}, 'autostop_mins': -1}
     INFO | 2024-08-18 06:51:40.548233 | Sharing cluster credentials, which enables the recipient to SSH into the cluster.
     INFO | 2024-08-18 06:51:40.551277 | Saving config for aws-cpu-ssh-secret to Den
     INFO | 2024-08-18 06:51:40.728345 | Saving secrets for aws-cpu-ssh-secret to Vault
-    INFO | 2024-08-18 06:51:41.150745 | Saving config to RNS: {'name': '/jlewitt1/aws-cpu_default_env', 'resource_type': 'env', 'resource_subtype': 'Env', 'provenance': None, 'visibility': 'private', 'env_vars': {}, 'env_name': 'aws-cpu_default_env', 'compute': {}, 'reqs': ['ray==2.30.0'], 'working_dir': None}
+    INFO | 2024-08-18 06:51:41.150745 | Saving config to RNS: {'name': '/jlewitt1/aws-cpu_default_env', 'resource_type': 'env', 'resource_subtype': 'Env', 'visibility': 'private', 'env_vars': {}, 'env_name': 'aws-cpu_default_env', 'compute': {}, 'reqs': ['ray==2.30.0'], 'working_dir': None}
     INFO | 2024-08-18 06:51:42.006030 | Saving config for aws-cpu-ssh-secret to Den
     INFO | 2024-08-18 06:51:42.504070 | Saving secrets for aws-cpu-ssh-secret to Vault
-    INFO | 2024-08-18 06:51:42.728653 | Saving config to RNS: {'name': '/jlewitt1/aws-cpu_default_env', 'resource_type': 'env', 'resource_subtype': 'Env', 'provenance': None, 'visibility': 'private', 'env_vars': {}, 'env_name': 'aws-cpu_default_env', 'compute': {}, 'reqs': ['ray==2.30.0'], 'working_dir': None}
-    INFO | 2024-08-18 06:51:42.906615 | Saving config to RNS: {'name': '/jlewitt1/aws-cpu', 'resource_type': 'cluster', 'resource_subtype': 'OnDemandCluster', 'provenance': None, 'visibility': 'private', 'ips': ['3.14.144.103'], 'server_port': 32300, 'server_connection_type': 'ssh', 'den_auth': False, 'ssh_port': 22, 'client_port': 32300, 'creds': '/jlewitt1/aws-cpu-ssh-secret', 'api_server_url': 'https://api.run.house', 'default_env': '/jlewitt1/aws-cpu_default_env', 'instance_type': 'CPU:2+', 'provider': 'aws', 'open_ports': [], 'use_spot': False, 'image_id': 'docker:nvcr.io/nvidia/pytorch:23.10-py3', 'region': 'us-east-2', 'stable_internal_external_ips': [('172.31.5.134', '3.14.144.103')], 'sky_kwargs': {'launch': {'retry_until_up': True}}, 'launched_properties': {'cloud': 'aws', 'instance_type': 'm6i.large', 'region': 'us-east-2', 'cost_per_hour': 0.096, 'docker_user': 'root'}, 'autostop_mins': -1}
+    INFO | 2024-08-18 06:51:42.728653 | Saving config to RNS: {'name': '/jlewitt1/aws-cpu_default_env', 'resource_type': 'env', 'resource_subtype': 'Env', 'visibility': 'private', 'env_vars': {}, 'env_name': 'aws-cpu_default_env', 'compute': {}, 'reqs': ['ray==2.30.0'], 'working_dir': None}
+    INFO | 2024-08-18 06:51:42.906615 | Saving config to RNS: {'name': '/jlewitt1/aws-cpu', 'resource_type': 'cluster', 'resource_subtype': 'OnDemandCluster', 'visibility': 'private', 'ips': ['3.14.144.103'], 'server_port': 32300, 'server_connection_type': 'ssh', 'den_auth': False, 'ssh_port': 22, 'client_port': 32300, 'creds': '/jlewitt1/aws-cpu-ssh-secret', 'api_server_url': 'https://api.run.house', 'default_env': '/jlewitt1/aws-cpu_default_env', 'instance_type': 'CPU:2+', 'provider': 'aws', 'open_ports': [], 'use_spot': False, 'image_id': 'docker:nvcr.io/nvidia/pytorch:23.10-py3', 'region': 'us-east-2', 'stable_internal_external_ips': [('172.31.5.134', '3.14.144.103')], 'sky_kwargs': {'launch': {'retry_until_up': True}}, 'launched_properties': {'cloud': 'aws', 'instance_type': 'm6i.large', 'region': 'us-east-2', 'cost_per_hour': 0.096, 'docker_user': 'root'}, 'autostop_mins': -1}
 
 
 

diff --git a/requirements.txt b/requirements.txt
@@ -4,7 +4,6 @@ pexpect
 opentelemetry-api
 opentelemetry-sdk
 opentelemetry-exporter-otlp
-opentelemetry-instrumentation-logging
 pyopenssl>=23.3.0
 ray[default] >= 2.9.0
 rich
@@ -16,3 +15,4 @@ wheel
 apispec
 httpx
 pydantic >=2.5.0
+pynvml
diff --git a/runhouse/__init__.py b/runhouse/__init__.py
@@ -1,5 +1,4 @@
 from runhouse.resources.asgi import Asgi, asgi
-from runhouse.resources.blobs import blob, Blob, file, File
 from runhouse.resources.envs import conda_env, CondaEnv, env, Env
 from runhouse.resources.folders import Folder, folder, GCSFolder, S3Folder
 from runhouse.resources.functions.aws_lambda import LambdaFunction
@@ -12,8 +11,6 @@
     kubernetes_cluster,
     ondemand_cluster,
     OnDemandCluster,
-    sagemaker_cluster,
-    SageMakerCluster,
 )
 
 # WARNING: Any built-in module that is imported here must be capitalized followed by all lowercase, or we will
@@ -26,7 +23,6 @@
     package,
     Package,
 )
-from runhouse.resources.provenance import capture_stdout, Run, run, RunStatus, RunType
 from runhouse.resources.resource import Resource
 from runhouse.resources.secrets import provider_secret, ProviderSecret, Secret, secret
 
@@ -63,4 +59,4 @@ def __getattr__(name):
     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
 
 
-__version__ = "0.0.33"
+__version__ = "0.0.34"
diff --git a/runhouse/constants.py b/runhouse/constants.py
@@ -10,6 +10,7 @@
 LOCALHOST: str = "127.0.0.1"
 LOCAL_HOSTS: List[str] = ["localhost", LOCALHOST]
 TUNNEL_TIMEOUT = 5
+NUM_PORTS_TO_TRY = 10
 
 LOGS_DIR = ".rh/logs"
 RH_LOGFILE_PATH = Path.home() / LOGS_DIR
@@ -73,11 +74,23 @@
 # Constants for the status check
 DOUBLE_SPACE_UNICODE = "\u00A0\u00A0"
 BULLET_UNICODE = "\u2022"
+SECOND = 1
 MINUTE = 60
 HOUR = 3600
 DEFAULT_STATUS_CHECK_INTERVAL = 1 * MINUTE
 INCREASED_STATUS_CHECK_INTERVAL = 1 * HOUR
-STATUS_CHECK_DELAY = 1 * MINUTE
+GPU_COLLECTION_INTERVAL = 5 * SECOND
+
+# We collect gpu stats every GPU_COLLECTION_INTERVAL seconds,
+# meaning that in one minute we collect (MINUTE / GPU_COLLECTION_INTERVAL) gpu stats.
+# Currently, we keep the gpu info of the last 10 minutes at most.
+MAX_GPU_INFO_LEN = (MINUTE / GPU_COLLECTION_INTERVAL) * 10
+
+# If we only collect the gpu stats (and do not send them to den), the gpu_info dictionary *will not* be reset by the servlets.
+# Therefore, we need to trim gpu_info so it doesn't consume too much cluster memory.
+# Currently, we cut it in half, meaning we only keep the most recent (MAX_GPU_INFO_LEN / 2) entries (about the last 5 minutes).
+REDUCED_GPU_INFO_LEN = MAX_GPU_INFO_LEN / 2
+
 
 # Constants Surfacing Logs to Den
 DEFAULT_LOG_SURFACING_INTERVAL = 2 * MINUTE
@@ -87,8 +100,6 @@
 INCREASED_INTERVAL = 1 * HOUR
 
 # Telemetry constants
-OTEL_VERSION = "0.108.0"
-
 TELEMETRY_AGENT_HTTP_PORT = 4318
 TELEMETRY_AGENT_GRPC_PORT = 4317
 TELEMETRY_AGENT_HEALTH_CHECK_PORT = 13133