Added self-hosted bcm install #1456

Open
wants to merge 18 commits into base `v2.19`
29 changes: 29 additions & 0 deletions docs/admin/runai-setup/self-hosted/bcm/files/metallb.txt
@@ -0,0 +1,29 @@

---
apiVersion: metallb.io/v1beta1
kind: L2Advertisement
metadata:
  name: l2-ingress
  namespace: metallb-system
spec:
  ipAddressPools:
    - ingress-pool
  nodeSelectors:
    - matchLabels:
        node-role.kubernetes.io/runai-system: "true"

---
apiVersion: metallb.io/v1beta1
kind: IPAddressPool
metadata:
  name: ingress-pool
  namespace: metallb-system
spec:
  addresses:
    - 192.168.0.250-192.168.0.251 # example range of two IP addresses
  autoAssign: false
  serviceAllocation:
    priority: 50
    namespaces:
      - ingress-nginx
      - knative-serving
24 changes: 24 additions & 0 deletions docs/admin/runai-setup/self-hosted/bcm/files/networkoperator.txt
@@ -0,0 +1,24 @@

deployCR: true
nfd:
  enabled: true
ofedDriver:
  deploy: false
psp:
  enabled: false
rdmaSharedDevicePlugin:
  deploy: false
secondaryNetwork:
  cniPlugins:
    deploy: true
  deploy: true
  ipamPlugin:
    deploy: false
  multus:
    deploy: true
  nvIpam:
    deploy: true
  sriovDevicePlugin:
    deploy: false
sriovNetworkOperator:
  enabled: true
80 changes: 80 additions & 0 deletions docs/admin/runai-setup/self-hosted/bcm/install-cluster.md
@@ -0,0 +1,80 @@
# Install the Cluster


## System and Network Requirements
Before installing the NVIDIA Run:ai cluster, validate that the [system requirements](./system-requirements.md) and [network requirements](./network-requirements.md) are met. Make sure you have the [software artifacts](./preparations.md) prepared.

Once all the requirements are met, it is highly recommended to use the NVIDIA Run:ai cluster preinstall diagnostics tool to:

* Test the requirements above, as well as common failure points related to Kubernetes, NVIDIA, storage, and networking
* Inspect additional installed components and analyze their relevance to a successful installation

For more information, see [preinstall diagnostics](https://github.com/run-ai/preinstall-diagnostics). To run the preinstall diagnostics tool, [download](https://runai.jfrog.io/ui/native/pd-cli-prod/preinstall-diagnostics-cli/) the latest version and run the following. The `--image-pull-secret` and `--image` flags are needed only if the diagnostics image is hosted in a private registry:

```bash
chmod +x ./preinstall-diagnostics-<platform> && \
./preinstall-diagnostics-<platform> \
  --domain ${CONTROL_PLANE_FQDN} \
  --cluster-domain ${CLUSTER_FQDN} \
  --image-pull-secret ${IMAGE_PULL_SECRET_NAME} \
  --image ${PRIVATE_REGISTRY_IMAGE_URL}
```

## Helm

NVIDIA Run:ai requires [Helm](https://helm.sh/) 3.14 or later. To install Helm, see [Installing Helm](https://helm.sh/docs/intro/install/).
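To confirm the installed Helm client meets this requirement, you can check its version:

```bash
# Prints the Helm client version, e.g. v3.14.x
helm version --short
```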

## Permissions

A Kubernetes user with the `cluster-admin` role is required to ensure a successful installation. For more information, see [Using RBAC authorization](https://kubernetes.io/docs/reference/access-authn-authz/rbac/).
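As a quick sanity check, you can verify that the current user has unrestricted access (a sketch using `kubectl auth can-i`; expected output is `yes`):

```bash
# Checks whether the current user can perform any verb on any resource cluster-wide
kubectl auth can-i '*' '*' --all-namespaces
```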

## Installation

Follow the steps below to add a new cluster.

!!! Note
    When adding a cluster for the first time, the New Cluster form automatically opens when you log in to the NVIDIA Run:ai platform. Other actions are prevented until the cluster is created.

If this is your first cluster and you have completed the New Cluster form, start at step 3. Otherwise, start at step 1.

1. In the NVIDIA Run:ai platform, go to **Resources**
2. Click **+NEW CLUSTER**
3. Enter a unique name for your cluster
4. Choose the NVIDIA Run:ai cluster version (latest by default)
5. Select **Same as control plane**
6. Click **Continue**

**Installing NVIDIA Run:ai Cluster**

The following steps install the NVIDIA Run:ai cluster.

1. Follow the installation instructions and run the commands provided on your Kubernetes cluster
2. Append `--set global.customCA.enabled=true` to the Helm installation command, as illustrated in the sketch below
3. Click **DONE**
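For illustration only, the resulting command typically resembles the following sketch. Copy the real command from the platform UI; the chart reference, version, and generated `--set` values here are placeholders:

```bash
# Illustrative sketch — the actual command is provided by the platform UI.
helm upgrade -i runai-cluster <REPO/CHART> -n runai --create-namespace \
  --version "<VERSION>" \
  <GENERATED --set FLAGS FROM THE UI> \
  --set global.customCA.enabled=true
```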

The cluster is displayed in the table with the status **Waiting to connect**. Once installation is complete, the cluster status changes to **Connected**.

!!! Tip
    Use the `--dry-run` flag to gain an understanding of what is being installed before the actual installation. For more details, see [Understanding cluster access roles](https://docs.run.ai/v2.19/admin/config/access-roles/).


!!! Note
    To customize the installation based on your environment, see [Customize cluster installation](../../cluster-setup/customize-cluster-install.md).

## Troubleshooting

If you encounter an issue with the installation, try the troubleshooting scenarios below.

### Installation

If the NVIDIA Run:ai cluster installation failed, check the installation logs to identify the issue. Run the following to fetch and execute the log-collection script:

```bash
curl -fsSL https://raw.githubusercontent.com/run-ai/public/main/installation/get-installation-logs.sh | bash
```

### Cluster Status

If the NVIDIA Run:ai cluster installation completed but the cluster status did not change to **Connected**, check the cluster [troubleshooting scenarios](../../troubleshooting/troubleshooting.md#cluster-health).

43 changes: 43 additions & 0 deletions docs/admin/runai-setup/self-hosted/bcm/install-control-plane.md
@@ -0,0 +1,43 @@
# Install the Control Plane

Installing the NVIDIA Run:ai control plane requires Internet connectivity.


## System and Network Requirements
Before installing the NVIDIA Run:ai control plane, validate that the [system requirements](./system-requirements.md) and [network requirements](./network-requirements.md) are met. Make sure you have the [software artifacts](./preparations.md) prepared.

## Permissions

As part of the installation, you will be required to install the NVIDIA Run:ai control plane [Helm chart](https://helm.sh/). Installing the Helm charts requires Kubernetes administrator permissions. You can review the exact objects created by the charts using the `--dry-run` flag on both Helm charts.
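For example, the control plane chart can be rendered without installing anything (a sketch reusing the chart reference from the installation step below):

```bash
# Renders the manifests so you can review the objects before installing
helm upgrade -i runai-backend -n runai-backend runai-backend/control-plane --dry-run
```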

## Installation

Run the following command. Replace `<DOMAIN>` with the Fully Qualified Domain Name (FQDN) obtained in the [system requirements](./system-requirements.md#fully-qualified-domain-name-fqdn).

```bash
helm upgrade -i runai-backend -n runai-backend runai-backend/control-plane \
  --version "<VERSION>" \
  --set global.customCA.enabled=true \
  --set global.domain=<DOMAIN>

# Expected output:
Release "runai-backend" does not exist. Installing it now.
NAME: runai-backend
LAST DEPLOYED: Mon Dec 30 17:30:19 2024
NAMESPACE: runai-backend
STATUS: deployed
REVISION: 1
```

!!! Note
    To install a specific version, add `--version <VERSION>` to the install command. You can find available versions by running `helm search repo -l runai-backend`.

## Connect to NVIDIA Run:ai User Interface

1. Open your browser and go to: `https://<DOMAIN>`.
2. Log in using the default credentials:

* User: `test@run.ai`
* Password: `Abcd!234`

You will be prompted to change the password.

64 changes: 64 additions & 0 deletions docs/admin/runai-setup/self-hosted/bcm/network-requirements.md
@@ -0,0 +1,64 @@
# Network requirements

The following network requirements apply to the installation and usage of the NVIDIA Run:ai components.

## Installation

### Inbound rules

| Name | Description | Source | Destination | Port |
| --------------------------- | ---------------- | ------- | -------------------------- | ---- |
| Installation via BCM | SSH Access | Installer Machine | NVIDIA Base Command Manager headnodes | 22 |

### Outbound rules
| Name | Description | Source | Destination | Port |
| --------------------------- | ---------------- | ------- | -------------------------- | ---- |
| Container Registry | Pull NVIDIA Run:ai images | All Kubernetes nodes | runai.jfrog.io | 443 |
| Helm repository | NVIDIA Run:ai Helm repository for installation | Installer machine | runai.jfrog.io | 443 |

The NVIDIA Run:ai installation has [software requirements](system-requirements.md) that require additional components to be installed on the cluster. This article includes optional, simple installation examples, which require the following cluster outbound ports to be open (a quick connectivity check is sketched after the table):

| Name | Description | Source | Destination | Port |
| -------------------------- | ------------------------------------------ | -------------------- | --------------- | ---- |
| Kubernetes Registry | Ingress Nginx image repository | All Kubernetes nodes | registry.k8s.io | 443 |
| Google Container Registry | GPU Operator and Knative image repository | All Kubernetes nodes | gcr.io | 443 |
| Red Hat Container Registry | Prometheus Operator image repository | All Kubernetes nodes | quay.io | 443 |
| Docker Hub Registry | Training Operator image repository | All Kubernetes nodes | docker.io | 443 |
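A minimal sketch for verifying that these endpoints are reachable from a cluster node (assumes `curl` is available on the node):

```bash
# Probes each registry over HTTPS; any TLS/HTTP response means port 443 is open.
for host in runai.jfrog.io registry.k8s.io gcr.io quay.io docker.io; do
  curl -sS -o /dev/null --connect-timeout 5 "https://${host}" \
    && echo "${host}: reachable" || echo "${host}: NOT reachable"
done
```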



## External access

Set out below are the domains to whitelist and ports to open for installation, upgrade, and usage of the application and its management.


!!! Note
    Ensure the inbound and outbound rules are correctly applied to your firewall.

### Inbound rules

To allow your organization’s NVIDIA Run:ai users to interact with the cluster using the [NVIDIA Run:ai command-line interface](../../reference/cli/runai/) or access specific UI features, certain inbound ports need to be open:

| Name | Description | Source | Destination | Port |
| --------------------------- | ---------------- | --------------------------------------------------------------------- | -------------------------- | ---- |
| NVIDIA Run:ai control plane | HTTPS entrypoint | 0.0.0.0 | NVIDIA Run:ai system nodes | 443 |
| NVIDIA Run:ai cluster | HTTPS entrypoint | RFC1918 private IP ranges (10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16) | NVIDIA Run:ai system nodes | 443 |


### Outbound rules

!!! Note
    Outbound rules apply to the NVIDIA Run:ai cluster component only. If the NVIDIA Run:ai cluster is installed together with the NVIDIA Run:ai control plane, the NVIDIA Run:ai cluster FQDN refers to the NVIDIA Run:ai control plane FQDN.

For the NVIDIA Run:ai cluster installation and usage, certain **outbound** ports must be open:

| Name | Description | Source | Destination | Port |
| ------------------ | -------------------------------------------------------------------------------- | -------------------------- | -------------------------------- | ---- |
| Cluster sync | Sync NVIDIA Run:ai cluster with NVIDIA Run:ai control plane | NVIDIA Run:ai system nodes | NVIDIA Run:ai control plane FQDN | 443 |
| Metric store | Push NVIDIA Run:ai cluster metrics to NVIDIA Run:ai control plane's metric store | NVIDIA Run:ai system nodes | NVIDIA Run:ai control plane FQDN | 443 |

## Internal network

Ensure that all Kubernetes nodes can communicate with each other across all necessary ports. Kubernetes assumes full interconnectivity between nodes, so you must configure your network to allow this seamless communication. Specific port requirements may vary depending on your network setup.
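As a quick sanity check for a specific port between two nodes (a sketch assuming `nc` is installed; flag syntax varies slightly between `nc` variants), run a listener on one node and probe it from another:

```bash
# On node A: listen on an arbitrary test port
nc -l 30000

# On node B: probe node A (replace <NODE_A_IP>); -z scans without sending data
nc -zv <NODE_A_IP> 30000
```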
11 changes: 11 additions & 0 deletions docs/admin/runai-setup/self-hosted/bcm/next-steps.md
@@ -0,0 +1,11 @@
# Next Steps

## Restrict System Node Scheduling (Post-Installation)

After installation, you can configure NVIDIA Run:ai to enforce stricter scheduling rules that ensure system components and workloads are assigned to the correct nodes. The following flags are set using the `runaiconfig` custom resource; a combined patch is sketched after the list. See [Advanced Cluster Configurations](../../../config/advanced-cluster-config.md) for more details.

1. Set `global.nodeAffinity.restrictRunaiSystem=true`. This ensures that NVIDIA Run:ai system components are scheduled only on nodes labeled as system nodes.

2. Set `global.nodeAffinity.restrictScheduling=true`. This prevents pure CPU workloads from being scheduled on GPU nodes.
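A minimal sketch, assuming the `runaiconfig` resource is named `runai` and lives in the `runai` namespace (verify both in your environment):

```bash
# Sets both node-affinity flags in a single merge patch
kubectl patch runaiconfig runai -n runai --type merge -p \
  '{"spec": {"global": {"nodeAffinity": {"restrictRunaiSystem": true, "restrictScheduling": true}}}}'
```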


13 changes: 13 additions & 0 deletions docs/admin/runai-setup/self-hosted/bcm/preparations.md
@@ -0,0 +1,13 @@
# Preparations

You should receive a token from NVIDIA Run:ai customer support. The following command uses the token to create a secret that grants access to the NVIDIA Run:ai container registry:

```bash
kubectl create secret docker-registry runai-reg-creds \
  --docker-server=https://runai.jfrog.io \
  --docker-username=self-hosted-image-puller-prod \
  --docker-password=<TOKEN> \
  --docker-email=<EMAIL> \
  --namespace=runai-backend
```
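The secret is created in the `runai-backend` namespace, which must exist first. If it does not, create it:

```bash
kubectl create namespace runai-backend
```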
