Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions configs/ipython_startup/00-notebookutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@
create_tenant_and_assign_users,
get_group_sql_warehouse,
get_minio_credentials,
get_polaris_credentials,
get_my_accessible_paths,
get_my_groups,
get_my_policies,
Expand Down
51 changes: 51 additions & 0 deletions configs/ipython_startup/01-credentials.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""
Initialize MinIO and Polaris credentials.

This runs after 00-notebookutils.py loads all the imports, so get_minio_credentials
is already available in the global namespace.
"""

# Setup logging
import logging

logger = logging.getLogger("berdl.startup")
logger.setLevel(logging.INFO)
if not logger.handlers:
handler = logging.StreamHandler()
formatter = logging.Formatter("%(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)

# --- MinIO Credentials ---
try:
# Set MinIO credentials to environment - also creates user if they don't exist
credentials = get_minio_credentials() # noqa: F821
logger.info(f"✅ MinIO credentials set for user: {credentials.username}")

except Exception as e:
import warnings

warnings.warn(f"Failed to set MinIO credentials: {str(e)}", UserWarning)
logger.error(f"❌ Failed to set MinIO credentials: {str(e)}")
credentials = None

# --- Polaris Credentials ---
try:
polaris_creds = get_polaris_credentials() # noqa: F821
if polaris_creds:
logger.info(f"✅ Polaris credentials set for catalog: {polaris_creds['personal_catalog']}")
if polaris_creds["tenant_catalogs"]:
logger.info(f" Tenant catalogs: {', '.join(polaris_creds['tenant_catalogs'])}")
# Clear the settings cache so downstream code (e.g., Spark Connect server startup)
# picks up the POLARIS_CREDENTIAL, POLARIS_PERSONAL_CATALOG, and
# POLARIS_TENANT_CATALOGS env vars that get_polaris_credentials() just set.
get_settings.cache_clear() # noqa: F821
else:
logger.info("ℹ️ Polaris not configured, skipping Polaris credential setup")

except Exception as e:
import warnings

warnings.warn(f"Failed to set Polaris credentials: {str(e)}", UserWarning)
logger.warning(f"⚠️ Failed to set Polaris credentials: {str(e)}")
polaris_creds = None
29 changes: 0 additions & 29 deletions configs/ipython_startup/01-minio-credentials.py

This file was deleted.

36 changes: 34 additions & 2 deletions configs/jupyter_server_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
# Note: __file__ may not be defined when exec'd by traitlets config loader
sys.path.insert(0, "/etc/jupyter")

from berdl_notebook_utils.berdl_settings import get_settings
from hybridcontents import HybridContentsManager
from jupyter_server.services.contents.largefilemanager import LargeFileManager
from grouped_s3_contents import GroupedS3ContentsManager
Expand Down Expand Up @@ -134,6 +135,27 @@ def get_user_governance_paths():
return sources


def provision_polaris():
    """Provision per-user Polaris credentials and export them as env vars.

    Runs once while the Jupyter Server is starting, before any kernel exists,
    so the POLARIS_* environment variables are in place for everything that
    follows. Later invocations from the IPython startup scripts are served
    from the file cache and return immediately.

    Any failure is logged and swallowed: a missing or unreachable Polaris
    must never prevent the Jupyter Server from coming up.
    """
    try:
        from berdl_notebook_utils.minio_governance import get_polaris_credentials

        creds = get_polaris_credentials()
        if not creds:
            logger.info("Polaris not configured, skipping Polaris credential provisioning")
            return
        logger.info(f"Polaris credentials provisioned for catalog: {creds['personal_catalog']}")
        tenant_catalogs = creds["tenant_catalogs"]
        if tenant_catalogs:
            logger.info(f"   Tenant catalogs: {', '.join(tenant_catalogs)}")
    except Exception as e:
        logger.error(f"Failed to provision Polaris credentials: {e}")


def start_spark_connect():
"""Start Spark Connect server at Jupyter Server startup.

Expand Down Expand Up @@ -165,10 +187,20 @@ def _start():
endpoint_url, access_key, secret_key, use_ssl = get_minio_config()
governance_paths = get_user_governance_paths()

# 3. Start Spark Connect server in background (non-blocking)
# 3. Provision Polaris credentials — MUST be before Spark Connect so that
# POLARIS_CREDENTIAL env vars are set when generating spark-defaults.conf
provision_polaris()

# Clear the settings cache so start_spark_connect picks up the new
# POLARIS_CREDENTIAL/POLARIS_PERSONAL_CATALOG/POLARIS_TENANT_CATALOGS env vars
# that provision_polaris() just set. Without this, the lru_cache returns the
# stale settings object captured before Polaris provisioning ran.
get_settings.cache_clear()

# 4. Start Spark Connect server in background (non-blocking)
start_spark_connect()

# 4. Configure HybridContentsManager
# 5. Configure HybridContentsManager
# - Root ("") -> Local filesystem
# - "datalake_minio" -> GroupedS3ContentsManager with all S3 paths as subdirectories
c.HybridContentsManager.manager_classes = {
Expand Down
14 changes: 9 additions & 5 deletions configs/spark-defaults.conf.template
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,14 @@
# ==============================================================================

# ------------------------------------------------------------------------------
# Delta Lake Configuration (STATIC - Server-Side Only)
# SQL Extensions and Catalog Configuration (STATIC - Server-Side Only)
# ------------------------------------------------------------------------------
# These SQL extensions must be loaded when the Spark server starts.
# They initialize Delta Lake support by registering custom SparkSessionExtensions
# and catalog implementations that handle Delta table operations.
# Delta Lake + Iceberg + Sedona run side-by-side.
# The default catalog remains spark_catalog (Delta/Hive) for backward compatibility.
# Iceberg catalogs (my, tenant aliases) are added dynamically by connect_server.py.

spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension,org.apache.sedona.sql.SedonaSqlExtensions
spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension,org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.apache.sedona.sql.SedonaSqlExtensions
spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog

# Delta Lake settings
Expand Down Expand Up @@ -90,6 +91,9 @@ spark.hadoop.fs.s3a.path.style.access=true
spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem
spark.hadoop.fs.s3a.connection.ssl.enabled=false

# Polaris Iceberg catalog configuration will be appended dynamically by connect_server.py
# based on POLARIS_* environment variables (personal catalog "my" + tenant catalogs)

# ------------------------------------------------------------------------------
# KBase Authentication Interceptor (STATIC - Server-Side Only)
# ------------------------------------------------------------------------------
Expand All @@ -116,7 +120,7 @@ spark.hadoop.fs.s3a.connection.ssl.enabled=false
# Environment variables used by the namespace interceptor:
# - BERDL_ALLOWED_NAMESPACE_PREFIXES: Comma-separated allowed prefixes
# (e.g., "u_tgu2__,kbase_,research_"). Set dynamically by connect_server.py.
spark.connect.grpc.interceptor.classes=us.kbase.spark.KBaseAuthServerInterceptor,us.kbase.spark.NamespaceValidationInterceptor
spark.connect.grpc.interceptor.classes=us.kbase.spark.KBaseAuthServerInterceptor

# ------------------------------------------------------------------------------
# Session Timeout (STATIC - Server-Side Only)
Expand Down
94 changes: 90 additions & 4 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,9 @@
# - MinIO: minio/minio123
# - PostgreSQL: hive/hivepassword
services:
spark-notebook:
# Service names use the pattern: spark-notebook-{CI_KBASE_USERNAME}
# Update these keys if you change the usernames in .env
spark-notebook-tgu2:
# image: ghcr.io/berdatalakehouse/spark_notebook:main
# platform: linux/amd64
Comment on lines +71 to 75
Copy link

Copilot AI Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The notebook service key was renamed to spark-notebook-tgu2, but this compose file still contains other references to the old hostname "spark-notebook" (e.g., SERVICE_TEMPLATE and SPARK_CONNECT_URL_TEMPLATE). As-is, those services will likely fail to connect/resolve the notebook container; please update the remaining references to match the new naming pattern (spark-notebook-${CI_KBASE_USERNAME}).

Copilot uses AI. Check for mistakes.
build:
Expand All @@ -85,7 +87,7 @@ services:
- CDM_TASK_SERVICE_URL=http://localhost:8080
- SPARK_CLUSTER_MANAGER_API_URL=http://localhost:8000
- SPARK_MASTER_URL=spark://spark-master:7077
- BERDL_POD_IP=spark-notebook
- BERDL_POD_IP=spark-notebook-${CI_KBASE_USERNAME}
- BERDL_HIVE_METASTORE_URI=thrift://hive-metastore:9083

# MINIO CONFIGURATION
Expand All @@ -98,6 +100,9 @@ services:
# DATALAKE MCP SERVER CONFIGURATION
- DATALAKE_MCP_SERVER_URL=http://datalake-mcp-server:8000/apis/mcp

# POLARIS CONFIGURATION (per-user credentials provisioned dynamically by 01-credentials.py)
- POLARIS_CATALOG_URI=http://polaris:8181/api/catalog

# TRINO CONFIGURATION
- TRINO_HOST=trino
- TRINO_PORT=8080
Expand Down Expand Up @@ -148,6 +153,9 @@ services:
- KBASE_ADMIN_ROLES=CDM_JUPYTERHUB_ADMIN
- KBASE_APPROVED_ROLES=BERDL_USER
- REDIS_URL=redis://redis:6379
# Polaris admin credentials (only the governance service needs root access)
- POLARIS_CATALOG_URI=http://polaris:8181/api/catalog
- POLARIS_CREDENTIAL=root:s3cr3t
# Credential store (PostgreSQL)
- MMS_DB_HOST=postgres
- MMS_DB_PORT=5432
Expand Down Expand Up @@ -182,8 +190,16 @@ services:
- KBASE_AUTH_URL=https://ci.kbase.us/services/auth/
- KBASE_REQUIRED_ROLES=BERDL_USER
- MFA_EXEMPT_USERS=${CI_KBASE_USERNAME}
# POLARIS CONFIGURATION (per-user credentials provisioned dynamically)
- POLARIS_CATALOG_URI=http://polaris:8181/api/catalog
- BERDL_REDIS_HOST=redis
- BERDL_REDIS_PORT=6379
volumes:
# Mount the shared /home directory to access all users' credentials
# This allows the MCP server to dynamically read any user's credentials
# from /home/{username}/.berdl_minio_credentials
# In K8s: mount the parent directory or use a shared volume
- users_home:/home:ro
Comment on lines +197 to +202
Copy link

Copilot AI Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment says the MCP server reads /home/{username}/.berdl_minio_credentials, but this PR introduces .berdl_polaris_credentials and doesn’t create a .berdl_minio_credentials file. Please update the comment. Also, mounting the entire /home volume into the MCP server can expose cached secrets (e.g., Polaris client_secret) across users; prefer fetching per-request from the governance API or mounting only the minimal required path/file.

Suggested change
volumes:
# Mount the shared /home directory to access all users' credentials
# This allows the MCP server to dynamically read any user's credentials
# from /home/{username}/.berdl_minio_credentials
# In K8s: mount the parent directory or use a shared volume
- users_home:/home:ro

Copilot uses AI. Check for mistakes.
depends_on:
- hive-metastore
- minio
Expand Down Expand Up @@ -298,7 +314,13 @@ services:
volumes:
- postgres_data:/var/lib/postgresql/data
- ./scripts/init-postgres-readonly.sh:/docker-entrypoint-initdb.d/01-init-postgres-readonly.sh:ro
- ./scripts/init-mms-db.sh:/docker-entrypoint-initdb.d/02-init-mms-db.sh:ro
- ./scripts/init-polaris-db.sh:/docker-entrypoint-initdb.d/02-init-polaris-db.sh:ro
- ./scripts/init-mms-db.sh:/docker-entrypoint-initdb.d/03-init-mms-db.sh:ro
healthcheck:
test: ["CMD-SHELL", "pg_isready -U hive"]
interval: 5s
timeout: 2s
retries: 15

hive-metastore:
# image: ghcr.io/berdatalakehouse/hive_metastore:main
Expand Down Expand Up @@ -369,9 +391,73 @@ services:
echo 'MinIO bucket creation complete.';
"

polaris-bootstrap:
image: apache/polaris-admin-tool:latest
environment:
- POLARIS_PERSISTENCE_TYPE=relational-jdbc
- QUARKUS_DATASOURCE_DB_KIND=postgresql
- QUARKUS_DATASOURCE_JDBC_URL=jdbc:postgresql://postgres:5432/polaris
- QUARKUS_DATASOURCE_USERNAME=hive
- QUARKUS_DATASOURCE_PASSWORD=hivepassword
# Bootstrap exits 3 if already bootstrapped (expected with persistent storage).
# Treat exit 3 as success so docker compose doesn't fail on subsequent runs.
entrypoint: ["sh", "-c"]
command:
- |
java -jar /deployments/polaris-admin-tool.jar bootstrap --realm=POLARIS --credential=POLARIS,root,s3cr3t
rc=$$?
if [ $$rc -eq 3 ]; then
echo "Already bootstrapped — skipping (OK)"
exit 0
fi
exit $$rc
depends_on:
postgres:
condition: service_healthy

polaris:
image: apache/polaris:latest
ports:
Comment on lines +419 to +420
- "8181:8181"
environment:
# Persistence — PostgreSQL instead of in-memory
- POLARIS_PERSISTENCE_TYPE=relational-jdbc
- QUARKUS_DATASOURCE_DB_KIND=postgresql
- QUARKUS_DATASOURCE_JDBC_URL=jdbc:postgresql://postgres:5432/polaris
- QUARKUS_DATASOURCE_USERNAME=hive
- QUARKUS_DATASOURCE_PASSWORD=hivepassword
# Realm configuration
- POLARIS_REALM_NAME=default-realm
# MinIO credentials for Polaris's own S3 access (metadata files).
# Polaris reads endpointInternal + pathStyleAccess from each catalog's storageConfigInfo.
# STS is disabled per-catalog via stsUnavailable:true (not the global SKIP_CREDENTIAL flag).
- AWS_REGION=us-east-1
- AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=minio123
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8182/q/health"]
interval: 10s
timeout: 5s
retries: 5
depends_on:
polaris-bootstrap:
condition: service_completed_successfully

polaris-ui:
image: ghcr.io/binarycat0/apache-polaris-ui:latest
ports:
- "3000:3000"
environment:
# Server-side env vars used by the Next.js API routes for proxying auth
- POLARIS_MANAGEMENT_API_URL=http://polaris:8181/api/management/v1
- POLARIS_CATALOG_API_URL=http://polaris:8181/api/catalog/v1
depends_on:
polaris:
condition: service_started

volumes:
postgres_data:
minio_data:
redis_data:
global_share:
users_home: # Shared volume for all user home directories
users_home: # Shared volume for all user home directories
8 changes: 5 additions & 3 deletions docs/data_sharing_guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,13 @@ All BERDL JupyterHub notebooks automatically import these data governance functi

**Pre-Initialized Client:**
- `governance` - Pre-initialized `DataGovernanceClient()` instance for advanced operations

**Other Auto-Imported Functions:**
- `get_spark_session()` - Create Spark sessions with Delta Lake support
- `get_spark_session()` - Create Spark sessions with Iceberg + Delta Lake support
- `create_namespace_if_not_exists()` - Create namespaces (use `iceberg=True` for Iceberg catalogs)
- Plus many other utility functions for data operations

> **Note:** With the migration to Iceberg, **tenant catalogs** are the recommended way to share data. Create tables in a tenant catalog (e.g., `kbase`) and all members can access them. See the [Iceberg Migration Guide](iceberg_migration_guide.md) for details.

### Quick Start

```python
Expand Down Expand Up @@ -258,7 +260,7 @@ if response.errors:

## Public and Private Table Access (DEPRECATED)

> **⚠️ DEPRECATION WARNING**: Direct public path sharing functions (`make_table_public`, `make_table_private`) are deprecated. Please create a namespace under the `globalusers` tenant for public sharing activities instead.
> **⚠️ DEPRECATION WARNING**: Direct public path sharing functions (`make_table_public`, `make_table_private`) are deprecated. Please create a namespace under the `kbase` tenant for public sharing activities instead.

### Make Tables Publicly Accessible

Expand Down
Loading
Loading