Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions configs/ipython_startup/00-notebookutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@
create_tenant_and_assign_users,
get_group_sql_warehouse,
get_minio_credentials,
get_polaris_credentials,
get_my_accessible_paths,
get_my_groups,
get_my_policies,
Expand Down
51 changes: 51 additions & 0 deletions configs/ipython_startup/01-credentials.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""
Initialize MinIO and Polaris credentials.

This runs after 00-notebookutils.py loads all the imports, so get_minio_credentials
is already available in the global namespace.
"""

# Setup logging
import logging

logger = logging.getLogger("berdl.startup")
logger.setLevel(logging.INFO)
if not logger.handlers:
handler = logging.StreamHandler()
formatter = logging.Formatter("%(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)

# --- MinIO Credentials ---
try:
# Set MinIO credentials to environment - also creates user if they don't exist
credentials = get_minio_credentials() # noqa: F821
logger.info(f"✅ MinIO credentials set for user: {credentials.username}")

except Exception as e:
import warnings

warnings.warn(f"Failed to set MinIO credentials: {str(e)}", UserWarning)
logger.error(f"❌ Failed to set MinIO credentials: {str(e)}")
credentials = None

# --- Polaris Credentials ---
try:
polaris_creds = get_polaris_credentials() # noqa: F821
if polaris_creds:
logger.info(f"✅ Polaris credentials set for catalog: {polaris_creds['personal_catalog']}")
if polaris_creds["tenant_catalogs"]:
logger.info(f" Tenant catalogs: {', '.join(polaris_creds['tenant_catalogs'])}")
# Clear the settings cache so downstream code (e.g., Spark Connect server startup)
# picks up the POLARIS_CREDENTIAL, POLARIS_PERSONAL_CATALOG, and
# POLARIS_TENANT_CATALOGS env vars that get_polaris_credentials() just set.
get_settings.cache_clear() # noqa: F821
else:
logger.info("ℹ️ Polaris not configured, skipping Polaris credential setup")

except Exception as e:
import warnings

warnings.warn(f"Failed to set Polaris credentials: {str(e)}", UserWarning)
logger.warning(f"⚠️ Failed to set Polaris credentials: {str(e)}")
polaris_creds = None
29 changes: 0 additions & 29 deletions configs/ipython_startup/01-minio-credentials.py

This file was deleted.

36 changes: 34 additions & 2 deletions configs/jupyter_server_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
# Note: __file__ may not be defined when exec'd by traitlets config loader
sys.path.insert(0, "/etc/jupyter")

from berdl_notebook_utils.berdl_settings import get_settings
from hybridcontents import HybridContentsManager
from jupyter_server.services.contents.largefilemanager import LargeFileManager
from grouped_s3_contents import GroupedS3ContentsManager
Expand Down Expand Up @@ -134,6 +135,27 @@ def get_user_governance_paths():
return sources


def provision_polaris():
    """Provision per-user Polaris credentials and export them as env vars.

    Runs once while the Jupyter Server is starting, before any kernel exists,
    so the POLARIS_* environment variables are in place for everything that
    follows. Later invocations from the IPython startup scripts are served
    from the file cache and return immediately.

    Any failure is logged and swallowed: a missing or unreachable Polaris
    must never prevent the Jupyter Server from coming up.
    """
    try:
        from berdl_notebook_utils.minio_governance import get_polaris_credentials

        creds = get_polaris_credentials()
        if not creds:
            logger.info("Polaris not configured, skipping Polaris credential provisioning")
            return
        logger.info(f"Polaris credentials provisioned for catalog: {creds['personal_catalog']}")
        tenant_catalogs = creds["tenant_catalogs"]
        if tenant_catalogs:
            logger.info(f"   Tenant catalogs: {', '.join(tenant_catalogs)}")
    except Exception as e:
        logger.error(f"Failed to provision Polaris credentials: {e}")


def start_spark_connect():
"""Start Spark Connect server at Jupyter Server startup.

Expand Down Expand Up @@ -165,10 +187,20 @@ def _start():
endpoint_url, access_key, secret_key, use_ssl = get_minio_config()
governance_paths = get_user_governance_paths()

# 3. Start Spark Connect server in background (non-blocking)
# 3. Provision Polaris credentials — MUST be before Spark Connect so that
# POLARIS_CREDENTIAL env vars are set when generating spark-defaults.conf
provision_polaris()

# Clear the settings cache so start_spark_connect picks up the new
# POLARIS_CREDENTIAL/POLARIS_PERSONAL_CATALOG/POLARIS_TENANT_CATALOGS env vars
# that provision_polaris() just set. Without this, the lru_cache returns the
# stale settings object captured before Polaris provisioning ran.
get_settings.cache_clear()

# 4. Start Spark Connect server in background (non-blocking)
start_spark_connect()

# 4. Configure HybridContentsManager
# 5. Configure HybridContentsManager
# - Root ("") -> Local filesystem
# - "datalake_minio" -> GroupedS3ContentsManager with all S3 paths as subdirectories
c.HybridContentsManager.manager_classes = {
Expand Down
14 changes: 9 additions & 5 deletions configs/spark-defaults.conf.template
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,14 @@
# ==============================================================================

# ------------------------------------------------------------------------------
# Delta Lake Configuration (STATIC - Server-Side Only)
# SQL Extensions and Catalog Configuration (STATIC - Server-Side Only)
# ------------------------------------------------------------------------------
# These SQL extensions must be loaded when the Spark server starts.
# They initialize Delta Lake support by registering custom SparkSessionExtensions
# and catalog implementations that handle Delta table operations.
# Delta Lake + Iceberg + Sedona run side-by-side.
# The default catalog remains spark_catalog (Delta/Hive) for backward compatibility.
# Iceberg catalogs (my, tenant aliases) are added dynamically by connect_server.py.

spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension,org.apache.sedona.sql.SedonaSqlExtensions
spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension,org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.apache.sedona.sql.SedonaSqlExtensions
spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog

# Delta Lake settings
Expand Down Expand Up @@ -90,6 +91,9 @@ spark.hadoop.fs.s3a.path.style.access=true
spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem
spark.hadoop.fs.s3a.connection.ssl.enabled=false

# Polaris Iceberg catalog configuration will be appended dynamically by connect_server.py
# based on POLARIS_* environment variables (personal catalog "my" + tenant catalogs)

# ------------------------------------------------------------------------------
# KBase Authentication Interceptor (STATIC - Server-Side Only)
# ------------------------------------------------------------------------------
Expand All @@ -116,7 +120,7 @@ spark.hadoop.fs.s3a.connection.ssl.enabled=false
# Environment variables used by the namespace interceptor:
# - BERDL_ALLOWED_NAMESPACE_PREFIXES: Comma-separated allowed prefixes
# (e.g., "u_tgu2__,kbase_,research_"). Set dynamically by connect_server.py.
spark.connect.grpc.interceptor.classes=us.kbase.spark.KBaseAuthServerInterceptor,us.kbase.spark.NamespaceValidationInterceptor
spark.connect.grpc.interceptor.classes=us.kbase.spark.KBaseAuthServerInterceptor

# ------------------------------------------------------------------------------
# Session Timeout (STATIC - Server-Side Only)
Expand Down
94 changes: 90 additions & 4 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,9 @@
# - MinIO: minio/minio123
# - PostgreSQL: hive/hivepassword
services:
spark-notebook:
# Service names use the pattern: spark-notebook-{CI_KBASE_USERNAME}
# Update these keys if you change the usernames in .env
spark-notebook-tgu2:
# image: ghcr.io/berdatalakehouse/spark_notebook:main
# platform: linux/amd64
Comment on lines +71 to 75
Copy link

Copilot AI Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The notebook service key was renamed to spark-notebook-tgu2, but this compose file still contains other references to the old hostname "spark-notebook" (e.g., SERVICE_TEMPLATE and SPARK_CONNECT_URL_TEMPLATE). As-is, those services will likely fail to connect/resolve the notebook container; please update the remaining references to match the new naming pattern (spark-notebook-${CI_KBASE_USERNAME}).

Copilot uses AI. Check for mistakes.
build:
Expand All @@ -85,7 +87,7 @@ services:
- CDM_TASK_SERVICE_URL=http://localhost:8080
- SPARK_CLUSTER_MANAGER_API_URL=http://localhost:8000
- SPARK_MASTER_URL=spark://spark-master:7077
- BERDL_POD_IP=spark-notebook
- BERDL_POD_IP=spark-notebook-${CI_KBASE_USERNAME}
- BERDL_HIVE_METASTORE_URI=thrift://hive-metastore:9083

# MINIO CONFIGURATION
Expand All @@ -98,6 +100,9 @@ services:
# DATALAKE MCP SERVER CONFIGURATION
- DATALAKE_MCP_SERVER_URL=http://datalake-mcp-server:8000/apis/mcp

# POLARIS CONFIGURATION (per-user credentials provisioned dynamically by 01-credentials.py)
- POLARIS_CATALOG_URI=http://polaris:8181/api/catalog

# TRINO CONFIGURATION
- TRINO_HOST=trino
- TRINO_PORT=8080
Expand Down Expand Up @@ -148,6 +153,9 @@ services:
- KBASE_ADMIN_ROLES=CDM_JUPYTERHUB_ADMIN
- KBASE_APPROVED_ROLES=BERDL_USER
- REDIS_URL=redis://redis:6379
# Polaris admin credentials (only the governance service needs root access)
- POLARIS_CATALOG_URI=http://polaris:8181/api/catalog
- POLARIS_CREDENTIAL=root:s3cr3t
# Credential store (PostgreSQL)
- MMS_DB_HOST=postgres
- MMS_DB_PORT=5432
Expand Down Expand Up @@ -182,8 +190,16 @@ services:
- KBASE_AUTH_URL=https://ci.kbase.us/services/auth/
- KBASE_REQUIRED_ROLES=BERDL_USER
- MFA_EXEMPT_USERS=${CI_KBASE_USERNAME}
# POLARIS CONFIGURATION (per-user credentials provisioned dynamically)
- POLARIS_CATALOG_URI=http://polaris:8181/api/catalog
- BERDL_REDIS_HOST=redis
- BERDL_REDIS_PORT=6379
volumes:
# Mount the shared /home directory to access all users' credentials
# This allows the MCP server to dynamically read any user's credentials
# from /home/{username}/.berdl_minio_credentials
# In K8s: mount the parent directory or use a shared volume
- users_home:/home:ro
Comment on lines +197 to +202
Copy link

Copilot AI Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment says the MCP server reads /home/{username}/.berdl_minio_credentials, but this PR introduces .berdl_polaris_credentials and doesn’t create a .berdl_minio_credentials file. Please update the comment. Also, mounting the entire /home volume into the MCP server can expose cached secrets (e.g., Polaris client_secret) across users; prefer fetching per-request from the governance API or mounting only the minimal required path/file.

Suggested change
volumes:
# Mount the shared /home directory to access all users' credentials
# This allows the MCP server to dynamically read any user's credentials
# from /home/{username}/.berdl_minio_credentials
# In K8s: mount the parent directory or use a shared volume
- users_home:/home:ro

Copilot uses AI. Check for mistakes.
depends_on:
- hive-metastore
- minio
Expand Down Expand Up @@ -298,7 +314,13 @@ services:
volumes:
- postgres_data:/var/lib/postgresql/data
- ./scripts/init-postgres-readonly.sh:/docker-entrypoint-initdb.d/01-init-postgres-readonly.sh:ro
- ./scripts/init-mms-db.sh:/docker-entrypoint-initdb.d/02-init-mms-db.sh:ro
- ./scripts/init-polaris-db.sh:/docker-entrypoint-initdb.d/02-init-polaris-db.sh:ro
- ./scripts/init-mms-db.sh:/docker-entrypoint-initdb.d/03-init-mms-db.sh:ro
healthcheck:
test: ["CMD-SHELL", "pg_isready -U hive"]
interval: 5s
timeout: 2s
retries: 15

hive-metastore:
# image: ghcr.io/berdatalakehouse/hive_metastore:main
Expand Down Expand Up @@ -369,9 +391,73 @@ services:
echo 'MinIO bucket creation complete.';
"

polaris-bootstrap:
image: apache/polaris-admin-tool:latest
environment:
- POLARIS_PERSISTENCE_TYPE=relational-jdbc
- QUARKUS_DATASOURCE_DB_KIND=postgresql
- QUARKUS_DATASOURCE_JDBC_URL=jdbc:postgresql://postgres:5432/polaris
- QUARKUS_DATASOURCE_USERNAME=hive
- QUARKUS_DATASOURCE_PASSWORD=hivepassword
# Bootstrap exits 3 if already bootstrapped (expected with persistent storage).
# Treat exit 3 as success so docker compose doesn't fail on subsequent runs.
entrypoint: ["sh", "-c"]
command:
- |
java -jar /deployments/polaris-admin-tool.jar bootstrap --realm=POLARIS --credential=POLARIS,root,s3cr3t
rc=$$?
if [ $$rc -eq 3 ]; then
echo "Already bootstrapped — skipping (OK)"
exit 0
fi
exit $$rc
depends_on:
postgres:
condition: service_healthy

polaris:
image: apache/polaris:latest
ports:
Comment on lines +419 to +420
- "8181:8181"
environment:
# Persistence — PostgreSQL instead of in-memory
- POLARIS_PERSISTENCE_TYPE=relational-jdbc
- QUARKUS_DATASOURCE_DB_KIND=postgresql
- QUARKUS_DATASOURCE_JDBC_URL=jdbc:postgresql://postgres:5432/polaris
- QUARKUS_DATASOURCE_USERNAME=hive
- QUARKUS_DATASOURCE_PASSWORD=hivepassword
# Realm configuration
- POLARIS_REALM_NAME=default-realm
# MinIO credentials for Polaris's own S3 access (metadata files).
# Polaris reads endpointInternal + pathStyleAccess from each catalog's storageConfigInfo.
# STS is disabled per-catalog via stsUnavailable:true (not the global SKIP_CREDENTIAL flag).
- AWS_REGION=us-east-1
- AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=minio123
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8182/q/health"]
interval: 10s
timeout: 5s
retries: 5
depends_on:
polaris-bootstrap:
condition: service_completed_successfully

polaris-ui:
image: ghcr.io/binarycat0/apache-polaris-ui:latest
ports:
- "3000:3000"
environment:
# Server-side env vars used by the Next.js API routes for proxying auth
- POLARIS_MANAGEMENT_API_URL=http://polaris:8181/api/management/v1
- POLARIS_CATALOG_API_URL=http://polaris:8181/api/catalog/v1
depends_on:
polaris:
condition: service_started

volumes:
postgres_data:
minio_data:
redis_data:
global_share:
users_home: # Shared volume for all user home directories
users_home: # Shared volume for all user home directories
8 changes: 5 additions & 3 deletions docs/data_sharing_guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,13 @@ All BERDL JupyterHub notebooks automatically import these data governance functi

**Pre-Initialized Client:**
- `governance` - Pre-initialized `DataGovernanceClient()` instance for advanced operations

**Other Auto-Imported Functions:**
- `get_spark_session()` - Create Spark sessions with Delta Lake support
- `get_spark_session()` - Create Spark sessions with Iceberg + Delta Lake support
- `create_namespace_if_not_exists()` - Create namespaces (use `iceberg=True` for Iceberg catalogs)
- Plus many other utility functions for data operations

> **Note:** With the migration to Iceberg, **tenant catalogs** are the recommended way to share data. Create tables in a tenant catalog (e.g., `kbase`) and all members can access them. See the [Iceberg Migration Guide](iceberg_migration_guide.md) for details.

### Quick Start

```python
Expand Down Expand Up @@ -258,7 +260,7 @@ if response.errors:

## Public and Private Table Access (DEPRECATED)

> **⚠️ DEPRECATION WARNING**: Direct public path sharing functions (`make_table_public`, `make_table_private`) are deprecated. Please create a namespace under the `globalusers` tenant for public sharing activities instead.
> **⚠️ DEPRECATION WARNING**: Direct public path sharing functions (`make_table_public`, `make_table_private`) are deprecated. Please create a namespace under the `kbase` tenant for public sharing activities instead.

### Make Tables Publicly Accessible

Expand Down
Loading
Loading