From 39282be98f9b09fe9bfb8eb39d21ac1a3301c319 Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Fri, 28 Feb 2025 19:30:23 -0700
Subject: [PATCH 01/35] adding new model images for training and inference

---
 model_docker_images/Readme.md                 |   0
 model_docker_images/inference/Dockerfile      |  14 ++
 model_docker_images/inference/main.py         | 142 ++++++++++++++
 .../inference/requirements.txt                |   7 +
 .../inference/run_inference_container.sh      |  17 ++
 model_docker_images/scripts/build_deploy.sh   | 178 ++++++++++++++++++
 model_docker_images/scripts/test_inference.py |  91 +++++++++
 model_docker_images/scripts/test_training.py  | 149 +++++++++++++++
 model_docker_images/training/Dockerfile       |  14 ++
 model_docker_images/training/requirements.txt |   5 +
 .../training/run_training_container.sh        |  17 ++
 model_docker_images/training/train.py         | 154 +++++++++++++++
 12 files changed, 788 insertions(+)
 create mode 100644 model_docker_images/Readme.md
 create mode 100644 model_docker_images/inference/Dockerfile
 create mode 100644 model_docker_images/inference/main.py
 create mode 100644 model_docker_images/inference/requirements.txt
 create mode 100755 model_docker_images/inference/run_inference_container.sh
 create mode 100755 model_docker_images/scripts/build_deploy.sh
 create mode 100644 model_docker_images/scripts/test_inference.py
 create mode 100644 model_docker_images/scripts/test_training.py
 create mode 100644 model_docker_images/training/Dockerfile
 create mode 100644 model_docker_images/training/requirements.txt
 create mode 100644 model_docker_images/training/run_training_container.sh
 create mode 100644 model_docker_images/training/train.py

diff --git a/model_docker_images/Readme.md b/model_docker_images/Readme.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/model_docker_images/inference/Dockerfile b/model_docker_images/inference/Dockerfile
new file mode 100644
index 000000000..5130b7831
--- /dev/null
+++ b/model_docker_images/inference/Dockerfile
@@ -0,0 +1,14 @@
+FROM python:3.12-slim
+
+# Copy requirements file
+COPY requirements.txt /tmp/
+
+# Install dependencies
+RUN pip install --no-cache-dir -r /tmp/requirements.txt
+
+# Copy your server code
+COPY main.py /app/
+WORKDIR /app
+
+# Run the API server
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"]
\ No newline at end of file
diff --git a/model_docker_images/inference/main.py b/model_docker_images/inference/main.py
new file mode 100644
index 000000000..75b369396
--- /dev/null
+++ b/model_docker_images/inference/main.py
@@ -0,0 +1,142 @@
+from fastapi import FastAPI, Request, Response
+from contextlib import asynccontextmanager
+import os
+import json
+import numpy as np
+import pandas as pd
+import joblib
+import logging
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Model will be accessible globally
+model = None
+model_metadata = None
+
+
+def _make_dummy_model():
+    """Return a throwaway XGBRegressor fitted on a single row (testing fallback)."""
+    import xgboost as xgb
+    dummy = xgb.XGBRegressor()
+    dummy.fit(np.array([[1, 2, 3]]), np.array([1]))
+    return dummy
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """FastAPI lifespan hook: populate the global model and metadata before serving.
+
+    Reads ``model.joblib`` and ``metadata.json`` from ``MODEL_PATH`` (default
+    ``/opt/ml/model``). A missing model file, or any error while loading,
+    falls back to a dummy model so the server still starts.
+    """
+    global model, model_metadata
+    model_path = os.environ.get('MODEL_PATH', '/opt/ml/model')
+    try:
+        logger.info(f"Loading model from {model_path}")
+        model_file = os.path.join(model_path, 'model.joblib')
+        if os.path.exists(model_file):
+            logger.info(f"Loading model from {model_file}")
+            model = joblib.load(model_file)
+            logger.info(f"Model loaded successfully: {type(model)}")
+        else:
+            logger.warning(f"Model file not found at {model_file}")
+            # Show what is actually in the model directory to aid debugging
+            if os.path.exists(model_path):
+                logger.info(f"Contents of {model_path}: {os.listdir(model_path)}")
+            else:
+                logger.warning(f"Model directory {model_path} not found")
+            logger.warning("Creating a dummy model for testing")
+            model = _make_dummy_model()
+
+        # Metadata is optional: a missing or unreadable file must not block startup
+        try:
+            metadata_file = os.path.join(model_path, 'metadata.json')
+            if not os.path.exists(metadata_file):
+                logger.warning(f"Metadata file not found at {metadata_file}")
+                model_metadata = {'feature_names': None}
+            else:
+                with open(metadata_file, 'r') as f:
+                    model_metadata = json.load(f)
+                logger.info(f"Loaded model metadata: {model_metadata}")
+        except Exception as e:
+            logger.error(f"Error loading model metadata: {e}")
+            model_metadata = {'feature_names': None}
+    except Exception as e:
+        logger.error(f"Error loading model: {e}", exc_info=True)
+        model = _make_dummy_model()
+        model_metadata = {'feature_names': None}
+
+    logger.info("Model initialization complete")
+    yield
+
+    # Nothing is held open, so shutdown only logs
+    logger.info("Cleaning up resources")
+
+
+app = FastAPI(lifespan=lifespan)
+
+
+@app.get('/ping')
+def ping():
+    """SageMaker health check: 200 once a model is loaded, otherwise 404."""
+    # The module-level `model` stays None until the lifespan hook has assigned it
+    status_code = 404 if model is None else 200
+    return Response(status_code=status_code)
+
+
+@app.post('/invocations')
+async def invoke(request: Request):
+    """Score a request body with the loaded model.
+
+    ``text/csv`` bodies are read as header-less CSV; anything else is parsed as
+    JSON and handed to ``pd.DataFrame``. Predictions come back as CSV when
+    ``Accept`` is ``text/csv``, otherwise as ``{"predictions": ...}`` JSON.
+    Any exception is logged and returned as a 500 JSON error body.
+    """
+    from io import BytesIO  # local import: this module's header does not import io
+    logger.info("Received inference request")
+    content_type = request.headers.get('Content-Type', '')
+    accept_type = request.headers.get('Accept', '')
+    logger.info(f"Content-Type: {content_type}, Accept: {accept_type}")
+
+    # Compare only the media type so "text/csv; charset=utf-8" still counts as CSV
+    content_media = content_type.split(';', 1)[0].strip().lower()
+    accept_media = accept_type.split(';', 1)[0].strip().lower()
+    body = await request.body()
+
+    try:
+        if content_media == 'text/csv':
+            # pandas exposes no top-level StringIO; parse the UTF-8 bytes directly
+            data = pd.read_csv(BytesIO(body), header=None)
+            logger.info(f"Parsed CSV data with shape: {data.shape}")
+        else:
+            json_str = body.decode('utf-8')
+            logger.info(f"Raw JSON input: {json_str}")
+            data_json = json.loads(json_str)
+            logger.info(f"Parsed JSON data: {data_json}")
+            # json.loads never returns a DataFrame, so always convert
+            data = pd.DataFrame(data_json)
+
+        logger.info(f"Making prediction with data shape: {data.shape}")
+        predictions = model.predict(data)
+        logger.info(f"Prediction successful, result shape: {len(predictions) if hasattr(predictions, '__len__') else 'scalar'}")
+
+        if accept_media == 'text/csv':
+            result = pd.DataFrame(predictions).to_csv(header=False, index=False)
+            logger.info(f"Returning CSV response: {result}")
+            return Response(content=result, media_type='text/csv')
+        # Default to JSON for everything else
+        result = json.dumps({'predictions': predictions.tolist() if hasattr(predictions, 'tolist') else float(predictions)})
+        logger.info(f"Returning JSON response: {result}")
+        return Response(content=result, media_type='application/json')
+
+    except Exception as e:
+        logger.error(f"Error during inference: {e}", exc_info=True)
+        return Response(
+            content=json.dumps({"error": str(e)}),
+            status_code=500,
+            media_type="application/json"
+        )
diff --git a/model_docker_images/inference/requirements.txt b/model_docker_images/inference/requirements.txt
new file mode 100644
index 000000000..ea8a26be8
--- /dev/null
+++ b/model_docker_images/inference/requirements.txt
@@ -0,0 +1,7 @@
+fastapi==0.115.10
+uvicorn==0.34.0
+scikit-learn==1.6.1
+xgboost-cpu==2.1.4
+pandas==2.2.3
+awswrangler==3.11.0
+joblib==1.4.2
\ No newline at end of file
diff --git a/model_docker_images/inference/run_inference_container.sh b/model_docker_images/inference/run_inference_container.sh
new file mode 100755
index 000000000..e643f3260
--- /dev/null
+++ b/model_docker_images/inference/run_inference_container.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+set -e
+
+echo "🚀 Starting AWS Model Inference Container..."
+docker run -d -p 8080:8080 --name aws_model_test aws_model_image:0.1
+
+echo "⏳ Waiting for server to initialize (5 seconds)..."
+sleep 5
+
+echo "🧪 Running tests against the server..."
+python test_inference.py
+
+echo "🧹 Cleaning up - stopping and removing container..."
+docker stop aws_model_test
+docker rm aws_model_test
+
+echo "✅ Done!"
\ No newline at end of file
diff --git a/model_docker_images/scripts/build_deploy.sh b/model_docker_images/scripts/build_deploy.sh
new file mode 100755
index 000000000..236dc3dcf
--- /dev/null
+++ b/model_docker_images/scripts/build_deploy.sh
@@ -0,0 +1,178 @@
+#!/bin/bash
+set -e
+
+# Usage: build_deploy.sh [VERSION] [--deploy] [--latest]
+#   VERSION defaults to 0.1; flags may be given in any position.
+# Get the directory of this script
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"
+# Get the parent directory (project root)
+PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
+
+# Configuration
+TRAINING_DIR="$PROJECT_ROOT/training"
+INFERENCE_DIR="$PROJECT_ROOT/inference"
+TRAINING_IMAGE="aws_model_training"
+INFERENCE_IMAGE="aws_model_inference"
+
+# Define the regions to deploy to.
+REGION_LIST=("us-east-1" "us-west-2")
+
+# Colors for output
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# Parse arguments: the first word that is not a flag is taken as the image version
+DEPLOY=false
+LATEST=false
+IMAGE_VERSION=""
+for arg in "$@"; do
+    case $arg in
+        --deploy) DEPLOY=true ;;
+        --latest) LATEST=true ;;
+        # Unknown flags are ignored and must never be mistaken for the version
+        -*) ;;
+        *) [ -n "$IMAGE_VERSION" ] || IMAGE_VERSION="$arg" ;;
+    esac
+done
+IMAGE_VERSION="${IMAGE_VERSION:-0.1}"
+
+# Expect AWS_PROFILE to be set in the environment when deploying
+if [ "$DEPLOY" = true ]; then
+    : "${AWS_PROFILE:?AWS_PROFILE environment variable is not set.}"
+fi
+
+# Build a Docker image for linux/amd64 and tag it as <name>:<tag>.
+#   $1 = build context directory (must contain a Dockerfile), $2 = image name, $3 = tag
+build_image() {
+    local dir="$1"
+    local image_name="$2"
+    local tag="$3"
+    local full_name="${image_name}:${tag}"
+
+    echo -e "${YELLOW}Building image: ${full_name}${NC}"
+
+    if [ ! -f "$dir/Dockerfile" ]; then
+        echo "❌ Error: Dockerfile not found in $dir"
+        return 1
+    fi
+
+    echo "Building local Docker image ${full_name} for linux/amd64..."
+    # Quoted so that a checkout path containing spaces stays a single argument
+    docker build --platform linux/amd64 -t "$full_name" "$dir"
+
+    echo -e "${GREEN}✅ Successfully built: ${full_name}${NC}"
+    return 0
+}
+
+# Tag a locally built image and push it to ECR in every region of REGION_LIST.
+#   $1 = image name, $2 = tag, $3 = "true" to also push the image as "latest"
+deploy_image() {
+    local image_name="$1"
+    local tag="$2"
+    local use_latest="$3"
+    local full_name="${image_name}:${tag}"
+    local REGION ECR_REPO AWS_ECR_IMAGE AWS_ECR_LATEST
+
+    for REGION in "${REGION_LIST[@]}"; do
+        echo "Processing region: ${REGION}"
+        # Construct the ECR repository URL (using your account ID 507740646243)
+        ECR_REPO="507740646243.dkr.ecr.${REGION}.amazonaws.com/model_images/${image_name}"
+        AWS_ECR_IMAGE="${ECR_REPO}:${tag}"
+        echo "Logging in to AWS ECR in ${REGION}..."
+        aws ecr get-login-password --region "${REGION}" --profile "${AWS_PROFILE}" | \
+            docker login --username AWS --password-stdin "${ECR_REPO}"
+
+        echo "Tagging image for AWS ECR as ${AWS_ECR_IMAGE}..."
+        docker tag "${full_name}" "${AWS_ECR_IMAGE}"
+        echo "Pushing Docker image to AWS ECR: ${AWS_ECR_IMAGE}..."
+        docker push "${AWS_ECR_IMAGE}"
+
+        if [ "$use_latest" = true ]; then
+            AWS_ECR_LATEST="${ECR_REPO}:latest"
+            echo "Tagging AWS ECR image as latest: ${AWS_ECR_LATEST}..."
+            docker tag "${full_name}" "${AWS_ECR_LATEST}"
+            echo "Pushing Docker image to AWS ECR: ${AWS_ECR_LATEST}..."
+            docker push "${AWS_ECR_LATEST}"
+        fi
+    done
+}
+
+# Build training image
+echo "======================================"
+echo "🏗️  Building training container"
+echo "======================================"
+build_image "$TRAINING_DIR" "$TRAINING_IMAGE" "$IMAGE_VERSION"
+
+# Build inference image
+echo "======================================"
+echo "🏗️  Building inference container"
+echo "======================================"
+build_image "$INFERENCE_DIR" "$INFERENCE_IMAGE" "$IMAGE_VERSION"
+
+echo "======================================"
+echo -e "${GREEN}✅ All builds completed successfully!${NC}"
+echo "======================================"
+
+if [ "$DEPLOY" = true ]; then
+    echo "======================================"
+    echo "🚀 Deploying containers to ECR"
+    echo "======================================"
+
+    # Deploy training image
+    echo "Deploying training image..."
+    deploy_image "$TRAINING_IMAGE" "$IMAGE_VERSION" "$LATEST"
+
+    # Deploy inference image
+    echo "Deploying inference image..."
+    deploy_image "$INFERENCE_IMAGE" "$IMAGE_VERSION" "$LATEST"
+
+    echo "======================================"
+    echo -e "${GREEN}✅ Deployment complete!${NC}"
+    echo "======================================"
+else
+    echo "Local build complete. Use --deploy to push the images to AWS ECR in regions: ${REGION_LIST[*]}."
+
+    # Print information about the built images
+    echo "======================================"
+    echo "📋 Image information:"
+    echo "Training image: ${TRAINING_IMAGE}:${IMAGE_VERSION}"
+    echo "Inference image: ${INFERENCE_IMAGE}:${IMAGE_VERSION}"
+    echo "======================================"
+
+    # Ask if user wants to test the containers
+    read -p "Do you want to test the containers? (y/n) " -n 1 -r
+    echo
+    if [[ $REPLY =~ ^[Yy]$ ]]; then
+        # Test training container
+        echo "======================================"
+        echo "🧪 Testing training container"
+        echo "======================================"
+        python "$SCRIPT_DIR/test_training.py" --image "${TRAINING_IMAGE}:${IMAGE_VERSION}"
+
+        # Test inference container
+        echo "======================================"
+        echo "🧪 Testing inference container"
+        echo "======================================"
+
+        # Start the inference container in the background
+        echo "Starting inference container..."
+        CONTAINER_ID=$(docker run -d -p 8080:8080 "${INFERENCE_IMAGE}:${IMAGE_VERSION}")
+
+        # Wait for the container to initialize
+        echo "Waiting for server to initialize (5 seconds)..."
+        sleep 5
+
+        # Run the test
+        python "$SCRIPT_DIR/test_inference.py"
+
+        # Stop and remove the container
+        echo "Stopping inference container..."
+        docker stop $CONTAINER_ID
+        docker rm $CONTAINER_ID
+
+        echo "======================================"
+        echo -e "${GREEN}✅ Testing completed!${NC}"
+        echo "======================================"
+    fi
+fi
\ No newline at end of file
diff --git a/model_docker_images/scripts/test_inference.py b/model_docker_images/scripts/test_inference.py
new file mode 100644
index 000000000..223aa1dc7
--- /dev/null
+++ b/model_docker_images/scripts/test_inference.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+import requests
+import json
+import argparse
+import time
+
+
+def test_inference_server(host="localhost", port=8080):
+    """Smoke-test the inference server's /ping and /invocations endpoints.
+
+    Args:
+        host: Host name where the inference server is listening.
+        port: TCP port where the inference server is listening.
+
+    Returns:
+        True if both endpoints answered as expected, False otherwise.
+    """
+    base_url = f"http://{host}:{port}"
+
+    # Test 1: Check the health endpoint
+    print("\n🔍 Testing /ping endpoint (health check)...")
+    try:
+        response = requests.get(f"{base_url}/ping", timeout=5)
+    except requests.exceptions.RequestException as e:
+        print(f"❌ Health check failed with error: {e}")
+        print("Is the Docker container running on the specified port?")
+        return False
+    if response.status_code != 200:
+        print(f"❌ Health check failed with status code: {response.status_code}")
+        return False
+    print("✅ Health check succeeded")
+
+    # Test 2: Test the invocations endpoint with simple data
+    print("\n🔍 Testing /invocations endpoint with sample data...")
+    sample_data = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
+
+    try:
+        # Test with JSON data
+        response = requests.post(
+            f"{base_url}/invocations",
+            data=json.dumps(sample_data),
+            headers={"Content-Type": "application/json", "Accept": "application/json"},
+            timeout=5
+        )
+    except requests.exceptions.RequestException as e:
+        print(f"❌ Inference request failed with error: {e}")
+        return False
+    if response.status_code != 200:
+        print(f"❌ Inference request failed with status code: {response.status_code}")
+        print(f"Response text: {response.text}")
+        return False
+    print("✅ Inference request succeeded")
+
+    try:
+        print(f"📊 Response: {response.json()}")
+    except ValueError as e:  # every JSON decode error is a ValueError subclass
+        print(f"❌ Error parsing response as JSON: {e}")
+        print(f"Raw response: {response.text}")
+        # The server may have answered with CSV instead: one float per line
+        try:
+            values = [float(line) for line in response.text.strip().split('\n')]
+        except ValueError:
+            return False
+        print(f"📊 CSV Response (converted): {values}")
+
+    # Reached only when every check above passed
+    print("\n🎉 All tests passed! Your inference server is working correctly.")
+    return True
+
+
+def run_docker_command():
+    """Show the user a ready-made `docker run` invocation for the inference image."""
+    # Printed with a newline separator, i.e. exactly one line per entry
+    hints = ["\n📋 To run your Docker container, use the following command:",
+             "docker run -p 8080:8080 aws_model_inference:latest",
+             "\nThis maps port 8080 from the container to port 8080 on your host machine."]
+    print(*hints, sep="\n")
+
+
+if __name__ == "__main__":
+    import sys  # local import: only the script entry point needs it
+    parser = argparse.ArgumentParser(description="Test the AWS model inference server")
+    parser.add_argument("--host", default="localhost", help="Host where the inference server is running")
+    parser.add_argument("--port", type=int, default=8080, help="Port where the inference server is running")
+    parser.add_argument("--docker-cmd", action="store_true", help="Print the docker run command")
+    args = parser.parse_args()
+
+    if args.docker_cmd:
+        run_docker_command()
+    # Exit non-zero on failure so calling shell scripts can detect a broken server
+    sys.exit(0 if test_inference_server(args.host, args.port) else 1)
diff --git a/model_docker_images/scripts/test_training.py b/model_docker_images/scripts/test_training.py
new file mode 100644
index 000000000..20956e73a
--- /dev/null
+++ b/model_docker_images/scripts/test_training.py
@@ -0,0 +1,149 @@
+import os
+import json
+import argparse
+import tempfile
+import shutil
+import subprocess
+import numpy as np
+import pandas as pd
+
+
+def create_test_data(data_dir, rows=100, cols=5):
+    """Write a synthetic regression dataset to ``<data_dir>/train/train.csv``.
+
+    The target is a noisy linear combination of (at most) the first five
+    features, so fewer than five columns also works. The target is stored as
+    the last column. Returns the path of the CSV file that was written.
+    """
+    print(f"Creating synthetic training data in {data_dir}")
+    # Fixed weights for the first five features; truncated when cols < 5
+    weights = np.array([2.0, 3.0, -1.5, 0.5, -1.0])[:cols]
+    X = np.random.randn(rows, cols)
+    y = X[:, :len(weights)] @ weights + np.random.randn(rows) * 0.1
+
+    feature_names = [f"feature_{i}" for i in range(cols)]
+    df = pd.DataFrame(X, columns=feature_names)
+    df['target'] = y
+
+    train_dir = os.path.join(data_dir, 'train')
+    os.makedirs(train_dir, exist_ok=True)
+    train_file = os.path.join(train_dir, 'train.csv')
+    df.to_csv(train_file, index=False)
+    print(f"Saved {rows} rows of training data to {train_file}")
+    return train_file
+
+
+def create_hyperparameters(config_dir):
+    """Write a sample ``hyperparameters.json`` into ``config_dir``.
+
+    Every value is stored as a string. Returns the path of the file that
+    was written.
+    """
+    print(f"Creating hyperparameters in {config_dir}")
+    os.makedirs(config_dir, exist_ok=True)
+
+    # Sample settings handed to the training container
+    payload = dict(
+        max_depth="6",
+        learning_rate="0.1",
+        n_estimators="100",
+        objective="reg:squarederror",
+    )
+    hyperparameters_file = os.path.join(config_dir, 'hyperparameters.json')
+    with open(hyperparameters_file, 'w') as handle:
+        handle.write(json.dumps(payload))
+
+    print(f"Saved hyperparameters to {hyperparameters_file}")
+    return hyperparameters_file
+
+
+def test_training_container(image_name, temp_dir):
+    """Run the training image against synthetic data and check its outputs.
+
+    Args:
+        image_name: Docker image (name:tag) to run.
+        temp_dir: Host directory laid out like SageMaker's /opt/ml tree and
+            bind-mounted into the container at /opt/ml.
+
+    Returns:
+        True if the container exited cleanly and left at least one file in
+        the model directory, False otherwise.
+    """
+    print(f"\n🔬 Testing training container: {image_name}")
+
+    # Create directory structure to mimic SageMaker
+    input_dir = os.path.join(temp_dir, 'input')
+    data_dir = os.path.join(input_dir, 'data')
+    config_dir = os.path.join(input_dir, 'config')
+    model_dir = os.path.join(temp_dir, 'model')
+    output_dir = os.path.join(temp_dir, 'output')
+    for directory in (data_dir, config_dir, model_dir, output_dir):
+        os.makedirs(directory, exist_ok=True)
+
+    # Create test data and hyperparameters
+    create_test_data(data_dir)
+    create_hyperparameters(config_dir)
+
+    # Mount the whole tree at /opt/ml, the prefix the training script reads from
+    print("\n📦 Running training container...")
+    docker_cmd = [
+        "docker", "run",
+        "--rm",
+        "-v", f"{temp_dir}:/opt/ml",
+        image_name,
+    ]
+
+    try:
+        subprocess.run(docker_cmd, check=True)
+    except subprocess.CalledProcessError as e:
+        print(f"❌ Training failed with error code {e.returncode}")
+
+        # Check if there's a failure file with more details
+        failure_file = os.path.join(output_dir, 'failure')
+        if os.path.exists(failure_file):
+            with open(failure_file, 'r') as f:
+                failure_content = f.read()
+            print(f"Error details:\n{failure_content}")
+        return False
+
+    # A clean exit is not enough: the model directory must not be empty
+    model_files = os.listdir(model_dir)
+    if not model_files:
+        print("❌ Training failed: No model files created")
+        return False
+    print(f"✅ Training succeeded! Model files created: {', '.join(model_files)}")
+
+    # Check for specific expected files (their absence is only a warning)
+    expected_files = ['model.joblib', 'metadata.json']
+    missing_files = [f for f in expected_files if f not in model_files]
+    if missing_files:
+        print(f"⚠️ Warning: Some expected files are missing: {', '.join(missing_files)}")
+    else:
+        print("✅ All expected model files were created")
+    return True
+
+
+def run_training_test(image_name="aws_model_training:latest"):
+    """Exercise the training image inside a scratch directory and report the outcome.
+
+    Returns True when the container test passed, False otherwise.
+    """
+    print("🚀 Starting training container test")
+
+    # The scratch directory is removed as soon as the with-block exits
+    with tempfile.TemporaryDirectory() as scratch:
+        print(f"Using temporary directory: {scratch}")
+        passed = test_training_container(image_name, scratch)
+
+    verdict = "\n🎉 Training container test passed!" if passed else "\n❌ Training container test failed!"
+    print(verdict)
+    return passed
+
+
+if __name__ == "__main__":
+    import sys  # local import: only the script entry point needs it
+    parser = argparse.ArgumentParser(description="Test the AWS model training container")
+    parser.add_argument("--image", default="aws_model_training:latest",
+                        help="Docker image name for the training container")
+    # Propagate the result so `set -e` shell callers stop on a failed test
+    sys.exit(0 if run_training_test(parser.parse_args().image) else 1)
\ No newline at end of file
diff --git a/model_docker_images/training/Dockerfile b/model_docker_images/training/Dockerfile
new file mode 100644
index 000000000..99f6a16f9
--- /dev/null
+++ b/model_docker_images/training/Dockerfile
@@ -0,0 +1,14 @@
+FROM python:3.12-slim
+
+# Copy requirements file
+COPY requirements.txt /tmp/
+
+# Install dependencies
+RUN pip install --no-cache-dir -r /tmp/requirements.txt
+
+# Set up the program in the image
+COPY train.py /opt/program/
+WORKDIR /opt/program
+
+# Set up the entry point
+ENTRYPOINT ["python", "train.py"]
diff --git a/model_docker_images/training/requirements.txt b/model_docker_images/training/requirements.txt
new file mode 100644
index 000000000..b3b7b18dd
--- /dev/null
+++ b/model_docker_images/training/requirements.txt
@@ -0,0 +1,5 @@
+scikit-learn==1.6.1
+xgboost-cpu==2.1.4
+pandas==2.2.3
+awswrangler==3.11.0
+joblib==1.4.2
\ No newline at end of file
diff --git a/model_docker_images/training/run_training_container.sh b/model_docker_images/training/run_training_container.sh
new file mode 100644
index 000000000..73383fa79
--- /dev/null
+++ b/model_docker_images/training/run_training_container.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"
+PARENT_DIR="$(dirname "$SCRIPT_DIR")"
+SCRIPTS_DIR="$PARENT_DIR/scripts"
+
+# Make sure test_training.py exists
+if [ ! -f "$SCRIPTS_DIR/test_training.py" ]; then
+  echo "❌ Error: test_training.py not found in $SCRIPTS_DIR"
+  exit 1
+fi
+
+IMAGE_NAME=${1:-aws_model_training:latest}
+
+echo "🚀 Testing Training Container: $IMAGE_NAME"
+python "$SCRIPTS_DIR/test_training.py" --image "$IMAGE_NAME"
\ No newline at end of file
diff --git a/model_docker_images/training/train.py b/model_docker_images/training/train.py
new file mode 100644
index 000000000..98d97be0b
--- /dev/null
+++ b/model_docker_images/training/train.py
@@ -0,0 +1,154 @@
+import os
+import json
+import sys
+import traceback
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+import xgboost as xgb
+import joblib
+
+# SageMaker paths
+prefix = '/opt/ml/'
+input_path = prefix + 'input/data'
+model_path = os.path.join(prefix, 'model')
+param_path = os.path.join(prefix, 'input/config/hyperparameters.json')
+output_path = os.path.join(prefix, 'output')
+
+# Channel names for training and validation data
+training_channel_name = 'train'
+eval_channel_name = 'validation'
+
+
+# Load hyperparameters
+def load_hyperparameters():
+    """Read the hyperparameter JSON at ``param_path`` and coerce string values.
+
+    Strings that parse as integers become ``int``; other numeric strings,
+    including scientific notation such as ``"1e-3"``, become ``float``; and
+    ``"true"``/``"false"`` (any case) become ``bool``. Everything else,
+    including values that are not strings, is returned unchanged.
+    """
+    # Convert one raw JSON value; non-strings pass through untouched
+    def coerce(value):
+        if not isinstance(value, str):
+            return value
+        lowered = value.lower()
+        if lowered in ('true', 'false'):
+            return lowered == 'true'
+        # Require a digit so words like "nan" or "inf" are not turned into floats
+        if any(ch.isdigit() for ch in value):
+            for cast in (int, float):
+                try:
+                    return cast(value)
+                except ValueError:
+                    continue
+        return value
+
+    with open(param_path, 'r') as tc:
+        hyperparameters = json.load(tc)
+
+    return {key: coerce(value) for key, value in hyperparameters.items()}
+
+
+# Load training data
+def load_data():
+    """Load every CSV file in the training channel into one DataFrame.
+
+    Files are read in sorted path order so the concatenated row order, and
+    therefore the seeded train/validation split, is reproducible.
+
+    Raises:
+        ValueError: If the training channel contains no ``.csv`` files.
+    """
+    train_path = os.path.join(input_path, training_channel_name)
+
+    # os.listdir returns entries in arbitrary order, hence the explicit sort
+    train_files = sorted(
+        os.path.join(train_path, name)
+        for name in os.listdir(train_path)
+        if name.endswith('.csv')
+    )
+    if not train_files:
+        raise ValueError(f"No CSV files found in {train_path}")
+    return pd.concat([pd.read_csv(path) for path in train_files], ignore_index=True)
+
+
+# Train the model
+def train():
+    """Train an XGBoost regressor on the training channel and save it.
+
+    Writes ``model.joblib`` and ``metadata.json`` to ``model_path``. On any
+    error the traceback goes to stderr and, when possible, to
+    ``<output_path>/failure``; the process then exits with status 255.
+    """
+    print("Starting the training process")
+
+    try:
+        hyperparameters = load_hyperparameters()
+        print(f"Loaded hyperparameters: {hyperparameters}")
+        train_data = load_data()
+        print(f"Loaded training data with shape: {train_data.shape}")
+
+        # Assumes last column is the target
+        X = train_data.iloc[:, :-1]
+        y = train_data.iloc[:, -1]
+
+        # Hold out 20% of the rows for validation (fixed seed)
+        X_train, X_val, y_train, y_val = train_test_split(
+            X, y, test_size=0.2, random_state=42
+        )
+
+        # Configure model parameters from hyperparameters or use defaults
+        max_depth = hyperparameters.get('max_depth', 6)
+        learning_rate = hyperparameters.get('learning_rate', 0.1)
+        n_estimators = hyperparameters.get('n_estimators', 100)
+
+        # Create and train model with a simpler approach
+        # Removed early stopping and eval_set to ensure compatibility
+        model = xgb.XGBRegressor(
+            max_depth=max_depth,
+            learning_rate=learning_rate,
+            n_estimators=n_estimators
+        )
+        print("Training model...")
+        model.fit(X_train, y_train)
+
+        # Evaluate on validation set
+        val_score = model.score(X_val, y_val)
+        print(f"Validation R² score: {val_score:.4f}")
+
+        os.makedirs(model_path, exist_ok=True)
+        model_file = os.path.join(model_path, 'model.joblib')
+        metadata_file = os.path.join(model_path, 'metadata.json')
+        # Save additional metadata about the model
+        model_metadata = {
+            'feature_names': X.columns.tolist(),
+            'hyperparameters': hyperparameters,
+            'validation_score': val_score
+        }
+
+        print(f"Saving model to {model_file}")
+        joblib.dump(model, model_file)
+        print(f"Saving metadata to {metadata_file}")
+        with open(metadata_file, 'w') as f:
+            json.dump(model_metadata, f)
+        print("Training completed successfully")
+    except Exception as e:
+        trc = traceback.format_exc()
+        message = 'Exception during training: ' + str(e) + '\n' + trc
+        # Printing this causes the exception to be in the training job logs
+        print(message, file=sys.stderr)
+        # Best effort: a missing or read-only output dir must not hide the real error
+        try:
+            os.makedirs(output_path, exist_ok=True)
+            with open(os.path.join(output_path, 'failure'), 'w') as s:
+                s.write(message)
+        except OSError as write_error:
+            print(f"Could not write failure file: {write_error}", file=sys.stderr)
+        # A non-zero exit code causes the training job to be marked as Failed
+        sys.exit(255)
+
+
+if __name__ == '__main__':
+    train()
\ No newline at end of file

From 8a9a56629f34d7a30c374a44c3b214ab2d7135c0 Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Fri, 28 Feb 2025 20:50:41 -0700
Subject: [PATCH 02/35] adding the code/docker for the workbench model image
 generation (WIP)

---
 .../inference/run_inference_container.sh      | 17 -------
 .../inference/test_container.sh               | 50 +++++++++++++++++++
 model_docker_images/scripts/test_training.py  |  1 -
 model_docker_images/training/Dockerfile       | 11 ++--
 .../training/run_training_container.sh        | 17 -------
 .../training/test_container.sh                | 25 ++++++++++
 model_docker_images/training/train.py         |  1 -
 7 files changed, 82 insertions(+), 40 deletions(-)
 delete mode 100755 model_docker_images/inference/run_inference_container.sh
 create mode 100755 model_docker_images/inference/test_container.sh
 delete mode 100644 model_docker_images/training/run_training_container.sh
 create mode 100755 model_docker_images/training/test_container.sh

diff --git a/model_docker_images/inference/run_inference_container.sh b/model_docker_images/inference/run_inference_container.sh
deleted file mode 100755
index e643f3260..000000000
--- a/model_docker_images/inference/run_inference_container.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-set -e
-
-echo "🚀 Starting AWS Model Inference Container..."
-docker run -d -p 8080:8080 --name aws_model_test aws_model_image:0.1
-
-echo "⏳ Waiting for server to initialize (5 seconds)..."
-sleep 5
-
-echo "🧪 Running tests against the server..."
-python test_inference.py
-
-echo "🧹 Cleaning up - stopping and removing container..."
-docker stop aws_model_test
-docker rm aws_model_test
-
-echo "✅ Done!"
\ No newline at end of file
diff --git a/model_docker_images/inference/test_container.sh b/model_docker_images/inference/test_container.sh
new file mode 100755
index 000000000..3157b3df7
--- /dev/null
+++ b/model_docker_images/inference/test_container.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+set -e
+
+# Determine script and project directories
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"
+PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
+SCRIPTS_DIR="$PROJECT_ROOT/scripts"
+
+# Default image name
+DEFAULT_IMAGE="aws_model_inference:0.1"
+IMAGE_NAME=${1:-$DEFAULT_IMAGE}
+
+# Port to use for testing
+PORT=8080
+
+echo "📋 Inference Container Test Script"
+echo "======================================"
+
+# Make sure test script exists
+if [ ! -f "$SCRIPTS_DIR/test_inference.py" ]; then
+  echo "❌ Error: test_inference.py not found in $SCRIPTS_DIR"
+  exit 1
+fi
+
+# Start the inference container with proper log settings
+echo "🚀 Starting inference container: $IMAGE_NAME"
+CONTAINER_ID=$(docker run -d -p $PORT:$PORT -e PYTHONUNBUFFERED=1 "$IMAGE_NAME")
+
+# Follow logs in the background
+docker logs -f $CONTAINER_ID &
+LOGS_PID=$!
+
+# Ensure container and log process are stopped on script exit
+function cleanup {
+  echo "🧹 Stopping log process and container..."
+  kill $LOGS_PID 2>/dev/null || true
+  docker stop $CONTAINER_ID >/dev/null 2>&1
+  docker rm $CONTAINER_ID >/dev/null 2>&1
+}
+trap cleanup EXIT
+
+# Wait for container to initialize
+echo "⏳ Waiting for server to initialize (5 seconds)..."
+sleep 5
+
+# Run the test
+echo "🧪 Testing inference container..."
+python "$SCRIPTS_DIR/test_inference.py" --host localhost --port $PORT
+
+echo "======================================"
\ No newline at end of file
diff --git a/model_docker_images/scripts/test_training.py b/model_docker_images/scripts/test_training.py
index 20956e73a..ecba030a1 100644
--- a/model_docker_images/scripts/test_training.py
+++ b/model_docker_images/scripts/test_training.py
@@ -2,7 +2,6 @@
 import json
 import argparse
 import tempfile
-import shutil
 import subprocess
 import numpy as np
 import pandas as pd
diff --git a/model_docker_images/training/Dockerfile b/model_docker_images/training/Dockerfile
index 99f6a16f9..74f6caf44 100644
--- a/model_docker_images/training/Dockerfile
+++ b/model_docker_images/training/Dockerfile
@@ -6,9 +6,12 @@ COPY requirements.txt /tmp/
 # Install dependencies
 RUN pip install --no-cache-dir -r /tmp/requirements.txt
 
-# Set up the program in the image
-COPY train.py /opt/program/
+# Copy the SageMaker entrypoint script
+COPY sagemaker_entrypoint.py /opt/program/
 WORKDIR /opt/program
 
-# Set up the entry point
-ENTRYPOINT ["python", "train.py"]
+# Make the entrypoint executable
+RUN chmod +x /opt/program/sagemaker_entrypoint.py
+
+# Set the entrypoint (invoke via python: the script has no shebang line)
+ENTRYPOINT ["python", "/opt/program/sagemaker_entrypoint.py"]
\ No newline at end of file
diff --git a/model_docker_images/training/run_training_container.sh b/model_docker_images/training/run_training_container.sh
deleted file mode 100644
index 73383fa79..000000000
--- a/model_docker_images/training/run_training_container.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-set -e
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"
-PARENT_DIR="$(dirname "$SCRIPT_DIR")"
-SCRIPTS_DIR="$PARENT_DIR/scripts"
-
-# Make sure test_training.py exists
-if [ ! -f "$SCRIPTS_DIR/test_training.py" ]; then
-  echo "❌ Error: test_training.py not found in $SCRIPTS_DIR"
-  exit 1
-fi
-
-IMAGE_NAME=${1:-aws_model_training:latest}
-
-echo "🚀 Testing Training Container: $IMAGE_NAME"
-python "$SCRIPTS_DIR/test_training.py" --image "$IMAGE_NAME"
\ No newline at end of file
diff --git a/model_docker_images/training/test_container.sh b/model_docker_images/training/test_container.sh
new file mode 100755
index 000000000..cdc1382b4
--- /dev/null
+++ b/model_docker_images/training/test_container.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+set -e
+
+# Determine script and project directories
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"
+PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
+SCRIPTS_DIR="$PROJECT_ROOT/scripts"
+
+# Default image name with latest tag
+DEFAULT_IMAGE="aws_model_training:0.1"
+IMAGE_NAME=${1:-$DEFAULT_IMAGE}
+
+echo "📋 Training Container Test Script"
+echo "======================================"
+
+# Make sure test_training.py exists
+if [ ! -f "$SCRIPTS_DIR/test_training.py" ]; then
+  echo "❌ Error: test_training.py not found in $SCRIPTS_DIR"
+  exit 1
+fi
+
+echo "🚀 Testing Training Container: $IMAGE_NAME"
+python "$SCRIPTS_DIR/test_training.py" --image "$IMAGE_NAME"
+
+echo "======================================"
\ No newline at end of file
diff --git a/model_docker_images/training/train.py b/model_docker_images/training/train.py
index 98d97be0b..d88c2482c 100644
--- a/model_docker_images/training/train.py
+++ b/model_docker_images/training/train.py
@@ -3,7 +3,6 @@
 import sys
 import traceback
 import pandas as pd
-import numpy as np
 from sklearn.model_selection import train_test_split
 import xgboost as xgb
 import joblib

From d25531586893706ab4b653d3df54c8bbdc63031f Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Fri, 28 Feb 2025 20:51:13 -0700
Subject: [PATCH 03/35] adding the code/docker for the workbench model image
 generation (WIP)

---
 .../training/sagemaker_entrypoint.py          | 162 ++++++++++++++++++
 1 file changed, 162 insertions(+)
 create mode 100644 model_docker_images/training/sagemaker_entrypoint.py

diff --git a/model_docker_images/training/sagemaker_entrypoint.py b/model_docker_images/training/sagemaker_entrypoint.py
new file mode 100644
index 000000000..50f3acc4f
--- /dev/null
+++ b/model_docker_images/training/sagemaker_entrypoint.py
@@ -0,0 +1,162 @@
+import os
+import sys
+import json
+import tarfile
+import subprocess
+import logging
+import boto3
+from urllib.parse import urlparse
+
+# Set up logging
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger('sagemaker-entry-point')
+
+
+def download_and_extract_s3(s3_uri, target_dir="/opt/ml/code"):
+    """Download and extract code package from S3."""
+    logger.info(f"Downloading source package from {s3_uri}...")
+    parsed = urlparse(s3_uri)
+    bucket = parsed.netloc
+    key = parsed.path.lstrip("/")
+    local_tar = "/tmp/code_package.tar.gz"
+
+    try:
+        s3 = boto3.client("s3")
+        s3.download_file(bucket, key, local_tar)
+        logger.info(f"Download successful, tar file size: {os.path.getsize(local_tar)} bytes")
+
+        os.makedirs(target_dir, exist_ok=True)
+        with tarfile.open(local_tar, "r:gz") as tar:
+            tar.extractall(path=target_dir, filter="data")  # refuse members that would escape target_dir
+
+        logger.info(f"Files in {target_dir} after extraction: {os.listdir(target_dir)}")
+        return target_dir
+    except Exception as e:
+        logger.error(f"Error downloading from S3: {str(e)}")
+        sys.exit(1)
+
+
+def install_requirements(requirements_path):
+    """Install Python dependencies from requirements file."""
+    if os.path.exists(requirements_path):
+        logger.info(f"Installing dependencies from {requirements_path}...")
+        try:
+            subprocess.check_call([
+                sys.executable, "-m", "pip", "install", "-r", requirements_path
+            ])
+            logger.info("Requirements installation completed successfully.")
+        except subprocess.CalledProcessError as e:
+            logger.error(f"Error installing requirements: {str(e)}")
+            sys.exit(1)
+    else:
+        logger.info(f"No requirements file found at {requirements_path}")
+
+
+def setup_sagemaker_environment():
+    """Set up SageMaker environment variables based on /opt/ml structure."""
+    env_vars = {
+        "SM_MODEL_DIR": "/opt/ml/model",
+        "SM_OUTPUT_DATA_DIR": "/opt/ml/output/data",
+        "SM_CHANNEL_TRAIN": "/opt/ml/input/data/train",
+        "SM_OUTPUT_DIR": "/opt/ml/output",
+        "SM_INPUT_DIR": "/opt/ml/input",
+        "SM_INPUT_CONFIG_DIR": "/opt/ml/input/config"
+    }
+
+    # Set the environment variables
+    for key, value in env_vars.items():
+        os.environ[key] = str(value)
+
+    logger.info(f"Set SageMaker environment variables: {list(env_vars.keys())}")
+
+
+def main():
+    logger.info("Starting SageMaker container entry point")
+
+    # Read hyperparameters
+    hyperparameters_path = '/opt/ml/input/config/hyperparameters.json'
+    if not os.path.exists(hyperparameters_path):
+        logger.error("Error: hyperparameters.json not found!")
+        sys.exit(1)
+
+    with open(hyperparameters_path, 'r') as f:
+        hyperparameters = json.load(f)
+        logger.info(f"Hyperparameters: {hyperparameters}")
+
+    # Set up environment based on hyperparameters
+    # Get program name from hyperparameters or environment variable
+    if 'sagemaker_program' in hyperparameters:
+        program = hyperparameters['sagemaker_program'].strip('"\'')
+        os.environ['SAGEMAKER_PROGRAM'] = program
+    elif 'SAGEMAKER_PROGRAM' in os.environ:
+        program = os.environ['SAGEMAKER_PROGRAM']
+    else:
+        logger.error("Error: sagemaker_program not found in hyperparameters or environment!")
+        sys.exit(1)
+
+    logger.info(f"Using program: {program}")
+
+    # Get source directory from hyperparameters
+    if 'sagemaker_submit_directory' in hyperparameters:
+        s3_source = hyperparameters['sagemaker_submit_directory'].strip('"\'')
+        logger.info(f"Downloading source from: {s3_source}")
+
+        # Download and extract source code
+        submit_dir = download_and_extract_s3(s3_source)
+
+        # Install requirements
+        install_requirements(os.path.join(submit_dir, "requirements.txt"))
+    else:
+        logger.info("No sagemaker_submit_directory specified, assuming code is already in /opt/ml/code")
+        submit_dir = "/opt/ml/code"
+
+        # Check if directory exists
+        if not os.path.exists(submit_dir):
+            logger.error(f"Code directory {submit_dir} does not exist!")
+            sys.exit(1)
+
+        # List code directory contents for debugging
+        logger.info(f"Contents of {submit_dir}:")
+        try:
+            output = subprocess.check_output(['ls', '-la', submit_dir])
+            logger.info(output.decode('utf-8'))
+        except Exception as e:
+            logger.error(f"Failed to list directory: {e}")
+
+    # Set up SageMaker environment variables
+    setup_sagemaker_environment()
+
+    # Ensure directories exist
+    os.makedirs(os.environ["SM_MODEL_DIR"], exist_ok=True)
+    os.makedirs(os.environ["SM_OUTPUT_DATA_DIR"], exist_ok=True)
+
+    # Locate entry point script
+    entry_point = os.path.join(submit_dir, program)
+    if not os.path.exists(entry_point):
+        logger.error(f"Error: Entry point '{entry_point}' not found!")
+        sys.exit(1)
+
+    logger.info(f"Running entry point: {entry_point}")
+    sys.stdout.flush()
+
+    # Execute with proper arguments
+    cmd = [
+        sys.executable, entry_point,
+        "--model-dir", os.environ["SM_MODEL_DIR"],
+        "--output-data-dir", os.environ["SM_OUTPUT_DATA_DIR"],
+        "--train", os.environ["SM_CHANNEL_TRAIN"]
+    ]
+
+    logger.info(f"Executing: {' '.join(cmd)}")
+
+    # Replace current process with the entry point script and arguments
+    try:
+        os.execv(sys.executable, cmd)
+    except Exception as e:
+        logger.error(f"Failed to execute entry point: {e}")
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()

From 49f0032b7be3b7e93f812e0b3841d0b77f310626 Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sat, 1 Mar 2025 12:09:45 -0700
Subject: [PATCH 04/35] adding the code/docker for the workbench model image
 generation (WIP)

---
 model_docker_images/inference/Dockerfile      |   3 +
 model_docker_images/scripts/build_deploy.sh   |  41 +-
 model_docker_images/scripts/test_training.py  | 148 -------
 model_docker_images/tests/data/abalone_sm.csv | 100 +++++
 .../tests/example_model_script.py             | 379 ++++++++++++++++++
 model_docker_images/tests/run_tests.sh        |  48 +++
 .../{scripts => tests}/test_inference.py      |   0
 model_docker_images/tests/test_training.py    | 167 ++++++++
 model_docker_images/training/Dockerfile       |   3 +
 .../training/sagemaker_entrypoint.py          |  20 +-
 .../training/test_container.sh                |  25 --
 model_docker_images/training/train.py         | 153 -------
 12 files changed, 719 insertions(+), 368 deletions(-)
 delete mode 100644 model_docker_images/scripts/test_training.py
 create mode 100644 model_docker_images/tests/data/abalone_sm.csv
 create mode 100644 model_docker_images/tests/example_model_script.py
 create mode 100644 model_docker_images/tests/run_tests.sh
 rename model_docker_images/{scripts => tests}/test_inference.py (100%)
 create mode 100644 model_docker_images/tests/test_training.py
 delete mode 100755 model_docker_images/training/test_container.sh
 delete mode 100644 model_docker_images/training/train.py

diff --git a/model_docker_images/inference/Dockerfile b/model_docker_images/inference/Dockerfile
index 5130b7831..a09da2460 100644
--- a/model_docker_images/inference/Dockerfile
+++ b/model_docker_images/inference/Dockerfile
@@ -1,5 +1,8 @@
 FROM python:3.12-slim
 
+# Install Vim (skip recommended extras and drop the apt cache to keep the image small)
+RUN apt-get update && apt-get install -y --no-install-recommends vim && rm -rf /var/lib/apt/lists/*
+
 # Copy requirements file
 COPY requirements.txt /tmp/
 
diff --git a/model_docker_images/scripts/build_deploy.sh b/model_docker_images/scripts/build_deploy.sh
index 236dc3dcf..6ca52bdc2 100755
--- a/model_docker_images/scripts/build_deploy.sh
+++ b/model_docker_images/scripts/build_deploy.sh
@@ -74,7 +74,7 @@ deploy_image() {
 
     for REGION in "${REGION_LIST[@]}"; do
         echo "Processing region: ${REGION}"
-        # Construct the ECR repository URL (using your account ID 507740646243)
+        # Construct the ECR repository URL
         ECR_REPO="507740646243.dkr.ecr.${REGION}.amazonaws.com/model_images/${image_name}"
         AWS_ECR_IMAGE="${ECR_REPO}:${tag}"
 
@@ -140,39 +140,6 @@ else
     echo "Inference image: ${INFERENCE_IMAGE}:${IMAGE_VERSION}"
     echo "======================================"
 
-    # Ask if user wants to test the containers
-    read -p "Do you want to test the containers? (y/n) " -n 1 -r
-    echo
-    if [[ $REPLY =~ ^[Yy]$ ]]; then
-        # Test training container
-        echo "======================================"
-        echo "🧪 Testing training container"
-        echo "======================================"
-        python "$SCRIPT_DIR/test_training.py" --image "${TRAINING_IMAGE}:${IMAGE_VERSION}"
-
-        # Test inference container
-        echo "======================================"
-        echo "🧪 Testing inference container"
-        echo "======================================"
-
-        # Start the inference container in the background
-        echo "Starting inference container..."
-        CONTAINER_ID=$(docker run -d -p 8080:8080 "${INFERENCE_IMAGE}:${IMAGE_VERSION}")
-
-        # Wait for the container to initialize
-        echo "Waiting for server to initialize (5 seconds)..."
-        sleep 5
-
-        # Run the test
-        python "$SCRIPT_DIR/test_inference.py"
-
-        # Stop and remove the container
-        echo "Stopping inference container..."
-        docker stop $CONTAINER_ID
-        docker rm $CONTAINER_ID
-
-        echo "======================================"
-        echo -e "${GREEN}✅ Testing completed!${NC}"
-        echo "======================================"
-    fi
-fi
\ No newline at end of file
+    # Inform about testing option (run_tests.sh lives in tests/ and is not executable, so run it via bash)
+    echo "To test these containers, run: bash $PROJECT_ROOT/tests/run_tests.sh ${IMAGE_VERSION}"
+fi
diff --git a/model_docker_images/scripts/test_training.py b/model_docker_images/scripts/test_training.py
deleted file mode 100644
index ecba030a1..000000000
--- a/model_docker_images/scripts/test_training.py
+++ /dev/null
@@ -1,148 +0,0 @@
-import os
-import json
-import argparse
-import tempfile
-import subprocess
-import numpy as np
-import pandas as pd
-
-
-def create_test_data(data_dir, rows=100, cols=5):
-    """Create synthetic training data for testing."""
-    print(f"Creating synthetic training data in {data_dir}")
-
-    # Generate synthetic features and target
-    X = np.random.randn(rows, cols)
-    y = 2 * X[:, 0] + 3 * X[:, 1] - 1.5 * X[:, 2] + 0.5 * X[:, 3] - X[:, 4] + np.random.randn(rows) * 0.1
-
-    # Create dataframe
-    cols = [f"feature_{i}" for i in range(cols)]
-    df = pd.DataFrame(X, columns=cols)
-    df['target'] = y
-
-    # Create train directory
-    train_dir = os.path.join(data_dir, 'train')
-    os.makedirs(train_dir, exist_ok=True)
-
-    # Save to CSV
-    train_file = os.path.join(train_dir, 'train.csv')
-    df.to_csv(train_file, index=False)
-    print(f"Saved {rows} rows of training data to {train_file}")
-
-    return train_file
-
-
-def create_hyperparameters(config_dir):
-    """Create hyperparameters.json file for the training container."""
-    print(f"Creating hyperparameters in {config_dir}")
-
-    # Define hyperparameters
-    hyperparameters = {
-        "max_depth": "6",
-        "learning_rate": "0.1",
-        "n_estimators": "100",
-        "objective": "reg:squarederror"
-    }
-
-    # Create config directory
-    os.makedirs(config_dir, exist_ok=True)
-
-    # Save hyperparameters
-    hyperparameters_file = os.path.join(config_dir, 'hyperparameters.json')
-    with open(hyperparameters_file, 'w') as f:
-        json.dump(hyperparameters, f)
-
-    print(f"Saved hyperparameters to {hyperparameters_file}")
-    return hyperparameters_file
-
-
-def test_training_container(image_name, temp_dir):
-    """Run the training container with test data and verify outputs."""
-    print(f"\n🔬 Testing training container: {image_name}")
-
-    # Create directory structure to mimic SageMaker
-    input_dir = os.path.join(temp_dir, 'input')
-    data_dir = os.path.join(input_dir, 'data')
-    config_dir = os.path.join(input_dir, 'config')
-    model_dir = os.path.join(temp_dir, 'model')
-    output_dir = os.path.join(temp_dir, 'output')
-
-    os.makedirs(data_dir, exist_ok=True)
-    os.makedirs(config_dir, exist_ok=True)
-    os.makedirs(model_dir, exist_ok=True)
-    os.makedirs(output_dir, exist_ok=True)
-
-    # Create test data and hyperparameters
-    create_test_data(data_dir)
-    create_hyperparameters(config_dir)
-
-    # Run the container
-    print("\n📦 Running training container...")
-
-    cmd = [
-        "docker", "run",
-        "--rm",
-        "-v", f"{temp_dir}:/opt/ml",
-        image_name
-    ]
-
-    try:
-        # Execute the training container
-        subprocess.run(cmd, check=True)
-
-        # Check if model files were created
-        model_files = os.listdir(model_dir)
-        if not model_files:
-            print("❌ Training failed: No model files created")
-            return False
-
-        print(f"✅ Training succeeded! Model files created: {', '.join(model_files)}")
-
-        # Check for specific expected files
-        expected_files = ['model.joblib', 'metadata.json']
-        missing_files = [f for f in expected_files if f not in model_files]
-
-        if missing_files:
-            print(f"⚠️ Warning: Some expected files are missing: {', '.join(missing_files)}")
-        else:
-            print("✅ All expected model files were created")
-
-        return True
-
-    except subprocess.CalledProcessError as e:
-        print(f"❌ Training failed with error code {e.returncode}")
-
-        # Check if there's a failure file with more details
-        failure_file = os.path.join(output_dir, 'failure')
-        if os.path.exists(failure_file):
-            with open(failure_file, 'r') as f:
-                failure_content = f.read()
-            print(f"Error details:\n{failure_content}")
-
-        return False
-
-
-def run_training_test(image_name="aws_model_training:latest"):
-    """Run the training container test with a temporary directory."""
-    print("🚀 Starting training container test")
-
-    # Create temporary directory for training data
-    with tempfile.TemporaryDirectory() as temp_dir:
-        print(f"Using temporary directory: {temp_dir}")
-        success = test_training_container(image_name, temp_dir)
-
-    if success:
-        print("\n🎉 Training container test passed!")
-    else:
-        print("\n❌ Training container test failed!")
-
-    return success
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Test the AWS model training container")
-    parser.add_argument("--image", default="aws_model_training:latest",
-                        help="Docker image name for the training container")
-
-    args = parser.parse_args()
-    run_training_test(args.image)
\ No newline at end of file
diff --git a/model_docker_images/tests/data/abalone_sm.csv b/model_docker_images/tests/data/abalone_sm.csv
new file mode 100644
index 000000000..0198e6bc8
--- /dev/null
+++ b/model_docker_images/tests/data/abalone_sm.csv
@@ -0,0 +1,100 @@
+sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,class_number_of_rings,auto_id
+M,0.53,0.43,0.135,0.879,0.28,0.2165,0.25,10,3400
+M,0.645,0.49,0.16,1.251,0.5355,0.3345,0.3165,9,2614
+F,0.69,0.545,0.205,1.933,0.7855,0.429,0.498,13,2618
+I,0.55,0.4,0.135,0.717,0.3315,0.1495,0.221,9,3663
+I,0.415,0.33,0.09,0.3595,0.17,0.081,0.09,6,920
+M,0.465,0.36,0.105,0.488,0.188,0.0845,0.19,10,3137
+M,0.59,0.475,0.155,0.857,0.356,0.174,0.28,13,2225
+M,0.52,0.425,0.155,0.7735,0.297,0.123,0.255,17,3271
+M,0.695,0.525,0.175,1.742,0.696,0.389,0.505,12,2621
+F,0.585,0.42,0.155,1.034,0.437,0.2225,0.32,11,3667
+I,0.525,0.385,0.13,0.607,0.2355,0.125,0.195,8,2754
+F,0.675,0.51,0.185,1.473,0.6295,0.3025,0.4245,11,1971
+I,0.435,0.335,0.105,0.3535,0.156,0.05,0.1135,7,3016
+I,0.435,0.345,0.12,0.3215,0.13,0.056,0.1185,7,1844
+I,0.525,0.4,0.125,0.5655,0.2435,0.119,0.175,8,3810
+F,0.52,0.395,0.18,0.64,0.158,0.11,0.245,22,675
+I,0.415,0.315,0.105,0.33,0.1405,0.0705,0.095,6,2508
+I,0.415,0.325,0.115,0.3285,0.1405,0.051,0.106,12,2378
+I,0.575,0.44,0.15,0.983,0.486,0.215,0.239,8,3666
+I,0.55,0.435,0.14,0.7535,0.3285,0.1555,0.2325,10,1314
+M,0.675,0.515,0.15,1.312,0.556,0.2845,0.4115,11,1970
+I,0.43,0.325,0.09,0.425,0.217,0.087,0.095,7,926
+F,0.67,0.54,0.165,1.5015,0.518,0.358,0.505,14,420
+M,0.745,0.565,0.215,1.931,0.896,0.4585,0.5,11,1205
+M,0.57,0.45,0.14,0.9275,0.477,0.1605,0.2515,8,3819
+F,0.605,0.48,0.175,1.1685,0.4815,0.2305,0.356,9,3822
+M,0.48,0.375,0.115,0.6765,0.3205,0.1065,0.17,6,949
+F,0.58,0.45,0.17,0.9705,0.4615,0.232,0.248,9,2908
+I,0.42,0.31,0.095,0.279,0.1255,0.051,0.088,6,1078
+M,0.705,0.56,0.22,1.981,0.8175,0.3085,0.76,14,168
+F,0.59,0.465,0.16,1.1005,0.506,0.2525,0.295,13,2259
+I,0.33,0.25,0.095,0.2085,0.102,0.0395,0.052,7,1220
+F,0.595,0.465,0.155,1.026,0.4645,0.112,0.305,12,1351
+I,0.36,0.275,0.11,0.2335,0.095,0.0525,0.085,10,440
+I,0.46,0.35,0.115,0.4155,0.18,0.098,0.1175,7,1092
+F,0.675,0.52,0.175,1.494,0.7365,0.3055,0.37,9,4100
+F,0.575,0.46,0.165,1.065,0.4985,0.2145,0.2815,8,3454
+F,0.395,0.3,0.105,0.3375,0.1435,0.0755,0.098,12,3323
+M,0.665,0.525,0.165,1.338,0.5515,0.3575,0.35,18,32
+I,0.52,0.395,0.125,0.5805,0.2445,0.146,0.165,9,1864
+I,0.585,0.475,0.16,1.0505,0.48,0.234,0.285,10,1342
+M,0.5,0.375,0.15,0.636,0.2535,0.145,0.19,10,690
+I,0.51,0.395,0.155,0.5395,0.2465,0.1085,0.167,8,2650
+I,0.315,0.23,0.08,0.1375,0.0545,0.031,0.0445,5,1217
+F,0.47,0.355,0.13,0.5465,0.2005,0.126,0.185,14,564
+M,0.58,0.47,0.165,1.041,0.54,0.166,0.279,9,3570
+F,0.55,0.425,0.135,0.8515,0.362,0.196,0.27,14,41
+F,0.47,0.36,0.13,0.472,0.182,0.114,0.15,10,304
+I,0.505,0.39,0.15,0.685,0.362,0.131,0.156,8,962
+F,0.55,0.44,0.135,0.8435,0.434,0.1995,0.185,8,2659
+I,0.45,0.345,0.135,0.443,0.1975,0.0875,0.1175,14,571
+I,0.44,0.355,0.165,0.435,0.159,0.105,0.14,16,2402
+M,0.4,0.32,0.095,0.303,0.1335,0.06,0.1,7,51
+I,0.295,0.225,0.09,0.1105,0.0405,0.0245,0.032,7,709
+I,0.445,0.355,0.095,0.3615,0.1415,0.0785,0.12,8,3540
+I,0.47,0.345,0.14,0.4615,0.229,0.1105,0.116,9,1452
+M,0.635,0.525,0.205,1.484,0.55,0.3115,0.43,20,278
+I,0.415,0.315,0.1,0.3645,0.1765,0.0795,0.095,8,2632
+I,0.435,0.335,0.11,0.383,0.1555,0.0675,0.135,12,2374
+F,0.525,0.415,0.15,0.7155,0.2355,0.171,0.27,13,3949
+I,0.55,0.445,0.145,0.783,0.3045,0.157,0.265,11,3036
+F,0.57,0.46,0.17,1.1,0.4125,0.2205,0.38,14,2252
+M,0.515,0.4,0.14,0.6335,0.288,0.145,0.168,9,2020
+F,0.525,0.405,0.115,0.72,0.3105,0.1915,0.2,14,3192
+F,0.565,0.4,0.13,0.6975,0.3075,0.1665,0.18,8,983
+M,0.675,0.515,0.145,1.265,0.6025,0.299,0.325,10,3596
+F,0.37,0.29,0.115,0.25,0.111,0.057,0.075,9,591
+F,0.475,0.365,0.13,0.4805,0.1905,0.114,0.1475,12,2422
+F,0.55,0.415,0.18,1.1655,0.502,0.301,0.311,9,3731
+M,0.6,0.475,0.19,1.0875,0.403,0.2655,0.325,14,336
+F,0.44,0.34,0.14,0.482,0.186,0.1085,0.16,9,205
+I,0.315,0.23,0.0,0.134,0.0575,0.0285,0.3505,6,3996
+I,0.325,0.24,0.07,0.152,0.0565,0.0305,0.054,8,2041
+I,0.47,0.345,0.115,0.4885,0.2005,0.108,0.166,11,603
+F,0.505,0.375,0.18,0.568,0.2325,0.1495,0.17,12,343
+M,0.635,0.49,0.16,1.101,0.534,0.1865,0.3455,10,1389
+M,0.535,0.41,0.135,0.862,0.2855,0.1525,0.32,14,738
+F,0.595,0.435,0.15,0.9,0.4175,0.17,0.265,8,1651
+M,0.515,0.4,0.16,0.8175,0.2515,0.156,0.3,23,2436
+M,0.455,0.35,0.11,0.458,0.2,0.111,0.1305,8,3089
+I,0.42,0.315,0.115,0.355,0.1895,0.065,0.087,6,2047
+M,0.465,0.34,0.105,0.486,0.231,0.1035,0.1225,9,2571
+M,0.72,0.565,0.2,2.1055,1.017,0.363,0.494,12,1527
+F,0.54,0.415,0.15,0.8115,0.3875,0.1875,0.2035,9,2833
+F,0.655,0.455,0.17,1.275,0.583,0.303,0.333,8,3621
+M,0.675,0.525,0.185,1.587,0.6935,0.336,0.395,13,356
+F,0.555,0.43,0.135,0.812,0.4055,0.163,0.2215,9,3494
+M,0.41,0.3,0.1,0.301,0.124,0.069,0.09,9,3362
+I,0.4,0.31,0.1,0.2875,0.1145,0.0635,0.095,10,2320
+I,0.32,0.215,0.095,0.305,0.14,0.067,0.0885,6,2975
+I,0.27,0.205,0.05,0.084,0.03,0.0185,0.029,6,3629
+F,0.625,0.5,0.15,0.953,0.3445,0.2235,0.305,15,495
+M,0.59,0.47,0.15,0.9955,0.481,0.232,0.24,8,1152
+M,0.59,0.465,0.14,1.046,0.4695,0.263,0.263,7,2592
+F,0.54,0.42,0.14,0.805,0.369,0.1725,0.21,11,846
+I,0.28,0.2,0.075,0.1225,0.0545,0.0115,0.035,5,2153
+M,0.575,0.47,0.185,0.985,0.3745,0.2175,0.355,10,1636
+M,0.72,0.6,0.235,2.2385,0.984,0.411,0.621,12,3993
+M,0.655,0.53,0.195,1.388,0.567,0.2735,0.41,13,467
diff --git a/model_docker_images/tests/example_model_script.py b/model_docker_images/tests/example_model_script.py
new file mode 100644
index 000000000..bb736ac7c
--- /dev/null
+++ b/model_docker_images/tests/example_model_script.py
@@ -0,0 +1,379 @@
+# Template Placeholders
+TEMPLATE_PARAMS = {
+    "model_type": "regressor",
+    "target_column": "class_number_of_rings",
+    "feature_list": ['length', 'diameter', 'height', 'whole_weight', 'shucked_weight', 'viscera_weight', 'shell_weight', 'auto_id'],
+    "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/training/abalone-regression",
+    "train_all_data": False
+}
+
+# Imports for XGB Model
+import xgboost as xgb
+import awswrangler as wr
+
+# Model Performance Scores
+from sklearn.metrics import (
+    mean_absolute_error,
+    r2_score,
+    root_mean_squared_error,
+    precision_recall_fscore_support,
+    confusion_matrix,
+)
+
+# Classification Encoder
+from sklearn.preprocessing import LabelEncoder
+
+# Scikit Learn Imports
+from sklearn.model_selection import train_test_split
+
+from io import StringIO
+import json
+import argparse
+import joblib
+import os
+import pandas as pd
+from typing import List
+
+
+# Function to check if dataframe is empty
+def check_dataframe(df: pd.DataFrame, df_name: str) -> None:
+    """Guard against empty training data: print and raise when `df.empty` is true.
+
+    Args:
+        df (pd.DataFrame): Frame to validate
+        df_name (str): Label for the frame, interpolated into the error message
+    """
+    if not df.empty:
+        return
+    error_text = f"*** The training data {df_name} has 0 rows! ***STOPPING***"
+    print(error_text)
+    raise ValueError(error_text)
+
+
+def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame:
+    """
+    Replace the list-valued "pred_proba" column with one "<label>_proba" column per class.
+
+    Args:
+        df (pd.DataFrame): DataFrame holding a "pred_proba" column (one list of probabilities per row)
+        class_labels (List[str]): Class labels, assigned positionally to the entries of each list
+
+    Returns:
+        pd.DataFrame: The remaining columns (index reset) followed by the per-class probability columns
+
+    Raises:
+        ValueError: If the DataFrame has no "pred_proba" column
+    """
+
+    # Sanity check: the column we are about to expand must be present
+    if "pred_proba" not in df.columns:
+        raise ValueError('DataFrame does not contain a "pred_proba" column')
+
+    # One list of probabilities per row -> one '<label>_proba' column per class
+    per_class_columns = [f"{label}_proba" for label in class_labels]
+    probabilities = df["pred_proba"].tolist()
+    expanded = pd.DataFrame(probabilities, columns=per_class_columns)
+
+    # Both frames now carry a fresh 0..n-1 index, so the column-wise concat lines up row by row
+    remaining = df.drop(columns=["pred_proba"]).reset_index(drop=True)
+    combined = pd.concat([remaining, expanded], axis=1)
+
+    # Echo the expanded frame to stdout
+    print(combined)
+    return combined
+
+
+def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame:
+    """
+    Rename DataFrame columns so they line up with the model's feature names.
+    A feature that already appears verbatim among the columns is left alone; otherwise the
+    first column whose lowercase form equals the feature's lowercase form is renamed to it.
+
+    Args:
+        df (pd.DataFrame): The DataFrame with the original columns.
+        model_features (list): The desired list of feature names (mixed case allowed).
+
+    Returns:
+        pd.DataFrame: Result of `df.rename` with the matched columns renamed; features with
+            no match are skipped, so they remain absent from the result.
+    """
+    original_columns = set(df.columns)
+
+    # Lowercase name -> first column carrying it (setdefault keeps the earliest on collisions)
+    first_by_lower = {}
+    for column in df.columns:
+        first_by_lower.setdefault(column.lower(), column)
+
+    # Map each source column to the feature name it should take
+    renames = {}
+    for feature in model_features:
+        if feature in original_columns:
+            # Exact hit: identity entry, nothing actually changes
+            renames[feature] = feature
+            continue
+
+        folded = feature.lower()
+        if folded in first_by_lower:
+            # Fall back to the case-insensitive match
+            renames[first_by_lower[folded]] = feature
+
+    return df.rename(columns=renames)
+
+
+if __name__ == "__main__":
+    """The main function is for training the XGBoost model"""
+
+    # Unpack the template parameters that drive this run
+    target = TEMPLATE_PARAMS["target_column"]
+    feature_list = TEMPLATE_PARAMS["feature_list"]
+    model_type = TEMPLATE_PARAMS["model_type"]
+    model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"]
+    train_all_data = TEMPLATE_PARAMS["train_all_data"]
+    validation_split = 0.2  # test_size for the random-split fallback below
+
+    # Sagemaker specific arguments. Defaults come from SM_* environment variables (KeyError if one is unset).
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]
+    )
+    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
+    parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
+    args = parser.parse_args()
+
+    # Collect every .csv file found directly inside the train channel directory
+    training_files = [
+        os.path.join(args.train, file)
+        for file in os.listdir(args.train)
+        if file.endswith(".csv")
+    ]
+    print(f"Training Files: {training_files}")
+
+    # Read each file and stack them into a single pandas dataframe (each file's own row index is kept)
+    all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files])
+
+    # Check if the dataframe is empty
+    check_dataframe(all_df, "training_df")
+
+    # Features/Target output
+    print(f"Target: {target}")
+    print(f"Features: {str(feature_list)}")
+
+    # Do we want to train on all the data? (validation then scores the very same rows)
+    if train_all_data:
+        print("Training on ALL of the data")
+        df_train = all_df.copy()
+        df_val = all_df.copy()
+
+    # Does the dataframe have a training column? (it is used directly as a boolean row mask)
+    elif "training" in all_df.columns:
+        print("Found training column, splitting data based on training column")
+        df_train = all_df[all_df["training"]].copy()
+        df_val = all_df[~all_df["training"]].copy()
+    else:
+        # Just do a random training Split
+        print("WARNING: No training column found, splitting data with random state=42")
+        df_train, df_val = train_test_split(
+            all_df, test_size=validation_split, random_state=42
+        )
+    print(f"FIT/TRAIN: {df_train.shape}")
+    print(f"VALIDATION: {df_val.shape}")
+
+    # Now spin up our XGB Model
+    if model_type == "classifier":
+        xgb_model = xgb.XGBClassifier()
+
+        # Encode the target column (encoder is fit on the training rows, then applied to validation)
+        label_encoder = LabelEncoder()
+        df_train[target] = label_encoder.fit_transform(df_train[target])
+        df_val[target] = label_encoder.transform(df_val[target])
+
+    else:
+        xgb_model = xgb.XGBRegressor()
+        label_encoder = None  # We don't need this for regression
+
+    # Grab our Features, Target and Train the Model
+    y = df_train[target]
+    X = df_train[feature_list]
+    xgb_model.fit(X, y)
+
+    # Make Predictions on the Validation Set
+    print(f"Making Predictions on Validation Set...")
+    preds = xgb_model.predict(df_val[feature_list])
+    if model_type == "classifier":
+        # Also get the probabilities for each class
+        print("Processing Probabilities...")
+        probs = xgb_model.predict_proba(df_val[feature_list])
+        df_val["pred_proba"] = [p.tolist() for p in probs]
+
+        # Expand the pred_proba column into separate columns for each class
+        print(df_val.columns)
+        df_val = expand_proba_column(df_val, label_encoder.classes_)
+        print(df_val.columns)
+
+        # Decode the target and prediction labels
+        df_val[target] = label_encoder.inverse_transform(df_val[target])
+        preds = label_encoder.inverse_transform(preds)
+
+    # Save predictions to S3 (just the target, prediction, and '_proba' columns)
+    # Note: Skipping this for our test script
+    """
+    df_val["prediction"] = preds
+    output_columns = [target, "prediction"]
+    output_columns += [col for col in df_val.columns if col.endswith("_proba")]
+    wr.s3.to_csv(
+        df_val[output_columns],
+        path=f"{model_metrics_s3_path}/validation_predictions.csv",
+        index=False,
+    )
+    """
+
+    # Report Performance Metrics
+    if model_type == "classifier":
+        # Get the label names and their integer mapping
+        label_names = label_encoder.classes_
+
+        # Calculate various model performance metrics
+        scores = precision_recall_fscore_support(
+            df_val[target], preds, average=None, labels=label_names
+        )
+
+        # Put the scores into a dataframe
+        score_df = pd.DataFrame(
+            {
+                target: label_names,
+                "precision": scores[0],
+                "recall": scores[1],
+                "fscore": scores[2],
+                "support": scores[3],
+            }
+        )
+
+        # Print one "Metrics:<label>:<metric> <value>" line per label and metric
+        metrics = ["precision", "recall", "fscore", "support"]
+        for t in label_names:
+            for m in metrics:
+                value = score_df.loc[score_df[target] == t, m].iloc[0]
+                print(f"Metrics:{t}:{m} {value}")
+
+        # Compute the confusion matrix and print one "ConfusionMatrix:<row>:<col> <value>" line per cell
+        conf_mtx = confusion_matrix(df_val[target], preds, labels=label_names)
+        for i, row_name in enumerate(label_names):
+            for j, col_name in enumerate(label_names):
+                value = conf_mtx[i, j]
+                print(f"ConfusionMatrix:{row_name}:{col_name} {value}")
+
+    else:
+        # Calculate various model performance metrics (regression)
+        rmse = root_mean_squared_error(df_val[target], preds)
+        mae = mean_absolute_error(df_val[target], preds)
+        r2 = r2_score(df_val[target], preds)
+        print(f"RMSE: {rmse:.3f}")
+        print(f"MAE: {mae:.3f}")
+        print(f"R2: {r2:.3f}")
+        print(f"NumRows: {len(df_val)}")
+
+    # Now save the model to the standard place/name
+    xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json"))
+    if label_encoder:
+        joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib"))
+
+    # Also save the features (this will validate input during predictions)
+    with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp:
+        json.dump(feature_list, fp)
+
+
+def model_fn(model_dir):
+    """Deserialize and return the fitted model (XGBClassifier or XGBRegressor) saved as model_dir/xgb_model.json."""
+    # The saved JSON records the estimator type; absent metadata falls through to the ValueError below
+    model_path = os.path.join(model_dir, "xgb_model.json")
+    with open(model_path, "r") as f:
+        model_json = json.load(f)
+    sklearn_attrs = ((model_json.get("learner") or {}).get("attributes") or {}).get("scikit_learn") or "{}"
+    saved_model_type = json.loads(sklearn_attrs).get("_estimator_type")
+    if saved_model_type == "classifier":
+        model = xgb.XGBClassifier()
+    elif saved_model_type == "regressor":
+        model = xgb.XGBRegressor()
+    else:
+        msg = f"Model type ({saved_model_type}) not recognized. Expected 'classifier' or 'regressor'"
+        raise ValueError(msg)
+
+    model.load_model(model_path)
+    return model
+
+
+def input_fn(input_data, content_type):
+    """Turn a raw CSV or JSON request body (str or UTF-8 bytes) into a DataFrame."""
+    if not input_data:
+        raise ValueError("Empty input data is not supported!")
+
+    # Bytes payloads are decoded as UTF-8; anything else is used as given
+    payload = input_data.decode("utf-8") if isinstance(input_data, bytes) else input_data
+
+    # Substring checks, so a parameterised content type still matches; CSV wins if both appear
+    wants_csv = "text/csv" in content_type
+    if not wants_csv and "application/json" not in content_type:
+        raise ValueError(f"{content_type} not supported!")
+    if wants_csv:
+        return pd.read_csv(StringIO(payload))
+    return pd.DataFrame(json.loads(payload))  # Assumes JSON array of records
+
+
+def output_fn(output_df, accept_type):
+    """Serialize the prediction DataFrame as CSV or JSON and return (body, mime type)."""
+    if "text/csv" in accept_type:
+        # Missing values are written out as the literal N/A
+        return output_df.fillna("N/A").to_csv(index=False), "text/csv"
+    if "application/json" in accept_type:
+        # orient="records" emits a JSON array of row objects (NaNs -> null)
+        return output_df.to_json(orient="records"), "application/json"
+    raise RuntimeError(f"{accept_type} accept type is not supported by this script.")
+
+
+def predict_fn(df, model) -> pd.DataFrame:
+    """Make Predictions with our XGB Model
+
+    Args:
+        df (pd.DataFrame): The input DataFrame (a "prediction" column is written onto it in place)
+        model: The model used for predictions
+
+    Returns:
+        pd.DataFrame: The DataFrame with the predictions (and per-class "_proba" columns, if any) added
+    """
+
+    # Grab our feature columns (from training); SM_MODEL_DIR must be set in the environment
+    model_dir = os.environ["SM_MODEL_DIR"]
+    with open(os.path.join(model_dir, "feature_columns.json")) as fp:
+        model_features = json.load(fp)
+    print(f"Model Features: {model_features}")
+
+    # Load our Label Encoder if we have one
+    label_encoder = None
+    if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")):
+        label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))
+
+    # We're going to match features in a case-insensitive manner, accounting for all the permutations
+    # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos")
+    # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos")
+    matched_df = match_features_case_insensitive(df, model_features)
+
+    # Predict the features against our XGB Model
+    predictions = model.predict(matched_df[model_features])
+
+    # If we have a label encoder, decode the predictions
+    if label_encoder is not None:
+        predictions = label_encoder.inverse_transform(predictions)
+
+    # Set the predictions on the DataFrame
+    df["prediction"] = predictions
+
+    # Does our model have a 'predict_proba' method? If so we will call it and add the results to the DataFrame
+    if getattr(model, "predict_proba", None):
+        probs = model.predict_proba(matched_df[model_features])
+        df["pred_proba"] = [p.tolist() for p in probs]
+
+        # Expand pred_proba into one column per class; with no saved encoder, label columns 0..n-1
+        class_labels = label_encoder.classes_ if label_encoder is not None else range(probs.shape[1])
+        df = expand_proba_column(df, class_labels)
+
+    return df
diff --git a/model_docker_images/tests/run_tests.sh b/model_docker_images/tests/run_tests.sh
new file mode 100644
index 000000000..65cbd1514
--- /dev/null
+++ b/model_docker_images/tests/run_tests.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+set -e
+
+# Get the directory of this script
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"
+# Get the project root directory
+PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
+
+# Configuration
+TRAINING_IMAGE="aws_model_training"
+INFERENCE_IMAGE="aws_model_inference"
+IMAGE_VERSION=${1:-"0.1"}
+
+# Colors for output
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# Test training container
+echo "======================================"
+echo "🧪 Testing training container"
+echo "======================================"
+python "$SCRIPT_DIR/test_training.py" --image "${TRAINING_IMAGE}:${IMAGE_VERSION}"
+
+# Test inference container
+echo "======================================"
+echo "🧪 Testing inference container"
+echo "======================================"
+
+# Start the inference container in the background; the EXIT trap removes it even when a test fails under 'set -e'
+echo "Starting inference container..."
+CONTAINER_ID=$(docker run -d -p 8080:8080 "${INFERENCE_IMAGE}:${IMAGE_VERSION}")
+trap 'docker rm -f "$CONTAINER_ID" > /dev/null 2>&1 || true' EXIT
+# Wait for the container to initialize
+echo "Waiting for server to initialize (5 seconds)..."
+sleep 5
+
+# Run the test
+python "$SCRIPT_DIR/test_inference.py"
+
+# Stop and remove the container (the trap above is then a harmless no-op)
+echo "Stopping inference container..."
+docker stop "$CONTAINER_ID"
+docker rm "$CONTAINER_ID"
+
+echo "======================================"
+echo -e "${GREEN}✅ Testing completed!${NC}"
+echo "======================================"
\ No newline at end of file
diff --git a/model_docker_images/scripts/test_inference.py b/model_docker_images/tests/test_inference.py
similarity index 100%
rename from model_docker_images/scripts/test_inference.py
rename to model_docker_images/tests/test_inference.py
diff --git a/model_docker_images/tests/test_training.py b/model_docker_images/tests/test_training.py
new file mode 100644
index 000000000..f451439d7
--- /dev/null
+++ b/model_docker_images/tests/test_training.py
@@ -0,0 +1,167 @@
+import os
+import json
+import shutil
+import argparse
+import subprocess
+import tempfile
+import time
+from pathlib import Path
+
+
+def setup_sagemaker_directories():
+    """Create a temporary directory structure that mimics SageMaker's layout."""
+    base_dir = tempfile.mkdtemp(prefix="sagemaker-test-")
+
+    # Create the SageMaker directory structure
+    os.makedirs(f"{base_dir}/input/data/train", exist_ok=True)
+    os.makedirs(f"{base_dir}/input/config", exist_ok=True)
+    os.makedirs(f"{base_dir}/model", exist_ok=True)
+    os.makedirs(f"{base_dir}/output/data", exist_ok=True)
+    os.makedirs(f"{base_dir}/code", exist_ok=True)
+
+    return base_dir
+
+
+def copy_sample_data(base_dir, data_file):
+    """Copy sample data to the training directory."""
+    if not os.path.exists(data_file):
+        raise FileNotFoundError(f"Sample data file not found: {data_file}")
+
+    shutil.copy2(data_file, f"{base_dir}/input/data/train/")
+    print(f"Copied sample data: {data_file} to {base_dir}/input/data/train/")
+
+
+def copy_model_script(base_dir, script_file):
+    """Copy the model script to the code directory."""
+    if not os.path.exists(script_file):
+        raise FileNotFoundError(f"Model script not found: {script_file}")
+
+    shutil.copy2(script_file, f"{base_dir}/code/")
+    print(f"Copied model script: {script_file} to {base_dir}/code/")
+
+    return os.path.basename(script_file)
+
+
+def create_hyperparameters(base_dir, script_name, hyperparams=None):
+    """Create a hyperparameters.json file with SageMaker-specific entries."""
+    if hyperparams is None:
+        hyperparams = {}
+
+    # Add required SageMaker hyperparameters
+    hyperparams["sagemaker_program"] = script_name
+    hyperparams["sagemaker_submit_directory"] = "/opt/ml/code"
+
+    # Write the hyperparameters to a JSON file
+    with open(f"{base_dir}/input/config/hyperparameters.json", "w") as f:
+        json.dump(hyperparams, f)
+
+    print(f"Created hyperparameters.json with script: {script_name}")
+
+
+def run_training_container(base_dir, image_name, script_name):
+    """Run the training container with the proper volume mounts and environment variables."""
+    # Build the Docker command
+    cmd = [
+        "docker", "run", "--rm",
+        "-v", f"{base_dir}/input:/opt/ml/input",
+        "-v", f"{base_dir}/model:/opt/ml/model",
+        "-v", f"{base_dir}/output:/opt/ml/output",
+        "-v", f"{base_dir}/code:/opt/ml/code",
+        "-e", f"SAGEMAKER_PROGRAM={script_name}",
+        "-e", "SM_MODEL_DIR=/opt/ml/model",
+        "-e", "SM_OUTPUT_DATA_DIR=/opt/ml/output/data",
+        "-e", "SM_CHANNEL_TRAIN=/opt/ml/input/data/train",
+        image_name
+    ]
+
+    print(f"Running training container with command: {' '.join(cmd)}")
+
+    start_time = time.time()
+    try:
+        subprocess.run(cmd, check=True)
+        end_time = time.time()
+        print(f"Training completed in {end_time - start_time:.2f} seconds")
+        return True
+    except subprocess.CalledProcessError as e:
+        print(f"Error running training container: {e}")
+        return False
+
+
+def check_training_output(base_dir):
+    """Check if the training produced the expected output files."""
+    model_dir = f"{base_dir}/model"
+    output_dir = f"{base_dir}/output"
+
+    # Check if model files were created
+    model_files = os.listdir(model_dir)
+    print(f"Files in model directory: {model_files}")
+
+    # Check for xgb_model.json which should be created by our example script
+    if "xgb_model.json" in model_files and "feature_columns.json" in model_files:
+        print("✅ Training successful! Model files were created.")
+        return True
+    else:
+        print("❌ Training failed! Expected model files were not created.")
+        return False
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Test SageMaker training container")
+    parser.add_argument("--image", type=str, required=True, help="Training image name:tag")
+    parser.add_argument("--script", type=str, default="example_model_script.py",
+                        help="Path to the model script to test")
+    parser.add_argument("--data", type=str, default="tests/data/abalone_sm.csv",
+                        help="Path to sample data file")
+    args = parser.parse_args()
+
+    # Resolve paths relative to script location
+    script_dir = Path(__file__).parent.absolute()
+    project_root = script_dir.parent
+
+    if not os.path.isabs(args.script):
+        args.script = os.path.join(script_dir, args.script)
+
+    if not os.path.isabs(args.data):
+        args.data = os.path.join(project_root, args.data)
+
+    try:
+        # Setup the SageMaker-like directory structure
+        base_dir = setup_sagemaker_directories()
+        print(f"Created SageMaker test environment at: {base_dir}")
+
+        # Copy the sample data
+        copy_sample_data(base_dir, args.data)
+
+        # Copy the model script and get its basename
+        script_name = copy_model_script(base_dir, args.script)
+
+        # Create hyperparameters.json
+        # You could add more hyperparameters here specific to your model
+        hyperparams = {
+            "model_type": "regressor",
+            "target_column": "rings",
+            "feature_list": '["length", "diameter", "height", "whole_weight", "shucked_weight", "viscera_weight", "shell_weight"]',
+            "train_all_data": "False"
+        }
+        create_hyperparameters(base_dir, script_name, hyperparams)
+
+        # Run the training container
+        success = run_training_container(base_dir, args.image, script_name)
+
+        if success:
+            # Check if training produced expected output
+            check_training_output(base_dir)
+
+        # Cleanup
+        print(f"Temporary files are in: {base_dir}")
+        print("Not removing temporary files for debugging purposes.")
+        # If you want to auto-cleanup, uncomment the following line:
+        # shutil.rmtree(base_dir)
+
+    except Exception as e:
+        print(f"Error during test: {e}")
+        raise
+
+
+if __name__ == "__main__":
+    main()
diff --git a/model_docker_images/training/Dockerfile b/model_docker_images/training/Dockerfile
index 74f6caf44..02f1c96cd 100644
--- a/model_docker_images/training/Dockerfile
+++ b/model_docker_images/training/Dockerfile
@@ -1,5 +1,8 @@
 FROM python:3.12-slim
 
+# Install Vim
+RUN apt-get update && apt-get install -y vim
+
 # Copy requirements file
 COPY requirements.txt /tmp/
 
diff --git a/model_docker_images/training/sagemaker_entrypoint.py b/model_docker_images/training/sagemaker_entrypoint.py
index 50f3acc4f..671e32319 100644
--- a/model_docker_images/training/sagemaker_entrypoint.py
+++ b/model_docker_images/training/sagemaker_entrypoint.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 import os
 import sys
 import json
@@ -99,11 +100,20 @@ def main():
 
     # Get source directory from hyperparameters
     if 'sagemaker_submit_directory' in hyperparameters:
-        s3_source = hyperparameters['sagemaker_submit_directory'].strip('"\'')
-        logger.info(f"Downloading source from: {s3_source}")
-
-        # Download and extract source code
-        submit_dir = download_and_extract_s3(s3_source)
+        submit_dir_value = hyperparameters['sagemaker_submit_directory'].strip('"\'')
+        logger.info(f"Source directory: {submit_dir_value}")
+
+        # Check if it's an S3 URI or a local path
+        if submit_dir_value.startswith('s3://'):
+            logger.info(f"Downloading source from S3: {submit_dir_value}")
+            submit_dir = download_and_extract_s3(submit_dir_value)
+        else:
+            logger.info(f"Using local source directory: {submit_dir_value}")
+            submit_dir = submit_dir_value
+            # Verify the directory exists
+            if not os.path.exists(submit_dir):
+                logger.error(f"Local directory not found: {submit_dir}")
+                sys.exit(1)
 
         # Install requirements
         install_requirements(os.path.join(submit_dir, "requirements.txt"))
diff --git a/model_docker_images/training/test_container.sh b/model_docker_images/training/test_container.sh
deleted file mode 100755
index cdc1382b4..000000000
--- a/model_docker_images/training/test_container.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-set -e
-
-# Determine script and project directories
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"
-PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
-SCRIPTS_DIR="$PROJECT_ROOT/scripts"
-
-# Default image name with latest tag
-DEFAULT_IMAGE="aws_model_training:0.1"
-IMAGE_NAME=${1:-$DEFAULT_IMAGE}
-
-echo "📋 Training Container Test Script"
-echo "======================================"
-
-# Make sure test_training.py exists
-if [ ! -f "$SCRIPTS_DIR/test_training.py" ]; then
-  echo "❌ Error: test_training.py not found in $SCRIPTS_DIR"
-  exit 1
-fi
-
-echo "🚀 Testing Training Container: $IMAGE_NAME"
-python "$SCRIPTS_DIR/test_training.py" --image "$IMAGE_NAME"
-
-echo "======================================"
\ No newline at end of file
diff --git a/model_docker_images/training/train.py b/model_docker_images/training/train.py
deleted file mode 100644
index d88c2482c..000000000
--- a/model_docker_images/training/train.py
+++ /dev/null
@@ -1,153 +0,0 @@
-import os
-import json
-import sys
-import traceback
-import pandas as pd
-from sklearn.model_selection import train_test_split
-import xgboost as xgb
-import joblib
-
-# SageMaker paths
-prefix = '/opt/ml/'
-input_path = prefix + 'input/data'
-model_path = os.path.join(prefix, 'model')
-param_path = os.path.join(prefix, 'input/config/hyperparameters.json')
-output_path = os.path.join(prefix, 'output')
-
-# Channel names for training and validation data
-training_channel_name = 'train'
-eval_channel_name = 'validation'
-
-
-# Load hyperparameters
-def load_hyperparameters():
-    with open(param_path, 'r') as tc:
-        hyperparameters = json.load(tc)
-
-    # Convert hyperparameters from strings to appropriate types
-    processed_params = {}
-    for key, value in hyperparameters.items():
-        # Try to convert to int, float, or bool as appropriate
-        try:
-            # Convert to int if it looks like an int
-            if value.isdigit() or (value.startswith('-') and value[1:].isdigit()):
-                processed_params[key] = int(value)
-            # Convert to float if it has a decimal point
-            elif '.' in value:
-                try:
-                    processed_params[key] = float(value)
-                except ValueError:
-                    processed_params[key] = value
-            # Handle boolean values
-            elif value.lower() in ['true', 'false']:
-                processed_params[key] = value.lower() == 'true'
-            else:
-                processed_params[key] = value
-        except (AttributeError, ValueError):
-            # If conversion fails, keep as string
-            processed_params[key] = value
-
-    return processed_params
-
-
-# Load training data
-def load_data():
-    train_path = os.path.join(input_path, training_channel_name)
-
-    # Get all CSV files in training directory
-    train_files = [os.path.join(train_path, file) for file in os.listdir(train_path)
-                   if file.endswith('.csv')]
-
-    if not train_files:
-        raise ValueError(f"No CSV files found in {train_path}")
-
-    # Read and concatenate all training files
-    dfs = []
-    for file in train_files:
-        df = pd.read_csv(file)
-        dfs.append(df)
-
-    if not dfs:
-        raise ValueError("No valid data found in training files")
-
-    return pd.concat(dfs, ignore_index=True)
-
-
-# Train the model
-def train():
-    print("Starting the training process")
-
-    try:
-        # Load hyperparameters
-        hyperparameters = load_hyperparameters()
-        print(f"Loaded hyperparameters: {hyperparameters}")
-
-        # Load training data
-        train_data = load_data()
-        print(f"Loaded training data with shape: {train_data.shape}")
-
-        # Extract features and target
-        # Assumes last column is the target
-        X = train_data.iloc[:, :-1]
-        y = train_data.iloc[:, -1]
-
-        # Train/test split
-        X_train, X_val, y_train, y_val = train_test_split(
-            X, y, test_size=0.2, random_state=42
-        )
-
-        # Configure model parameters from hyperparameters or use defaults
-        max_depth = hyperparameters.get('max_depth', 6)
-        learning_rate = hyperparameters.get('learning_rate', 0.1)
-        n_estimators = hyperparameters.get('n_estimators', 100)
-
-        # Create and train model with a simpler approach
-        # Removed early stopping and eval_set to ensure compatibility
-        model = xgb.XGBRegressor(
-            max_depth=max_depth,
-            learning_rate=learning_rate,
-            n_estimators=n_estimators
-        )
-
-        print("Training model...")
-        model.fit(X_train, y_train)
-
-        # Evaluate on validation set
-        val_score = model.score(X_val, y_val)
-        print(f"Validation R² score: {val_score:.4f}")
-
-        # Save the model
-        os.makedirs(model_path, exist_ok=True)
-        model_file = os.path.join(model_path, 'model.joblib')
-
-        # Save additional metadata about the model
-        feature_names = X.columns.tolist()
-        model_metadata = {
-            'feature_names': feature_names,
-            'hyperparameters': hyperparameters,
-            'validation_score': val_score
-        }
-        metadata_file = os.path.join(model_path, 'metadata.json')
-
-        print(f"Saving model to {model_file}")
-        joblib.dump(model, model_file)
-
-        print(f"Saving metadata to {metadata_file}")
-        with open(metadata_file, 'w') as f:
-            json.dump(model_metadata, f)
-
-        print("Training completed successfully")
-
-    except Exception as e:
-        # Write out an error file
-        trc = traceback.format_exc()
-        with open(os.path.join(output_path, 'failure'), 'w') as s:
-            s.write('Exception during training: ' + str(e) + '\n' + trc)
-        # Printing this causes the exception to be in the training job logs
-        print('Exception during training: ' + str(e) + '\n' + trc, file=sys.stderr)
-        # A non-zero exit code causes the training job to be marked as Failed
-        sys.exit(255)
-
-
-if __name__ == '__main__':
-    train()
\ No newline at end of file

From 8874c4348a4eccc211e2ac46ccb1fd98c70a92fe Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sat, 1 Mar 2025 12:17:18 -0700
Subject: [PATCH 05/35] making a mock_estimator class for testing the training
 image

---
 model_docker_images/tests/test_training.py | 353 +++++++++++++--------
 1 file changed, 217 insertions(+), 136 deletions(-)

diff --git a/model_docker_images/tests/test_training.py b/model_docker_images/tests/test_training.py
index f451439d7..baa7783d3 100644
--- a/model_docker_images/tests/test_training.py
+++ b/model_docker_images/tests/test_training.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 import os
 import json
 import shutil
@@ -8,160 +9,240 @@
 from pathlib import Path
 
 
-def setup_sagemaker_directories():
-    """Create a temporary directory structure that mimics SageMaker's layout."""
-    base_dir = tempfile.mkdtemp(prefix="sagemaker-test-")
-
-    # Create the SageMaker directory structure
-    os.makedirs(f"{base_dir}/input/data/train", exist_ok=True)
-    os.makedirs(f"{base_dir}/input/config", exist_ok=True)
-    os.makedirs(f"{base_dir}/model", exist_ok=True)
-    os.makedirs(f"{base_dir}/output/data", exist_ok=True)
-    os.makedirs(f"{base_dir}/code", exist_ok=True)
-
-    return base_dir
-
-
-def copy_sample_data(base_dir, data_file):
-    """Copy sample data to the training directory."""
-    if not os.path.exists(data_file):
-        raise FileNotFoundError(f"Sample data file not found: {data_file}")
-
-    shutil.copy2(data_file, f"{base_dir}/input/data/train/")
-    print(f"Copied sample data: {data_file} to {base_dir}/input/data/train/")
-
-
-def copy_model_script(base_dir, script_file):
-    """Copy the model script to the code directory."""
-    if not os.path.exists(script_file):
-        raise FileNotFoundError(f"Model script not found: {script_file}")
-
-    shutil.copy2(script_file, f"{base_dir}/code/")
-    print(f"Copied model script: {script_file} to {base_dir}/code/")
-
-    return os.path.basename(script_file)
-
-
-def create_hyperparameters(base_dir, script_name, hyperparams=None):
-    """Create a hyperparameters.json file with SageMaker-specific entries."""
-    if hyperparams is None:
-        hyperparams = {}
-
-    # Add required SageMaker hyperparameters
-    hyperparams["sagemaker_program"] = script_name
-    hyperparams["sagemaker_submit_directory"] = "/opt/ml/code"
-
-    # Write the hyperparameters to a JSON file
-    with open(f"{base_dir}/input/config/hyperparameters.json", "w") as f:
-        json.dump(hyperparams, f)
-
-    print(f"Created hyperparameters.json with script: {script_name}")
-
-
-def run_training_container(base_dir, image_name, script_name):
-    """Run the training container with the proper volume mounts and environment variables."""
-    # Build the Docker command
-    cmd = [
-        "docker", "run", "--rm",
-        "-v", f"{base_dir}/input:/opt/ml/input",
-        "-v", f"{base_dir}/model:/opt/ml/model",
-        "-v", f"{base_dir}/output:/opt/ml/output",
-        "-v", f"{base_dir}/code:/opt/ml/code",
-        "-e", f"SAGEMAKER_PROGRAM={script_name}",
-        "-e", "SM_MODEL_DIR=/opt/ml/model",
-        "-e", "SM_OUTPUT_DATA_DIR=/opt/ml/output/data",
-        "-e", "SM_CHANNEL_TRAIN=/opt/ml/input/data/train",
-        image_name
-    ]
-
-    print(f"Running training container with command: {' '.join(cmd)}")
-
-    start_time = time.time()
-    try:
-        subprocess.run(cmd, check=True)
-        end_time = time.time()
-        print(f"Training completed in {end_time - start_time:.2f} seconds")
-        return True
-    except subprocess.CalledProcessError as e:
-        print(f"Error running training container: {e}")
-        return False
-
-
-def check_training_output(base_dir):
-    """Check if the training produced the expected output files."""
-    model_dir = f"{base_dir}/model"
-    output_dir = f"{base_dir}/output"
-
-    # Check if model files were created
-    model_files = os.listdir(model_dir)
-    print(f"Files in model directory: {model_files}")
-
-    # Check for xgb_model.json which should be created by our example script
-    if "xgb_model.json" in model_files and "feature_columns.json" in model_files:
-        print("✅ Training successful! Model files were created.")
-        return True
-    else:
-        print("❌ Training failed! Expected model files were not created.")
-        return False
+class MockEstimator:
+    """
+    Mock SageMaker Estimator class that simulates the behavior of sagemaker.estimator.Estimator
+    for local testing purposes.
+    """
+
+    def __init__(self,
+                 image_uri,
+                 entry_point=None,
+                 source_dir=None,
+                 hyperparameters=None,
+                 role=None,
+                 instance_type=None,
+                 **kwargs):
+        """
+        Initialize a MockEstimator with the same parameters as a real SageMaker Estimator.
+
+        Args:
+            image_uri (str): The Docker image URI to use for training
+            entry_point (str): The name of the training script
+            source_dir (str): Directory with the training script and any additional files
+            hyperparameters (dict): Hyperparameters for the training job
+            role (str): AWS IAM role (not used in mock)
+            instance_type (str): EC2 instance type (not used in mock)
+            **kwargs: Additional arguments
+        """
+        self.image_uri = image_uri
+        self.entry_point = entry_point
+        self.source_dir = source_dir
+        self.hyperparameters = hyperparameters or {}
+        self.role = role  # Not used in mock
+        self.instance_type = instance_type  # Not used in mock
+        self.kwargs = kwargs
+        self.temp_dir = None
+        self.model_data = None
+
+    def fit(self, inputs, job_name=None, wait=True, logs=True):
+        """
+        Train the model using the input data.
+
+        Args:
+            inputs (dict): Dictionary of input data channels
+            job_name (str): Name for the training job
+            wait (bool): Whether to wait for the job to complete
+            logs (bool): Whether to show the logs
+
+        Returns:
+            self: The estimator itself
+        """
+        print(f"Starting mock training job: {job_name or 'unnamed-job'}")
+
+        try:
+            # Create SageMaker directory structure
+            self.temp_dir = tempfile.mkdtemp(prefix="sagemaker-test-")
+            print(f"Created SageMaker test environment at: {self.temp_dir}")
+
+            # Create the SageMaker directory structure
+            os.makedirs(f"{self.temp_dir}/input/data/train", exist_ok=True)
+            os.makedirs(f"{self.temp_dir}/input/config", exist_ok=True)
+            os.makedirs(f"{self.temp_dir}/model", exist_ok=True)
+            os.makedirs(f"{self.temp_dir}/output/data", exist_ok=True)
+            os.makedirs(f"{self.temp_dir}/code", exist_ok=True)
+
+            # Process input channels and copy data
+            for channel_name, channel_data in inputs.items():
+                channel_dir = f"{self.temp_dir}/input/data/{channel_name}"
+                os.makedirs(channel_dir, exist_ok=True)
+
+                # Assuming channel_data is a local file path for this mock implementation
+                if os.path.isfile(channel_data):
+                    shutil.copy2(channel_data, channel_dir)
+                    print(f"Copied data file: {channel_data} to {channel_dir}")
+                elif os.path.isdir(channel_data):
+                    for file in os.listdir(channel_data):
+                        if file.endswith(".csv"):
+                            shutil.copy2(os.path.join(channel_data, file), channel_dir)
+                            print(f"Copied data file: {os.path.join(channel_data, file)} to {channel_dir}")
+
+            # Copy source files to code directory
+            if self.source_dir and os.path.exists(self.source_dir):
+                # Copy all Python files from source_dir
+                for file in os.listdir(self.source_dir):
+                    if file.endswith(".py"):
+                        shutil.copy2(os.path.join(self.source_dir, file), f"{self.temp_dir}/code")
+                        print(f"Copied source file: {os.path.join(self.source_dir, file)} to {self.temp_dir}/code")
+
+            # Prepare hyperparameters.json
+            # The key SageMaker parameters
+            sagemaker_params = {
+                "sagemaker_program": self.entry_point,
+                "sagemaker_submit_directory": "/opt/ml/code"  # Container path
+            }
+
+            # Combine with user hyperparameters
+            all_hyperparams = {**self.hyperparameters, **sagemaker_params}
+
+            # Write the hyperparameters to a JSON file
+            with open(f"{self.temp_dir}/input/config/hyperparameters.json", "w") as f:
+                json.dump(all_hyperparams, f)
+
+            print(f"Created hyperparameters.json with entry point: {self.entry_point}")
+
+            # Build the Docker command
+            cmd = [
+                "docker", "run", "--rm",
+                "-v", f"{self.temp_dir}/input:/opt/ml/input",
+                "-v", f"{self.temp_dir}/model:/opt/ml/model",
+                "-v", f"{self.temp_dir}/output:/opt/ml/output",
+                "-v", f"{self.temp_dir}/code:/opt/ml/code",
+                "-e", f"SAGEMAKER_PROGRAM={self.entry_point}",
+                "-e", "SM_MODEL_DIR=/opt/ml/model",
+                "-e", "SM_OUTPUT_DATA_DIR=/opt/ml/output/data",
+                "-e", "SM_CHANNEL_TRAIN=/opt/ml/input/data/train"
+            ]
+
+            # Add platform flag for Mac M1/M2/M3 users
+            if os.uname().machine == 'arm64':
+                cmd.insert(2, "--platform")
+                cmd.insert(3, "linux/amd64")
+
+            # Add the image URI
+            cmd.append(self.image_uri)
+
+            print(f"Running training container with command: {' '.join(cmd)}")
+
+            # Run the container
+            start_time = time.time()
+            try:
+                if logs:
+                    # Run with output visible
+                    subprocess.run(cmd, check=True)
+                else:
+                    # Run silently
+                    subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+                end_time = time.time()
+                print(f"Training completed in {end_time - start_time:.2f} seconds")
+
+                # Check the output
+                self._check_training_output()
+
+                # Set the model data path (like SageMaker would)
+                self.model_data = f"{self.temp_dir}/model"
+
+                return self
+
+            except subprocess.CalledProcessError as e:
+                print(f"Error running training container: {e}")
+                if e.stdout:
+                    print(f"STDOUT: {e.stdout.decode('utf-8')}")
+                if e.stderr:
+                    print(f"STDERR: {e.stderr.decode('utf-8')}")
+                raise
+
+        except Exception as e:
+            print(f"Error during fit: {e}")
+            raise
+
+    def _check_training_output(self):
+        """Check if the training produced output files in the model directory."""
+        model_dir = f"{self.temp_dir}/model"
+        model_files = os.listdir(model_dir)
+
+        if not model_files:
+            print("❌ Warning: No files found in model directory after training")
+        else:
+            print(f"✅ Found model files: {', '.join(model_files)}")
+
+    def cleanup(self):
+        """Remove temporary directories."""
+        if self.temp_dir and os.path.exists(self.temp_dir):
+            print(f"Cleaning up temporary directory: {self.temp_dir}")
+            shutil.rmtree(self.temp_dir)
+            self.temp_dir = None
 
 
 def main():
+    """Run the test using a MockEstimator."""
     parser = argparse.ArgumentParser(description="Test SageMaker training container")
-    parser.add_argument("--image", type=str, required=True, help="Training image name:tag")
-    parser.add_argument("--script", type=str, default="example_model_script.py",
-                        help="Path to the model script to test")
+    parser.add_argument("--image", type=str, default="aws_model_training:0.1", help="Training image name:tag")
+    parser.add_argument("--entry-point", type=str, default="example_model_script.py",
+                        help="Name of the training script")
+    parser.add_argument("--source-dir", type=str, default="tests/",
+                        help="Directory containing the training script")
     parser.add_argument("--data", type=str, default="tests/data/abalone_sm.csv",
-                        help="Path to sample data file")
+                        help="Path to training data file or directory")
+    parser.add_argument("--cleanup", action="store_true", help="Clean up temporary files after test")
     args = parser.parse_args()
 
-    # Resolve paths relative to script location
+    # Handle relative paths
     script_dir = Path(__file__).parent.absolute()
     project_root = script_dir.parent
 
-    if not os.path.isabs(args.script):
-        args.script = os.path.join(script_dir, args.script)
+    if not os.path.isabs(args.source_dir):
+        args.source_dir = os.path.join(project_root, args.source_dir)
 
     if not os.path.isabs(args.data):
         args.data = os.path.join(project_root, args.data)
 
+    print(f"Testing with:")
+    print(f"  Image: {args.image}")
+    print(f"  Entry point: {args.entry_point}")
+    print(f"  Source directory: {args.source_dir}")
+    print(f"  Training data: {args.data}")
+
+    # Create the estimator
+    estimator = MockEstimator(
+        image_uri=args.image,
+        entry_point=args.entry_point,
+        source_dir=args.source_dir,
+        # Common SageMaker instance type for training
+        instance_type="ml.m5.large"
+    )
+
     try:
-        # Setup the SageMaker-like directory structure
-        base_dir = setup_sagemaker_directories()
-        print(f"Created SageMaker test environment at: {base_dir}")
-
-        # Copy the sample data
-        copy_sample_data(base_dir, args.data)
-
-        # Copy the model script and get its basename
-        script_name = copy_model_script(base_dir, args.script)
-
-        # Create hyperparameters.json
-        # You could add more hyperparameters here specific to your model
-        hyperparams = {
-            "model_type": "regressor",
-            "target_column": "rings",
-            "feature_list": '["length", "diameter", "height", "whole_weight", "shucked_weight", "viscera_weight", "shell_weight"]',
-            "train_all_data": "False"
-        }
-        create_hyperparameters(base_dir, script_name, hyperparams)
-
-        # Run the training container
-        success = run_training_container(base_dir, args.image, script_name)
-
-        if success:
-            # Check if training produced expected output
-            check_training_output(base_dir)
-
-        # Cleanup
-        print(f"Temporary files are in: {base_dir}")
-        print("Not removing temporary files for debugging purposes.")
-        # If you want to auto-cleanup, uncomment the following line:
-        # shutil.rmtree(base_dir)
+        # Run training
+        estimator.fit(
+            inputs={"train": args.data},
+            job_name="mock-training-job"
+        )
+        print("📋 MockEstimator training completed successfully")
 
     except Exception as e:
-        print(f"Error during test: {e}")
+        print(f"❌ MockEstimator training failed: {e}")
         raise
 
+    finally:
+        # Clean up if requested
+        if args.cleanup:
+            estimator.cleanup()
+        else:
+            print(f"Temporary files are in: {estimator.temp_dir}")
+            print("Not removing temporary files for debugging purposes.")
+
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file

From 115d86a6c958df1441a7cdd110855ffc9ba26784 Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sat, 1 Mar 2025 12:21:36 -0700
Subject: [PATCH 06/35] test_training: always clean up, drop arm64 platform flag

---
 model_docker_images/tests/test_training.py | 25 ++++++----------------
 1 file changed, 7 insertions(+), 18 deletions(-)

diff --git a/model_docker_images/tests/test_training.py b/model_docker_images/tests/test_training.py
index baa7783d3..88d6ac3af 100644
--- a/model_docker_images/tests/test_training.py
+++ b/model_docker_images/tests/test_training.py
@@ -121,17 +121,10 @@ def fit(self, inputs, job_name=None, wait=True, logs=True):
                 "-e", f"SAGEMAKER_PROGRAM={self.entry_point}",
                 "-e", "SM_MODEL_DIR=/opt/ml/model",
                 "-e", "SM_OUTPUT_DATA_DIR=/opt/ml/output/data",
-                "-e", "SM_CHANNEL_TRAIN=/opt/ml/input/data/train"
+                "-e", "SM_CHANNEL_TRAIN=/opt/ml/input/data/train",
+                self.image_uri
             ]
 
-            # Add platform flag for Mac M1/M2/M3 users
-            if os.uname().machine == 'arm64':
-                cmd.insert(2, "--platform")
-                cmd.insert(3, "linux/amd64")
-
-            # Add the image URI
-            cmd.append(self.image_uri)
-
             print(f"Running training container with command: {' '.join(cmd)}")
 
             # Run the container
@@ -195,7 +188,7 @@ def main():
                         help="Directory containing the training script")
     parser.add_argument("--data", type=str, default="tests/data/abalone_sm.csv",
                         help="Path to training data file or directory")
-    parser.add_argument("--cleanup", action="store_true", help="Clean up temporary files after test")
+    # Removed cleanup argument since we always clean up
     args = parser.parse_args()
 
     # Handle relative paths
@@ -219,7 +212,6 @@ def main():
         image_uri=args.image,
         entry_point=args.entry_point,
         source_dir=args.source_dir,
-        # Common SageMaker instance type for training
         instance_type="ml.m5.large"
     )
 
@@ -236,13 +228,10 @@ def main():
         raise
 
     finally:
-        # Clean up if requested
-        if args.cleanup:
-            estimator.cleanup()
-        else:
-            print(f"Temporary files are in: {estimator.temp_dir}")
-            print("Not removing temporary files for debugging purposes.")
+        # Always clean up temporary files
+        estimator.cleanup()
+        print("Temporary files have been cleaned up.")
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()

From 72dfacf818b1c44a4b110cb0fead603344a4572c Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sat, 1 Mar 2025 12:22:34 -0700
Subject: [PATCH 07/35] cleanup and simplification of tests/test_training.py

---
 model_docker_images/tests/test_training.py | 199 +++++++--------------
 1 file changed, 60 insertions(+), 139 deletions(-)

diff --git a/model_docker_images/tests/test_training.py b/model_docker_images/tests/test_training.py
index 88d6ac3af..c7f3b8593 100644
--- a/model_docker_images/tests/test_training.py
+++ b/model_docker_images/tests/test_training.py
@@ -10,108 +10,60 @@
 
 
 class MockEstimator:
-    """
-    Mock SageMaker Estimator class that simulates the behavior of sagemaker.estimator.Estimator
-    for local testing purposes.
-    """
-
-    def __init__(self,
-                 image_uri,
-                 entry_point=None,
-                 source_dir=None,
-                 hyperparameters=None,
-                 role=None,
-                 instance_type=None,
-                 **kwargs):
-        """
-        Initialize a MockEstimator with the same parameters as a real SageMaker Estimator.
-
-        Args:
-            image_uri (str): The Docker image URI to use for training
-            entry_point (str): The name of the training script
-            source_dir (str): Directory with the training script and any additional files
-            hyperparameters (dict): Hyperparameters for the training job
-            role (str): AWS IAM role (not used in mock)
-            instance_type (str): EC2 instance type (not used in mock)
-            **kwargs: Additional arguments
-        """
+    """Mock SageMaker Estimator for local container testing"""
+
+    def __init__(self, image_uri, entry_point=None, source_dir=None, hyperparameters=None, **kwargs):
         self.image_uri = image_uri
         self.entry_point = entry_point
         self.source_dir = source_dir
         self.hyperparameters = hyperparameters or {}
-        self.role = role  # Not used in mock
-        self.instance_type = instance_type  # Not used in mock
-        self.kwargs = kwargs
         self.temp_dir = None
         self.model_data = None
 
-    def fit(self, inputs, job_name=None, wait=True, logs=True):
-        """
-        Train the model using the input data.
-
-        Args:
-            inputs (dict): Dictionary of input data channels
-            job_name (str): Name for the training job
-            wait (bool): Whether to wait for the job to complete
-            logs (bool): Whether to show the logs
-
-        Returns:
-            self: The estimator itself
-        """
+    def fit(self, inputs, job_name=None, logs=True):
+        """Train the model using the input data"""
         print(f"Starting mock training job: {job_name or 'unnamed-job'}")
 
         try:
-            # Create SageMaker directory structure
+            # Set up SageMaker directory structure
             self.temp_dir = tempfile.mkdtemp(prefix="sagemaker-test-")
-            print(f"Created SageMaker test environment at: {self.temp_dir}")
+            print(f"Created test environment at: {self.temp_dir}")
 
-            # Create the SageMaker directory structure
-            os.makedirs(f"{self.temp_dir}/input/data/train", exist_ok=True)
-            os.makedirs(f"{self.temp_dir}/input/config", exist_ok=True)
-            os.makedirs(f"{self.temp_dir}/model", exist_ok=True)
-            os.makedirs(f"{self.temp_dir}/output/data", exist_ok=True)
-            os.makedirs(f"{self.temp_dir}/code", exist_ok=True)
+            # Create directories
+            for path in ['input/data/train', 'input/config', 'model', 'output/data', 'code']:
+                os.makedirs(f"{self.temp_dir}/{path}", exist_ok=True)
 
-            # Process input channels and copy data
+            # Copy data files
             for channel_name, channel_data in inputs.items():
                 channel_dir = f"{self.temp_dir}/input/data/{channel_name}"
                 os.makedirs(channel_dir, exist_ok=True)
 
-                # Assuming channel_data is a local file path for this mock implementation
                 if os.path.isfile(channel_data):
                     shutil.copy2(channel_data, channel_dir)
-                    print(f"Copied data file: {channel_data} to {channel_dir}")
+                    print(f"Copied data: {os.path.basename(channel_data)} to {channel_name} channel")
                 elif os.path.isdir(channel_data):
                     for file in os.listdir(channel_data):
                         if file.endswith(".csv"):
                             shutil.copy2(os.path.join(channel_data, file), channel_dir)
-                            print(f"Copied data file: {os.path.join(channel_data, file)} to {channel_dir}")
 
             # Copy source files to code directory
             if self.source_dir and os.path.exists(self.source_dir):
-                # Copy all Python files from source_dir
                 for file in os.listdir(self.source_dir):
                     if file.endswith(".py"):
                         shutil.copy2(os.path.join(self.source_dir, file), f"{self.temp_dir}/code")
-                        print(f"Copied source file: {os.path.join(self.source_dir, file)} to {self.temp_dir}/code")
+                print(f"Copied source files to code directory")
 
-            # Prepare hyperparameters.json
-            # The key SageMaker parameters
-            sagemaker_params = {
+            # Create hyperparameters.json
+            all_hyperparams = {
+                **self.hyperparameters,
                 "sagemaker_program": self.entry_point,
-                "sagemaker_submit_directory": "/opt/ml/code"  # Container path
+                "sagemaker_submit_directory": "/opt/ml/code"
             }
 
-            # Combine with user hyperparameters
-            all_hyperparams = {**self.hyperparameters, **sagemaker_params}
-
-            # Write the hyperparameters to a JSON file
             with open(f"{self.temp_dir}/input/config/hyperparameters.json", "w") as f:
                 json.dump(all_hyperparams, f)
 
-            print(f"Created hyperparameters.json with entry point: {self.entry_point}")
-
-            # Build the Docker command
+            # Run the container
             cmd = [
                 "docker", "run", "--rm",
                 "-v", f"{self.temp_dir}/input:/opt/ml/input",
@@ -125,113 +77,82 @@ def fit(self, inputs, job_name=None, wait=True, logs=True):
                 self.image_uri
             ]
 
-            print(f"Running training container with command: {' '.join(cmd)}")
-
-            # Run the container
-            start_time = time.time()
-            try:
-                if logs:
-                    # Run with output visible
-                    subprocess.run(cmd, check=True)
-                else:
-                    # Run silently
-                    subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-
-                end_time = time.time()
-                print(f"Training completed in {end_time - start_time:.2f} seconds")
+            # Add platform flag for Mac M1/M2/M3 users
+            if os.uname().machine == 'arm64':
+                cmd.insert(2, "--platform")
+                cmd.insert(3, "linux/amd64")
 
-                # Check the output
-                self._check_training_output()
-
-                # Set the model data path (like SageMaker would)
-                self.model_data = f"{self.temp_dir}/model"
-
-                return self
-
-            except subprocess.CalledProcessError as e:
-                print(f"Error running training container: {e}")
-                if e.stdout:
-                    print(f"STDOUT: {e.stdout.decode('utf-8')}")
-                if e.stderr:
-                    print(f"STDERR: {e.stderr.decode('utf-8')}")
-                raise
+            print(f"Running training container...")
 
+            start_time = time.time()
+            result = subprocess.run(cmd, check=True, capture_output=not logs)
+            training_time = time.time() - start_time
+            print(f"Training completed in {training_time:.2f} seconds")
+
+            # Check output
+            model_files = os.listdir(f"{self.temp_dir}/model")
+            if model_files:
+                print(f"✅ Model created successfully with files: {', '.join(model_files)}")
+            else:
+                print("⚠️ No model files were created during training")
+
+            return self
+
+        except subprocess.CalledProcessError as e:
+            print(f"❌ Training failed with exit code {e.returncode}")
+            if e.stdout:
+                print(f"STDOUT: {e.stdout.decode('utf-8')}")
+            if e.stderr:
+                print(f"STDERR: {e.stderr.decode('utf-8')}")
+            raise
         except Exception as e:
-            print(f"Error during fit: {e}")
+            print(f"❌ Error during training: {e}")
             raise
 
-    def _check_training_output(self):
-        """Check if the training produced output files in the model directory."""
-        model_dir = f"{self.temp_dir}/model"
-        model_files = os.listdir(model_dir)
-
-        if not model_files:
-            print("❌ Warning: No files found in model directory after training")
-        else:
-            print(f"✅ Found model files: {', '.join(model_files)}")
-
     def cleanup(self):
-        """Remove temporary directories."""
+        """Remove temporary directories"""
         if self.temp_dir and os.path.exists(self.temp_dir):
-            print(f"Cleaning up temporary directory: {self.temp_dir}")
             shutil.rmtree(self.temp_dir)
             self.temp_dir = None
 
 
 def main():
-    """Run the test using a MockEstimator."""
+    """Run the test using a MockEstimator"""
     parser = argparse.ArgumentParser(description="Test SageMaker training container")
     parser.add_argument("--image", type=str, default="aws_model_training:0.1", help="Training image name:tag")
-    parser.add_argument("--entry-point", type=str, default="example_model_script.py",
-                        help="Name of the training script")
-    parser.add_argument("--source-dir", type=str, default="tests/",
-                        help="Directory containing the training script")
-    parser.add_argument("--data", type=str, default="tests/data/abalone_sm.csv",
-                        help="Path to training data file or directory")
-    # Removed cleanup argument since we always clean up
+    parser.add_argument("--entry-point", type=str, default="example_model_script.py", help="Training script name")
+    parser.add_argument("--source-dir", type=str, default="tests/", help="Directory containing training scripts")
+    parser.add_argument("--data", type=str, default="tests/data/abalone_sm.csv", help="Training data path")
     args = parser.parse_args()
 
-    # Handle relative paths
+    # Resolve relative paths
     script_dir = Path(__file__).parent.absolute()
     project_root = script_dir.parent
 
-    if not os.path.isabs(args.source_dir):
-        args.source_dir = os.path.join(project_root, args.source_dir)
-
-    if not os.path.isabs(args.data):
-        args.data = os.path.join(project_root, args.data)
+    source_dir = os.path.join(project_root, args.source_dir) if not os.path.isabs(args.source_dir) else args.source_dir
+    data_path = os.path.join(project_root, args.data) if not os.path.isabs(args.data) else args.data
 
-    print(f"Testing with:")
-    print(f"  Image: {args.image}")
-    print(f"  Entry point: {args.entry_point}")
-    print(f"  Source directory: {args.source_dir}")
-    print(f"  Training data: {args.data}")
+    print(f"Testing with image {args.image}, script {args.entry_point}")
 
-    # Create the estimator
+    # Create and run the estimator
     estimator = MockEstimator(
         image_uri=args.image,
         entry_point=args.entry_point,
-        source_dir=args.source_dir,
-        instance_type="ml.m5.large"
+        source_dir=source_dir
     )
 
     try:
-        # Run training
         estimator.fit(
-            inputs={"train": args.data},
+            inputs={"train": data_path},
             job_name="mock-training-job"
         )
-        print("📋 MockEstimator training completed successfully")
-
+        print("✅ Training completed successfully")
     except Exception as e:
-        print(f"❌ MockEstimator training failed: {e}")
+        print(f"❌ Training failed: {e}")
         raise
-
     finally:
-        # Always clean up temporary files
         estimator.cleanup()
-        print("Temporary files have been cleaned up.")
 
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file

From 99a6630ec12b9f4f2826db6e923571adfb8b0550 Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sat, 1 Mar 2025 12:25:48 -0700
Subject: [PATCH 08/35] cleanup and simplification of sagemaker_entrypoint.py

---
 .../training/sagemaker_entrypoint.py          | 100 +++++++-----------
 1 file changed, 36 insertions(+), 64 deletions(-)

diff --git a/model_docker_images/training/sagemaker_entrypoint.py b/model_docker_images/training/sagemaker_entrypoint.py
index 671e32319..807a82ee3 100644
--- a/model_docker_images/training/sagemaker_entrypoint.py
+++ b/model_docker_images/training/sagemaker_entrypoint.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 import os
 import sys
 import json
@@ -9,8 +9,10 @@
 from urllib.parse import urlparse
 
 # Set up logging
-logging.basicConfig(level=logging.INFO,
-                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
 logger = logging.getLogger('sagemaker-entry-point')
 
 
@@ -25,16 +27,15 @@ def download_and_extract_s3(s3_uri, target_dir="/opt/ml/code"):
     try:
         s3 = boto3.client("s3")
         s3.download_file(bucket, key, local_tar)
-        logger.info(f"Download successful, tar file size: {os.path.getsize(local_tar)} bytes")
+        logger.info(f"Download successful: {os.path.getsize(local_tar)} bytes")
 
         os.makedirs(target_dir, exist_ok=True)
         with tarfile.open(local_tar, "r:gz") as tar:
             tar.extractall(path=target_dir)
 
-        logger.info(f"Files in {target_dir} after extraction: {os.listdir(target_dir)}")
         return target_dir
     except Exception as e:
-        logger.error(f"Error downloading from S3: {str(e)}")
+        logger.error(f"Error downloading from S3: {e}")
         sys.exit(1)
 
 
@@ -46,16 +47,16 @@ def install_requirements(requirements_path):
             subprocess.check_call([
                 sys.executable, "-m", "pip", "install", "-r", requirements_path
             ])
-            logger.info("Requirements installation completed successfully.")
+            logger.info("Requirements installed successfully.")
         except subprocess.CalledProcessError as e:
-            logger.error(f"Error installing requirements: {str(e)}")
+            logger.error(f"Error installing requirements: {e}")
             sys.exit(1)
     else:
         logger.info(f"No requirements file found at {requirements_path}")
 
 
-def setup_sagemaker_environment():
-    """Set up SageMaker environment variables based on /opt/ml structure."""
+def setup_environment():
+    """Set up SageMaker environment variables."""
     env_vars = {
         "SM_MODEL_DIR": "/opt/ml/model",
         "SM_OUTPUT_DATA_DIR": "/opt/ml/output/data",
@@ -65,92 +66,66 @@ def setup_sagemaker_environment():
         "SM_INPUT_CONFIG_DIR": "/opt/ml/input/config"
     }
 
-    # Set the environment variables
     for key, value in env_vars.items():
         os.environ[key] = str(value)
+        os.makedirs(value, exist_ok=True)
 
-    logger.info(f"Set SageMaker environment variables: {list(env_vars.keys())}")
+    logger.info(f"SageMaker environment initialized.")
 
 
 def main():
     logger.info("Starting SageMaker container entry point")
 
-    # Read hyperparameters
-    hyperparameters_path = '/opt/ml/input/config/hyperparameters.json'
-    if not os.path.exists(hyperparameters_path):
-        logger.error("Error: hyperparameters.json not found!")
+    # Load hyperparameters
+    hyperparams_path = '/opt/ml/input/config/hyperparameters.json'
+    if not os.path.exists(hyperparams_path):
+        logger.error("hyperparameters.json not found!")
         sys.exit(1)
 
-    with open(hyperparameters_path, 'r') as f:
-        hyperparameters = json.load(f)
-        logger.info(f"Hyperparameters: {hyperparameters}")
+    with open(hyperparams_path, 'r') as f:
+        hyperparams = json.load(f)
 
-    # Set up environment based on hyperparameters
-    # Get program name from hyperparameters or environment variable
-    if 'sagemaker_program' in hyperparameters:
-        program = hyperparameters['sagemaker_program'].strip('"\'')
+    # Get program name from hyperparameters or environment
+    if 'sagemaker_program' in hyperparams:
+        program = hyperparams['sagemaker_program'].strip('"\'')
         os.environ['SAGEMAKER_PROGRAM'] = program
     elif 'SAGEMAKER_PROGRAM' in os.environ:
         program = os.environ['SAGEMAKER_PROGRAM']
     else:
-        logger.error("Error: sagemaker_program not found in hyperparameters or environment!")
+        logger.error("sagemaker_program not found in hyperparameters or environment!")
         sys.exit(1)
 
     logger.info(f"Using program: {program}")
 
-    # Get source directory from hyperparameters
-    if 'sagemaker_submit_directory' in hyperparameters:
-        submit_dir_value = hyperparameters['sagemaker_submit_directory'].strip('"\'')
-        logger.info(f"Source directory: {submit_dir_value}")
+    # Get source directory
+    submit_dir = "/opt/ml/code"
+    if 'sagemaker_submit_directory' in hyperparams:
+        submit_dir_value = hyperparams['sagemaker_submit_directory'].strip('"\'')
 
-        # Check if it's an S3 URI or a local path
+        # Handle S3 vs local path
         if submit_dir_value.startswith('s3://'):
-            logger.info(f"Downloading source from S3: {submit_dir_value}")
             submit_dir = download_and_extract_s3(submit_dir_value)
         else:
-            logger.info(f"Using local source directory: {submit_dir_value}")
             submit_dir = submit_dir_value
-            # Verify the directory exists
             if not os.path.exists(submit_dir):
                 logger.error(f"Local directory not found: {submit_dir}")
                 sys.exit(1)
 
-        # Install requirements
-        install_requirements(os.path.join(submit_dir, "requirements.txt"))
-    else:
-        logger.info("No sagemaker_submit_directory specified, assuming code is already in /opt/ml/code")
-        submit_dir = "/opt/ml/code"
-
-        # Check if directory exists
-        if not os.path.exists(submit_dir):
-            logger.error(f"Code directory {submit_dir} does not exist!")
-            sys.exit(1)
+    # Install requirements if present
+    install_requirements(os.path.join(submit_dir, "requirements.txt"))
 
-        # List code directory contents for debugging
-        logger.info(f"Contents of {submit_dir}:")
-        try:
-            output = subprocess.check_output(['ls', '-la', submit_dir])
-            logger.info(output.decode('utf-8'))
-        except Exception as e:
-            logger.error(f"Failed to list directory: {e}")
+    # Set up environment variables
+    setup_environment()
 
-    # Set up SageMaker environment variables
-    setup_sagemaker_environment()
-
-    # Ensure directories exist
-    os.makedirs(os.environ["SM_MODEL_DIR"], exist_ok=True)
-    os.makedirs(os.environ["SM_OUTPUT_DATA_DIR"], exist_ok=True)
-
-    # Locate entry point script
+    # Find entry point script
     entry_point = os.path.join(submit_dir, program)
     if not os.path.exists(entry_point):
-        logger.error(f"Error: Entry point '{entry_point}' not found!")
+        logger.error(f"Entry point not found: {entry_point}")
         sys.exit(1)
 
-    logger.info(f"Running entry point: {entry_point}")
-    sys.stdout.flush()
+    logger.info(f"Executing: {program}")
 
-    # Execute with proper arguments
+    # Execute the training script with SageMaker arguments
     cmd = [
         sys.executable, entry_point,
         "--model-dir", os.environ["SM_MODEL_DIR"],
@@ -158,9 +133,6 @@ def main():
         "--train", os.environ["SM_CHANNEL_TRAIN"]
     ]
 
-    logger.info(f"Executing: {' '.join(cmd)}")
-
-    # Replace current process with the entry point script and arguments
     try:
         os.execv(sys.executable, cmd)
     except Exception as e:

From 0a060669619431437a0ce98c993a13e3e3b2290b Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sat, 1 Mar 2025 12:37:14 -0700
Subject: [PATCH 09/35] removing some old test code

---
 .../inference/test_container.sh               | 50 -------------------
 1 file changed, 50 deletions(-)
 delete mode 100755 model_docker_images/inference/test_container.sh

diff --git a/model_docker_images/inference/test_container.sh b/model_docker_images/inference/test_container.sh
deleted file mode 100755
index 3157b3df7..000000000
--- a/model_docker_images/inference/test_container.sh
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/bin/bash
-set -e
-
-# Determine script and project directories
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"
-PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
-SCRIPTS_DIR="$PROJECT_ROOT/scripts"
-
-# Default image name
-DEFAULT_IMAGE="aws_model_inference:0.1"
-IMAGE_NAME=${1:-$DEFAULT_IMAGE}
-
-# Port to use for testing
-PORT=8080
-
-echo "📋 Inference Container Test Script"
-echo "======================================"
-
-# Make sure test script exists
-if [ ! -f "$SCRIPTS_DIR/test_inference.py" ]; then
-  echo "❌ Error: test_inference.py not found in $SCRIPTS_DIR"
-  exit 1
-fi
-
-# Start the inference container with proper log settings
-echo "🚀 Starting inference container: $IMAGE_NAME"
-CONTAINER_ID=$(docker run -d -p $PORT:$PORT -e PYTHONUNBUFFERED=1 "$IMAGE_NAME")
-
-# Follow logs in the background
-docker logs -f $CONTAINER_ID &
-LOGS_PID=$!
-
-# Ensure container and log process are stopped on script exit
-function cleanup {
-  echo "🧹 Stopping log process and container..."
-  kill $LOGS_PID 2>/dev/null || true
-  docker stop $CONTAINER_ID >/dev/null 2>&1
-  docker rm $CONTAINER_ID >/dev/null 2>&1
-}
-trap cleanup EXIT
-
-# Wait for container to initialize
-echo "⏳ Waiting for server to initialize (5 seconds)..."
-sleep 5
-
-# Run the test
-echo "🧪 Testing inference container..."
-python "$SCRIPTS_DIR/test_inference.py" --host localhost --port $PORT
-
-echo "======================================"
\ No newline at end of file

From 84d1db8289d1080086623c4d2c1ea7f730c4886f Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sat, 1 Mar 2025 12:47:24 -0700
Subject: [PATCH 10/35] refactoring inference entry_point and test harness

---
 model_docker_images/inference/main.py       | 109 +++---
 model_docker_images/tests/test_inference.py | 361 ++++++++++++++++----
 2 files changed, 334 insertions(+), 136 deletions(-)

diff --git a/model_docker_images/inference/main.py b/model_docker_images/inference/main.py
index 75b369396..0430d93d6 100644
--- a/model_docker_images/inference/main.py
+++ b/model_docker_images/inference/main.py
@@ -2,7 +2,6 @@
 from contextlib import asynccontextmanager
 import os
 import json
-import numpy as np
 import pandas as pd
 import joblib
 import logging
@@ -11,69 +10,55 @@
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-# Model will be accessible globally
+# Global variables for model and metadata
 model = None
 model_metadata = None
 
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    # Load model on startup
+    """Handle model loading on startup and cleanup on shutdown."""
     global model, model_metadata
-
-    # SageMaker model path
     model_path = os.environ.get('MODEL_PATH', '/opt/ml/model')
+    model_file = os.path.join(model_path, 'model.joblib')
 
     try:
         logger.info(f"Loading model from {model_path}")
-        model_file = os.path.join(model_path, 'model.joblib')
 
         # Check if model file exists
-        if not os.path.exists(model_file):
-            logger.warning(f"Model file not found at {model_file}")
-            # List directory contents for debugging
+        if os.path.exists(model_file):
+            model = joblib.load(model_file)
+            logger.info(f"Model loaded successfully: {type(model)}")
+        else:
+            # Log the error and available files
+            logger.error(f"Model file not found at {model_file}")
             if os.path.exists(model_path):
-                logger.info(f"Contents of {model_path}: {os.listdir(model_path)}")
+                logger.error(f"Contents of {model_path}: {os.listdir(model_path)}")
             else:
-                logger.warning(f"Model directory {model_path} not found")
+                logger.error(f"Model directory {model_path} does not exist")
 
-            # For testing only - create a dummy model
-            logger.warning("Creating a dummy model for testing")
-            import xgboost as xgb
-            model = xgb.XGBRegressor()
-            model.fit(np.array([[1, 2, 3]]), np.array([1]))
-        else:
-            # Load the actual model
-            logger.info(f"Loading model from {model_file}")
-            model = joblib.load(model_file)
-            logger.info(f"Model loaded successfully: {type(model)}")
+            # Fail fast - no fallback for production
+            raise FileNotFoundError(f"Required model file not found: {model_file}")
 
         # Load metadata if available
-        try:
-            metadata_file = os.path.join(model_path, 'metadata.json')
-            if os.path.exists(metadata_file):
-                with open(metadata_file, 'r') as f:
-                    model_metadata = json.load(f)
-                logger.info(f"Loaded model metadata: {model_metadata}")
-            else:
-                logger.warning(f"Metadata file not found at {metadata_file}")
-                model_metadata = {'feature_names': None}
-        except Exception as e:
-            logger.error(f"Error loading model metadata: {e}")
+        metadata_file = os.path.join(model_path, 'metadata.json')
+        if os.path.exists(metadata_file):
+            with open(metadata_file, 'r') as f:
+                model_metadata = json.load(f)
+            logger.info(f"Loaded model metadata")
+        else:
+            logger.info(f"No metadata found, using default")
             model_metadata = {'feature_names': None}
+
     except Exception as e:
         logger.error(f"Error loading model: {e}", exc_info=True)
-        # Provide a fallback model for testing
-        import xgboost as xgb
-        model = xgb.XGBRegressor()
-        model.fit(np.array([[1, 2, 3]]), np.array([1]))
-        model_metadata = {'feature_names': None}
+        # In production, we don't want to create fallback models
+        # Let the container fail to start
+        raise
 
     logger.info("Model initialization complete")
     yield
-
-    # Cleanup on shutdown if needed
-    logger.info("Cleaning up resources")
+    logger.info("Shutting down model server")
 
 
 app = FastAPI(lifespan=lifespan)
@@ -81,7 +66,7 @@ async def lifespan(app: FastAPI):
 
 @app.get('/ping')
 def ping():
-    # SageMaker health check - return 200 if model is loaded
+    """Health check endpoint for SageMaker."""
     if model is not None:
         return Response(status_code=200)
     return Response(status_code=404)
@@ -89,48 +74,34 @@ def ping():
 
 @app.post('/invocations')
 async def invoke(request: Request):
-    logger.info("Received inference request")
+    """Inference endpoint for SageMaker."""
     content_type = request.headers.get('Content-Type', '')
     accept_type = request.headers.get('Accept', '')
 
-    logger.info(f"Content-Type: {content_type}, Accept: {accept_type}")
-
-    # Get the data
-    body = await request.body()
-
     try:
-        # Handle different content types
-        if content_type == 'text/csv':
-            # Parse CSV data
+        # Get request body
+        body = await request.body()
+
+        # Parse input data based on content type
+        if 'text/csv' in content_type:
             s = body.decode('utf-8')
             data = pd.read_csv(pd.StringIO(s), header=None)
-            logger.info(f"Parsed CSV data with shape: {data.shape}")
-        else:
-            # Default to JSON
+        else:  # Default to JSON
             json_str = body.decode('utf-8')
-            logger.info(f"Raw JSON input: {json_str}")
             data_json = json.loads(json_str)
-            logger.info(f"Parsed JSON data: {data_json}")
-            # Convert to DataFrame if it's not already
-            if not isinstance(data_json, pd.DataFrame):
-                data = pd.DataFrame(data_json)
-            else:
-                data = data_json
+            data = pd.DataFrame(data_json) if not isinstance(data_json, pd.DataFrame) else data_json
 
         # Make prediction
-        logger.info(f"Making prediction with data shape: {data.shape}")
         predictions = model.predict(data)
-        logger.info(f"Prediction successful, result shape: {len(predictions) if hasattr(predictions, '__len__') else 'scalar'}")
 
-        # Always return JSON unless explicitly requested as CSV
-        if accept_type == 'text/csv':
+        # Format response based on accept type
+        if 'text/csv' in accept_type:
             result = pd.DataFrame(predictions).to_csv(header=False, index=False)
-            logger.info(f"Returning CSV response: {result}")
             return Response(content=result, media_type='text/csv')
-        else:
-            # Default to JSON for everything else
-            result = json.dumps({'predictions': predictions.tolist() if hasattr(predictions, 'tolist') else float(predictions)})
-            logger.info(f"Returning JSON response: {result}")
+        else:  # Default to JSON
+            result = json.dumps({
+                'predictions': predictions.tolist() if hasattr(predictions, 'tolist') else float(predictions)
+            })
             return Response(content=result, media_type='application/json')
 
     except Exception as e:
diff --git a/model_docker_images/tests/test_inference.py b/model_docker_images/tests/test_inference.py
index 223aa1dc7..52461c847 100644
--- a/model_docker_images/tests/test_inference.py
+++ b/model_docker_images/tests/test_inference.py
@@ -1,91 +1,318 @@
 #!/usr/bin/env python
-import requests
+import os
 import json
-import argparse
 import time
+import argparse
+import tempfile
+import shutil
+import subprocess
+import requests
+import pandas as pd
+import numpy as np
+from pathlib import Path
 
 
-def test_inference_server(host="localhost", port=8080):
-    """
-    Test the inference server running in the Docker container.
-    """
-    base_url = f"http://{host}:{port}"
+class MockModel:
+    """Mock SageMaker Model class that simulates the behavior of sagemaker.model.Model"""
 
-    # Test 1: Check the health endpoint
-    print("\n🔍 Testing /ping endpoint (health check)...")
-    try:
-        response = requests.get(f"{base_url}/ping", timeout=5)
-        if response.status_code == 200:
-            print("✅ Health check succeeded")
+    def __init__(self, image_uri, model_data=None, role=None, **kwargs):
+        """
+        Initialize a MockModel with parameters similar to a SageMaker Model.
+
+        Args:
+            image_uri (str): The Docker image URI to use for inference
+            model_data (str): Path to model artifacts (S3 URI or local path)
+            role (str): AWS IAM role (not used in mock)
+        """
+        self.image_uri = image_uri
+        self.model_data = model_data
+        self.role = role
+        self.kwargs = kwargs
+        self.temp_dir = None
+        self.container_id = None
+        self.endpoint_url = None
+
+    def register(self, content_types=None, response_types=None, **kwargs):
+        """Mock model registration - just stores the parameters"""
+        self.content_types = content_types or ["application/json"]
+        self.response_types = response_types or ["application/json"]
+        for key, value in kwargs.items():
+            setattr(self, key, value)
+        print(f"Mock registered model with content types: {self.content_types}")
+        return self
+
+    def deploy(self, instance_type=None, initial_instance_count=1, endpoint_name=None):
+        """
+        Deploy the model to a mock endpoint (local Docker container).
+
+        Args:
+            instance_type (str): SageMaker instance type (ignored)
+            initial_instance_count (int): Number of instances (ignored)
+            endpoint_name (str): Endpoint name for identification
+
+        Returns:
+            MockEndpoint: The deployed endpoint
+        """
+        print(f"Deploying model to endpoint: {endpoint_name or 'default-endpoint'}")
+
+        # Create a temp directory for model data if not provided
+        if self.model_data is None:
+            self.temp_dir = tempfile.mkdtemp(prefix="sagemaker-inference-test-")
+            model_dir = self.temp_dir
+
+            # Create a dummy model
+            print(f"Creating dummy model in {model_dir}")
+            import joblib
+            import xgboost as xgb
+
+            # Train a simple model
+            model = xgb.XGBRegressor(objective='reg:squarederror')
+            X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+            y = np.array([10, 20, 30])
+            model.fit(X, y)
+
+            # Save the model
+            joblib.dump(model, os.path.join(model_dir, 'model.joblib'))
+
+            # Save metadata
+            with open(os.path.join(model_dir, 'metadata.json'), 'w') as f:
+                json.dump({
+                    'feature_names': ['feature1', 'feature2', 'feature3'],
+                    'model_type': 'regression'
+                }, f)
+
+            self.model_data = model_dir
         else:
-            print(f"❌ Health check failed with status code: {response.status_code}")
-            return False
-    except requests.exceptions.RequestException as e:
-        print(f"❌ Health check failed with error: {e}")
-        print("Is the Docker container running on the specified port?")
+            # Use provided model_data
+            model_dir = self.model_data
+
+        # Start the container
+        cmd = [
+            "docker", "run", "-d", "--rm",
+            "-p", "8080:8080",
+            "-v", f"{model_dir}:/opt/ml/model",
+            "-e", "MODEL_PATH=/opt/ml/model",
+        ]
+
+        # Add platform flag for Mac M1/M2/M3 users
+        if os.uname().machine == 'arm64':
+            cmd.insert(2, "--platform")
+            cmd.insert(3, "linux/amd64")
+
+        # Add the image URI
+        cmd.append(self.image_uri)
+
+        print(f"Starting inference container: {' '.join(cmd)}")
+        self.container_id = subprocess.check_output(cmd).decode('utf-8').strip()
+
+        print(f"Waiting for container to initialize...")
+        time.sleep(5)  # Give it time to start
+
+        self.endpoint_url = 'http://localhost:8080'
+        return MockEndpoint(self)
+
+
+class MockEndpoint:
+    """Mock SageMaker Endpoint for local testing"""
+
+    def __init__(self, model):
+        """Initialize with a reference to the model"""
+        self.model = model
+        self.url = model.endpoint_url
+
+    def predict(self, data, initial_args=None):
+        """
+        Makes a prediction using the deployed model.
+
+        Args:
+            data: Input data in format matching content_types
+            initial_args: Additional arguments (ignored)
+
+        Returns:
+            The prediction result
+        """
+        # Default to first registered content type
+        content_type = self.model.content_types[0] if hasattr(self.model, 'content_types') else 'application/json'
+
+        # Format the data according to content type
+        if content_type == 'text/csv':
+            if isinstance(data, pd.DataFrame):
+                payload = data.to_csv(header=False, index=False)
+            elif isinstance(data, (list, np.ndarray)):
+                payload = pd.DataFrame(data).to_csv(header=False, index=False)
+            else:
+                payload = str(data)
+        else:
+            # Default to JSON
+            if isinstance(data, pd.DataFrame):
+                payload = data.to_json(orient='records')
+            elif isinstance(data, (list, np.ndarray)):
+                payload = json.dumps({"instances": data.tolist() if hasattr(data, 'tolist') else data})
+            else:
+                payload = json.dumps(data)
+
+        # Send the request to the container
+        try:
+            response = requests.post(
+                f"{self.url}/invocations",
+                data=payload,
+                headers={"Content-Type": content_type}
+            )
+
+            # Check for errors
+            if response.status_code != 200:
+                raise Exception(f"Prediction failed with status code {response.status_code}: {response.text}")
+
+            # Parse response based on response type
+            if hasattr(self.model, 'response_types') and 'text/csv' in self.model.response_types:
+                # Parse CSV response
+                return pd.read_csv(pd.StringIO(response.text), header=None)
+            else:
+                # Parse JSON response
+                return response.json()
+
+        except Exception as e:
+            print(f"Error during prediction: {e}")
+            raise
+
+    def delete_endpoint(self):
+        """Clean up resources by stopping the container"""
+        print(f"Deleting endpoint (stopping container {self.model.container_id})")
+        if self.model.container_id:
+            subprocess.run(["docker", "stop", self.model.container_id], check=True)
+            self.model.container_id = None
+
+        # Clean up temp directory if needed
+        if self.model.temp_dir and os.path.exists(self.model.temp_dir):
+            print(f"Cleaning up temporary directory: {self.model.temp_dir}")
+            shutil.rmtree(self.model.temp_dir)
+            self.model.temp_dir = None
+
+
+def test_csv_inference(endpoint, test_data=None):
+    """Test inference with CSV data"""
+    print("\nTesting CSV inference...")
+
+    if test_data is None:
+        # Create sample test data
+        test_data = pd.DataFrame([
+            [1.0, 2.0, 3.0],
+            [4.0, 5.0, 6.0]
+        ])
+
+    try:
+        response = endpoint.predict(test_data)
+        print(f"Prediction response: {response}")
+        print("✅ CSV inference test successful")
+        return True
+    except Exception as e:
+        print(f"❌ CSV inference test failed: {e}")
         return False
 
-    # Test 2: Test the invocations endpoint with simple data
-    print("\n🔍 Testing /invocations endpoint with sample data...")
-    sample_data = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
+
+def test_json_inference(endpoint, test_data=None):
+    """Test inference with JSON data"""
+    print("\nTesting JSON inference...")
+
+    if test_data is None:
+        # Create sample test data
+        test_data = {
+            "instances": [
+                [1.0, 2.0, 3.0],
+                [4.0, 5.0, 6.0]
+            ]
+        }
 
     try:
-        # Test with JSON data
-        response = requests.post(
-            f"{base_url}/invocations",
-            data=json.dumps(sample_data),
-            headers={"Content-Type": "application/json", "Accept": "application/json"},
-            timeout=5
-        )
+        response = endpoint.predict(test_data)
+        print(f"Prediction response: {response}")
+        print("✅ JSON inference test successful")
+        return True
+    except Exception as e:
+        print(f"❌ JSON inference test failed: {e}")
+        return False
 
+
+def test_ping_endpoint(url):
+    """Test the /ping endpoint directly"""
+    print("\nTesting /ping endpoint...")
+    try:
+        response = requests.get(f"{url}/ping")
+        print(f"Response status: {response.status_code}")
         if response.status_code == 200:
-            print("✅ Inference request succeeded")
-            try:
-                # Parse the JSON response
-                result = response.json()
-                print(f"📊 Response: {result}")
-                return True
-            except json.JSONDecodeError as e:
-                print(f"❌ Error parsing response as JSON: {e}")
-                print(f"Raw response: {response.text}")
-                # Try parsing as CSV
-                try:
-                    lines = response.text.strip().split('\n')
-                    values = [float(line) for line in lines]
-                    print(f"📊 CSV Response (converted): {values}")
-                    return True
-                except Exception:
-                    return False
+            print("✅ Ping test successful")
+            return True
         else:
-            print(f"❌ Inference request failed with status code: {response.status_code}")
-            print(f"Response text: {response.text}")
+            print(f"❌ Ping test failed with status {response.status_code}")
             return False
-    except requests.exceptions.RequestException as e:
-        print(f"❌ Inference request failed with error: {e}")
+    except Exception as e:
+        print(f"❌ Ping test error: {e}")
         return False
 
-    print("\n🎉 All tests passed! Your inference server is working correctly.")
-    return True
 
+def main():
+    """Run the test using MockModel and MockEndpoint"""
+    parser = argparse.ArgumentParser(description="Test SageMaker inference container")
+    parser.add_argument("--image", type=str, default="aws_model_inference:0.1", help="Inference image name:tag")
+    parser.add_argument("--model-dir", type=str, default=None, help="Path to model directory (optional)")
+    args = parser.parse_args()
 
-def run_docker_command():
-    """
-    Print the docker run command to help the user start the container.
-    """
-    print("\n📋 To run your Docker container, use the following command:")
-    print("docker run -p 8080:8080 aws_model_inference:latest")
-    print("\nThis maps port 8080 from the container to port 8080 on your host machine.")
+    print(f"Testing inference container {args.image}")
 
+    # Create the model and endpoint
+    model = None
+    endpoint = None
+    success = False
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Test the AWS model inference server")
-    parser.add_argument("--host", default="localhost", help="Host where the inference server is running")
-    parser.add_argument("--port", type=int, default=8080, help="Port where the inference server is running")
-    parser.add_argument("--docker-cmd", action="store_true", help="Print the docker run command")
+    try:
+        # Create and deploy the model
+        model = MockModel(
+            image_uri=args.image,
+            model_data=args.model_dir,
+            role="mock-role"
+        )
 
-    args = parser.parse_args()
+        # Register the model
+        model.register(
+            content_types=["text/csv", "application/json"],
+            response_types=["text/csv", "application/json"],
+            inference_instances=["ml.t2.medium"],
+            transform_instances=["ml.m5.large"],
+            description="Test model"
+        )
+
+        # Deploy the model
+        endpoint = model.deploy(
+            instance_type="local",
+            initial_instance_count=1,
+            endpoint_name="test-endpoint"
+        )
 
-    if args.docker_cmd:
-        run_docker_command()
+        # Test the /ping endpoint
+        ping_success = test_ping_endpoint(endpoint.url)
 
-    test_inference_server(args.host, args.port)
+        # Test predictions
+        csv_success = test_csv_inference(endpoint)
+        json_success = test_json_inference(endpoint)
+
+        # Overall success
+        success = ping_success and csv_success and json_success
+
+        if success:
+            print("\n✅ All inference tests passed successfully!")
+        else:
+            print("\n❌ Some inference tests failed!")
+
+    except Exception as e:
+        print(f"\n❌ Error during inference testing: {e}")
+    finally:
+        # Clean up resources
+        if endpoint:
+            endpoint.delete_endpoint()
+
+    # Return appropriate exit code
+    return 0 if success else 1
+
+
+if __name__ == "__main__":
+    exit(main())

From 84378f5d94ee199859f351861cac426525b172ed Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sat, 1 Mar 2025 13:12:24 -0700
Subject: [PATCH 11/35] fixing StringIO imports

---
 model_docker_images/inference/main.py       | 3 ++-
 model_docker_images/tests/test_inference.py | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/model_docker_images/inference/main.py b/model_docker_images/inference/main.py
index 0430d93d6..c1b15ca11 100644
--- a/model_docker_images/inference/main.py
+++ b/model_docker_images/inference/main.py
@@ -3,6 +3,7 @@
 import os
 import json
 import pandas as pd
+from io import StringIO
 import joblib
 import logging
 
@@ -85,7 +86,7 @@ async def invoke(request: Request):
         # Parse input data based on content type
         if 'text/csv' in content_type:
             s = body.decode('utf-8')
-            data = pd.read_csv(pd.StringIO(s), header=None)
+            data = pd.read_csv(StringIO(s), header=None)
         else:  # Default to JSON
             json_str = body.decode('utf-8')
             data_json = json.loads(json_str)
diff --git a/model_docker_images/tests/test_inference.py b/model_docker_images/tests/test_inference.py
index 52461c847..de4d48e25 100644
--- a/model_docker_images/tests/test_inference.py
+++ b/model_docker_images/tests/test_inference.py
@@ -9,7 +9,7 @@
 import requests
 import pandas as pd
 import numpy as np
-from pathlib import Path
+from io import StringIO
 
 
 class MockModel:
@@ -166,7 +166,7 @@ def predict(self, data, initial_args=None):
             # Parse response based on response type
             if hasattr(self.model, 'response_types') and 'text/csv' in self.model.response_types:
                 # Parse CSV response
-                return pd.read_csv(pd.StringIO(response.text), header=None)
+                return pd.read_csv(StringIO(response.text), header=None)
             else:
                 # Parse JSON response
                 return response.json()

From 5c2d99a9d85f0805c75500743433b084e9358dae Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sat, 1 Mar 2025 13:15:20 -0700
Subject: [PATCH 12/35] improved json handling

---
 model_docker_images/inference/main.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/model_docker_images/inference/main.py b/model_docker_images/inference/main.py
index c1b15ca11..1cb104f9c 100644
--- a/model_docker_images/inference/main.py
+++ b/model_docker_images/inference/main.py
@@ -90,7 +90,17 @@ async def invoke(request: Request):
         else:  # Default to JSON
             json_str = body.decode('utf-8')
             data_json = json.loads(json_str)
-            data = pd.DataFrame(data_json) if not isinstance(data_json, pd.DataFrame) else data_json
+
+            # Handle different JSON formats
+            if isinstance(data_json, dict) and "instances" in data_json:
+                # Format: {"instances": [[1,2,3], [4,5,6]]}
+                data = pd.DataFrame(data_json["instances"])
+            elif isinstance(data_json, list) and all(isinstance(item, list) for item in data_json):
+                # Format: [[1,2,3], [4,5,6]]
+                data = pd.DataFrame(data_json)
+            else:
+                # Try to convert to DataFrame
+                data = pd.DataFrame(data_json)
 
         # Make prediction
         predictions = model.predict(data)

From 24c34b2d6a807fc2a7c602f1832732cc3384ffee Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sat, 1 Mar 2025 13:16:15 -0700
Subject: [PATCH 13/35] change test data a bit

---
 model_docker_images/tests/test_inference.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/model_docker_images/tests/test_inference.py b/model_docker_images/tests/test_inference.py
index de4d48e25..389e9c664 100644
--- a/model_docker_images/tests/test_inference.py
+++ b/model_docker_images/tests/test_inference.py
@@ -215,13 +215,11 @@ def test_json_inference(endpoint, test_data=None):
     print("\nTesting JSON inference...")
 
     if test_data is None:
-        # Create sample test data
-        test_data = {
-            "instances": [
-                [1.0, 2.0, 3.0],
-                [4.0, 5.0, 6.0]
-            ]
-        }
+        # Create sample test data - use list of lists of floats
+        test_data = [
+            [1.0, 2.0, 3.0],
+            [4.0, 5.0, 6.0]
+        ]
 
     try:
         response = endpoint.predict(test_data)

From ad4a0636fbec475e0d9d709c38b5ab16d60f62a3 Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sat, 1 Mar 2025 13:40:10 -0700
Subject: [PATCH 14/35] changing repo naming

---
 model_docker_images/scripts/build_deploy.sh | 39 ++++++++++++---------
 model_docker_images/tests/test_inference.py |  2 +-
 model_docker_images/tests/test_training.py  |  2 +-
 3 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/model_docker_images/scripts/build_deploy.sh b/model_docker_images/scripts/build_deploy.sh
index 6ca52bdc2..d5829f7d5 100755
--- a/model_docker_images/scripts/build_deploy.sh
+++ b/model_docker_images/scripts/build_deploy.sh
@@ -6,11 +6,18 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"
 # Get the parent directory (project root)
 PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
 
-# Configuration
+# AWS Account ID
+AWS_ACCOUNT_ID="507740646243"
+
+# Define repository names - used for both local and ECR images
+TRAINING_REPO="aws-ml-images/py312-sklearn-xgb-training"
+INFERENCE_REPO="aws-ml-images/py312-sklearn-xgb-inference"
+
+# Local directories
 TRAINING_DIR="$PROJECT_ROOT/training"
 INFERENCE_DIR="$PROJECT_ROOT/inference"
-TRAINING_IMAGE="aws_model_training"
-INFERENCE_IMAGE="aws_model_inference"
+
+# Image version
 IMAGE_VERSION=${1:-"0.1"}
 
 # Expect AWS_PROFILE to be set in the environment when deploying
@@ -45,9 +52,9 @@ done
 # Function to build a Docker image
 build_image() {
     local dir=$1
-    local image_name=$2
+    local repo_name=$2
     local tag=$3
-    local full_name="${image_name}:${tag}"
+    local full_name="${repo_name}:${tag}"
 
     echo -e "${YELLOW}Building image: ${full_name}${NC}"
 
@@ -67,20 +74,20 @@ build_image() {
 
 # Function to deploy an image to ECR
 deploy_image() {
-    local image_name=$1
+    local repo_name=$1
     local tag=$2
     local use_latest=$3
-    local full_name="${image_name}:${tag}"
+    local full_name="${repo_name}:${tag}"
 
     for REGION in "${REGION_LIST[@]}"; do
         echo "Processing region: ${REGION}"
         # Construct the ECR repository URL
-        ECR_REPO="507740646243.dkr.ecr.${REGION}.amazonaws.com/model_images/${image_name}"
+        ECR_REPO="${AWS_ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/${repo_name}"
         AWS_ECR_IMAGE="${ECR_REPO}:${tag}"
 
         echo "Logging in to AWS ECR in ${REGION}..."
         aws ecr get-login-password --region ${REGION} --profile ${AWS_PROFILE} | \
-            docker login --username AWS --password-stdin ${ECR_REPO}
+            docker login --username AWS --password-stdin "${AWS_ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com"
 
         echo "Tagging image for AWS ECR as ${AWS_ECR_IMAGE}..."
         docker tag ${full_name} ${AWS_ECR_IMAGE}
@@ -102,13 +109,13 @@ deploy_image() {
 echo "======================================"
 echo "🏗️  Building training container"
 echo "======================================"
-build_image "$TRAINING_DIR" "$TRAINING_IMAGE" "$IMAGE_VERSION"
+build_image "$TRAINING_DIR" "$TRAINING_REPO" "$IMAGE_VERSION"
 
 # Build inference image
 echo "======================================"
 echo "🏗️  Building inference container"
 echo "======================================"
-build_image "$INFERENCE_DIR" "$INFERENCE_IMAGE" "$IMAGE_VERSION"
+build_image "$INFERENCE_DIR" "$INFERENCE_REPO" "$IMAGE_VERSION"
 
 echo "======================================"
 echo -e "${GREEN}✅ All builds completed successfully!${NC}"
@@ -121,11 +128,11 @@ if [ "$DEPLOY" = true ]; then
 
     # Deploy training image
     echo "Deploying training image..."
-    deploy_image "$TRAINING_IMAGE" "$IMAGE_VERSION" "$LATEST"
+    deploy_image "$TRAINING_REPO" "$IMAGE_VERSION" "$LATEST"
 
     # Deploy inference image
     echo "Deploying inference image..."
-    deploy_image "$INFERENCE_IMAGE" "$IMAGE_VERSION" "$LATEST"
+    deploy_image "$INFERENCE_REPO" "$IMAGE_VERSION" "$LATEST"
 
     echo "======================================"
     echo -e "${GREEN}✅ Deployment complete!${NC}"
@@ -136,10 +143,10 @@ else
     # Print information about the built images
     echo "======================================"
     echo "📋 Image information:"
-    echo "Training image: ${TRAINING_IMAGE}:${IMAGE_VERSION}"
-    echo "Inference image: ${INFERENCE_IMAGE}:${IMAGE_VERSION}"
+    echo "Training image: ${TRAINING_REPO}:${IMAGE_VERSION}"
+    echo "Inference image: ${INFERENCE_REPO}:${IMAGE_VERSION}"
     echo "======================================"
 
     # Inform about testing option
-    echo "To test these containers, run: $PROJECT_ROOT/tests/scripts/run_tests.sh ${IMAGE_VERSION}"
+    echo "To test these containers, run: $PROJECT_ROOT/tests/run_tests.sh ${IMAGE_VERSION}"
 fi
diff --git a/model_docker_images/tests/test_inference.py b/model_docker_images/tests/test_inference.py
index 389e9c664..30262b4e2 100644
--- a/model_docker_images/tests/test_inference.py
+++ b/model_docker_images/tests/test_inference.py
@@ -251,7 +251,7 @@ def test_ping_endpoint(url):
 def main():
     """Run the test using MockModel and MockEndpoint"""
     parser = argparse.ArgumentParser(description="Test SageMaker inference container")
-    parser.add_argument("--image", type=str, default="aws_model_inference:0.1", help="Inference image name:tag")
+    parser.add_argument("--image", type=str, default="aws-ml-images/py312-sklearn-xgb-inference:0.1", help="Inference image name:tag")
     parser.add_argument("--model-dir", type=str, default=None, help="Path to model directory (optional)")
     args = parser.parse_args()
 
diff --git a/model_docker_images/tests/test_training.py b/model_docker_images/tests/test_training.py
index c7f3b8593..74562bf03 100644
--- a/model_docker_images/tests/test_training.py
+++ b/model_docker_images/tests/test_training.py
@@ -119,7 +119,7 @@ def cleanup(self):
 def main():
     """Run the test using a MockEstimator"""
     parser = argparse.ArgumentParser(description="Test SageMaker training container")
-    parser.add_argument("--image", type=str, default="aws_model_training:0.1", help="Training image name:tag")
+    parser.add_argument("--image", type=str, default="aws-ml-images/py312-sklearn-xgb-training:0.1", help="Training image name:tag")
     parser.add_argument("--entry-point", type=str, default="example_model_script.py", help="Training script name")
     parser.add_argument("--source-dir", type=str, default="tests/", help="Directory containing training scripts")
     parser.add_argument("--data", type=str, default="tests/data/abalone_sm.csv", help="Training data path")

From 091fc3d37b8631d9568c773922879fa124460692 Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sat, 1 Mar 2025 13:51:31 -0700
Subject: [PATCH 15/35] changing InferenceImage to ModelImages

---
 src/workbench/core/artifacts/model_core.py    | 23 +++++++++++++++----
 .../features_to_model/features_to_model.py    |  6 ++---
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/src/workbench/core/artifacts/model_core.py b/src/workbench/core/artifacts/model_core.py
index fcbe791a7..710e26b4c 100644
--- a/src/workbench/core/artifacts/model_core.py
+++ b/src/workbench/core/artifacts/model_core.py
@@ -35,10 +35,24 @@ class ModelType(Enum):
     UNKNOWN = "unknown"
 
 
-class InferenceImage:
+class ModelImages:
     """Class for retrieving locked Scikit-Learn inference images"""
 
     image_uris = {
+        ("us-east-1", "training", "0.1"): (
+            "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-training:0.1"
+        ),
+        ("us-east-1", "inference", "0.1"): (
+            "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1"
+        ),
+        ("us-west-2", "training", "0.1"): (
+            "174872318107.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-training:0.1"
+        ),
+        ("us-west-2", "inference", "0.1"): (
+            "174872318107.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1"
+        ),
+
+        # These are the OLD locked SKLearn images
         ("us-east-1", "sklearn", "1.2.1"): (
             "683313688378.dkr.ecr.us-east-1.amazonaws.com/"
             "sagemaker-scikit-learn@sha256:ed242e33af079f334972acd2a7ddf74d13310d3c9a0ef3a0e9b0429ccc104dcd"
@@ -55,16 +69,17 @@ class InferenceImage:
             "246618743249.dkr.ecr.us-west-2.amazonaws.com/"
             "sagemaker-scikit-learn@sha256:ed242e33af079f334972acd2a7ddf74d13310d3c9a0ef3a0e9b0429ccc104dcd"
         ),
+
     }
 
     @classmethod
-    def get_image_uri(cls, region, framework, version):
-        key = (region, framework, version)
+    def get_image_uri(cls, region, image_type="training", version="0.1"):
+        key = (region, image_type, version)
         if key in cls.image_uris:
             return cls.image_uris[key]
         else:
             raise ValueError(
-                f"No matching image found for region: {region}, framework: {framework}, version: {version}"
+                f"No matching image found for region: {region}, image_type: {image_type}, version: {version}"
             )
 
 
diff --git a/src/workbench/core/transforms/features_to_model/features_to_model.py b/src/workbench/core/transforms/features_to_model/features_to_model.py
index 0fdc1c64d..fe95799e0 100644
--- a/src/workbench/core/transforms/features_to_model/features_to_model.py
+++ b/src/workbench/core/transforms/features_to_model/features_to_model.py
@@ -8,7 +8,7 @@
 # Local Imports
 from workbench.core.transforms.transform import Transform, TransformInput, TransformOutput
 from workbench.core.artifacts.feature_set_core import FeatureSetCore
-from workbench.core.artifacts.model_core import ModelCore, ModelType, InferenceImage
+from workbench.core.artifacts.model_core import ModelCore, ModelType, ModelImages
 from workbench.core.artifacts.artifact import Artifact
 from workbench.model_scripts.script_generation import generate_model_script
 from workbench.utils.model_utils import supported_instance_types
@@ -208,7 +208,7 @@ def transform_impl(
         source_dir = str(Path(script_path).parent)
 
         # Create a Sagemaker Model with our script
-        image = InferenceImage.get_image_uri(self.sm_session.boto_region_name, "sklearn", "1.2.1")
+        image = ModelImages.get_image_uri(self.sm_session.boto_region_name, "sklearn", "1.2.1")
         self.estimator = SKLearn(
             entry_point=entry_point,
             source_dir=source_dir,
@@ -268,7 +268,7 @@ def create_and_register_model(self):
         )
 
         # Register our model
-        image = InferenceImage.get_image_uri(self.sm_session.boto_region_name, "sklearn", "1.2.1")
+        image = ModelImages.get_image_uri(self.sm_session.boto_region_name, "sklearn", "1.2.1")
         self.log.important(f"Registering model {self.output_uuid} with image {image}...")
         model = self.estimator.create_model(role=self.workbench_role_arn)
         model.register(

From 4b837cfdc7d4cb2694bbc12e6578a9a35c094e85 Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sat, 1 Mar 2025 14:10:46 -0700
Subject: [PATCH 16/35] using new model images

---
 src/workbench/core/artifacts/model_core.py           |  4 ++--
 .../features_to_model/features_to_model.py           | 12 ++++++------
 .../model_scripts/light_xgb_model/requirements.txt   |  5 +++--
 .../model_scripts/light_xgb_model/xgb_model.template |  4 ++--
 4 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/workbench/core/artifacts/model_core.py b/src/workbench/core/artifacts/model_core.py
index 710e26b4c..50779831a 100644
--- a/src/workbench/core/artifacts/model_core.py
+++ b/src/workbench/core/artifacts/model_core.py
@@ -46,10 +46,10 @@ class ModelImages:
             "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1"
         ),
         ("us-west-2", "training", "0.1"): (
-            "174872318107.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-training:0.1"
+            "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-training:0.1"
         ),
         ("us-west-2", "inference", "0.1"): (
-            "174872318107.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1"
+            "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1"
         ),
 
         # These are the OLD locked SKLearn images
diff --git a/src/workbench/core/transforms/features_to_model/features_to_model.py b/src/workbench/core/transforms/features_to_model/features_to_model.py
index fe95799e0..e5916994f 100644
--- a/src/workbench/core/transforms/features_to_model/features_to_model.py
+++ b/src/workbench/core/transforms/features_to_model/features_to_model.py
@@ -1,7 +1,7 @@
 """FeaturesToModel: Train/Create a Model from a Feature Set"""
 
 from pathlib import Path
-from sagemaker.sklearn.estimator import SKLearn
+from sagemaker.estimator import Estimator
 import awswrangler as wr
 from datetime import datetime, timezone
 
@@ -111,6 +111,7 @@ def transform_impl(
             all_columns = feature_set.columns
             filter_list = [
                 "id",
+                "auto_id",
                 "__index_level_0__",
                 "write_time",
                 "api_invocation_time",
@@ -208,14 +209,14 @@ def transform_impl(
         source_dir = str(Path(script_path).parent)
 
         # Create a Sagemaker Model with our script
-        image = ModelImages.get_image_uri(self.sm_session.boto_region_name, "sklearn", "1.2.1")
-        self.estimator = SKLearn(
+        image = ModelImages.get_image_uri(self.sm_session.boto_region_name, "training", "0.1")
+        self.estimator = Estimator(
             entry_point=entry_point,
             source_dir=source_dir,
             role=self.workbench_role_arn,
+            instance_count=1,
             instance_type="ml.m5.large",
             sagemaker_session=self.sm_session,
-            framework_version="1.2-1",
             image_uri=image,
             metric_definitions=metric_definitions,
         )
@@ -268,12 +269,11 @@ def create_and_register_model(self):
         )
 
         # Register our model
-        image = ModelImages.get_image_uri(self.sm_session.boto_region_name, "sklearn", "1.2.1")
+        image = ModelImages.get_image_uri(self.sm_session.boto_region_name, "inference", "0.1")
         self.log.important(f"Registering model {self.output_uuid} with image {image}...")
         model = self.estimator.create_model(role=self.workbench_role_arn)
         model.register(
             model_package_group_name=self.output_uuid,
-            framework_version="1.2.1",
             image_uri=image,
             content_types=["text/csv"],
             response_types=["text/csv"],
diff --git a/src/workbench/model_scripts/light_xgb_model/requirements.txt b/src/workbench/model_scripts/light_xgb_model/requirements.txt
index 25a034855..7ff58e74d 100644
--- a/src/workbench/model_scripts/light_xgb_model/requirements.txt
+++ b/src/workbench/model_scripts/light_xgb_model/requirements.txt
@@ -1,2 +1,3 @@
-xgboost==2.0.3
-awswrangler==3.8.0
\ No newline at end of file
+xgboost-cpu==2.1.4
+pandas==2.2.3
+awswrangler==3.11.0
\ No newline at end of file
diff --git a/src/workbench/model_scripts/light_xgb_model/xgb_model.template b/src/workbench/model_scripts/light_xgb_model/xgb_model.template
index a534b2164..9c53a4d90 100644
--- a/src/workbench/model_scripts/light_xgb_model/xgb_model.template
+++ b/src/workbench/model_scripts/light_xgb_model/xgb_model.template
@@ -15,7 +15,7 @@ import awswrangler as wr
 from sklearn.metrics import (
     mean_absolute_error,
     r2_score,
-    mean_squared_error,
+    root_mean_squared_error,
     precision_recall_fscore_support,
     confusion_matrix,
 )
@@ -261,7 +261,7 @@ if __name__ == "__main__":
 
     else:
         # Calculate various model performance metrics (regression)
-        rmse = mean_squared_error(df_val[target], preds, squared=False)
+        rmse = root_mean_squared_error(df_val[target], preds)
         mae = mean_absolute_error(df_val[target], preds)
         r2 = r2_score(df_val[target], preds)
         print(f"RMSE: {rmse:.3f}")

From 2a80c66a4cdaf6b74f7539a18f9d35ac14935b61 Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sat, 1 Mar 2025 14:13:47 -0700
Subject: [PATCH 17/35] unlocking scikit-learn version

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index ef8494177..85085806e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,7 +35,7 @@ dependencies = [
     "cryptography >= 42.0.5",
     "ipython >= 8.17.2",
     "pyreadline3; sys_platform == 'win32'",
-    "scikit-learn >=1.4.2, <= 1.5.2",
+    "scikit-learn >=1.5.2",
     "joblib >= 1.3.2",
     "requests >= 2.26.0",
     "rdkit>=2024.3.2",

From 1f6299cd6e8b485d6ed55e1536243c15c3047ca2 Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sat, 1 Mar 2025 15:00:48 -0700
Subject: [PATCH 18/35] switching over to 'serve' script

---
 model_docker_images/inference/Dockerfile | 20 +++++++++++++++-----
 model_docker_images/inference/serve      |  6 ++++++
 2 files changed, 21 insertions(+), 5 deletions(-)
 create mode 100644 model_docker_images/inference/serve

diff --git a/model_docker_images/inference/Dockerfile b/model_docker_images/inference/Dockerfile
index a09da2460..6433484bd 100644
--- a/model_docker_images/inference/Dockerfile
+++ b/model_docker_images/inference/Dockerfile
@@ -9,9 +9,19 @@ COPY requirements.txt /tmp/
 # Install dependencies
 RUN pip install --no-cache-dir -r /tmp/requirements.txt
 
-# Copy your server code
-COPY main.py /app/
-WORKDIR /app
+# Add the serve script
+COPY serve /usr/local/bin/
+RUN chmod +x /usr/local/bin/serve
 
-# Run the API server
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"]
\ No newline at end of file
+# Copy the main.py/entrypoint script
+COPY main.py /opt/program/
+WORKDIR /opt/program
+
+# Make port 8080 available for the web server
+EXPOSE 8080
+
+# Define environment variable
+ENV PYTHONUNBUFFERED=TRUE
+
+# SageMaker will look for this
+CMD ["serve"]
\ No newline at end of file
diff --git a/model_docker_images/inference/serve b/model_docker_images/inference/serve
new file mode 100644
index 000000000..93d3d58fd
--- /dev/null
+++ b/model_docker_images/inference/serve
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# SageMaker expect a 'serve' script to be found in the container which starts the model server.
+
+# Start the FastAPI server using Uvicorn
+exec uvicorn main:app --host 0.0.0.0 --port 8080
\ No newline at end of file

From 1cb2186f58dc17f204f26ea91959cde11f792f0e Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sat, 1 Mar 2025 15:01:30 -0700
Subject: [PATCH 19/35] cleaning up the requirements.txt files for models since
 our new training/inference images include these packages

---
 .../model_scripts/custom_models/chem_info/requirements.txt    | 4 +---
 .../model_scripts/custom_script_example/requirements.txt      | 2 --
 .../model_scripts/light_quant_regression/requirements.txt     | 2 --
 .../model_scripts/light_scikit_learn/requirements.txt         | 4 +---
 4 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/src/workbench/model_scripts/custom_models/chem_info/requirements.txt b/src/workbench/model_scripts/custom_models/chem_info/requirements.txt
index 33ff11c23..68cb66c0f 100644
--- a/src/workbench/model_scripts/custom_models/chem_info/requirements.txt
+++ b/src/workbench/model_scripts/custom_models/chem_info/requirements.txt
@@ -1,4 +1,2 @@
-scikit-learn==1.3.2  # Note: This is the highest version that works with SageMaker/scikit-learn framework
-awswrangler>=3.8.0
-rdkit>=2024.3.2
+rdkit>=2024.9.5
 mordredcommunity>=2.0.6
\ No newline at end of file
diff --git a/src/workbench/model_scripts/custom_script_example/requirements.txt b/src/workbench/model_scripts/custom_script_example/requirements.txt
index 2b1dd27fd..e69de29bb 100644
--- a/src/workbench/model_scripts/custom_script_example/requirements.txt
+++ b/src/workbench/model_scripts/custom_script_example/requirements.txt
@@ -1,2 +0,0 @@
-scikit-learn==1.3.2  # Note: This is the highest version that works with SageMaker/scikit-learn framework
-awswrangler>=3.8.0
diff --git a/src/workbench/model_scripts/light_quant_regression/requirements.txt b/src/workbench/model_scripts/light_quant_regression/requirements.txt
index 25a034855..e69de29bb 100644
--- a/src/workbench/model_scripts/light_quant_regression/requirements.txt
+++ b/src/workbench/model_scripts/light_quant_regression/requirements.txt
@@ -1,2 +0,0 @@
-xgboost==2.0.3
-awswrangler==3.8.0
\ No newline at end of file
diff --git a/src/workbench/model_scripts/light_scikit_learn/requirements.txt b/src/workbench/model_scripts/light_scikit_learn/requirements.txt
index 2a1bb2a2a..cf1b0394e 100644
--- a/src/workbench/model_scripts/light_scikit_learn/requirements.txt
+++ b/src/workbench/model_scripts/light_scikit_learn/requirements.txt
@@ -1,3 +1 @@
-scikit-learn==1.3.2 # Note: This is the highest version that works with SageMaker/scikit-learn framework
-umap-learn
-awswrangler>=3.8.0
\ No newline at end of file
+umap-learn
\ No newline at end of file

From 54e5eb3cda89e5d609d3f8773877737411544298 Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sat, 1 Mar 2025 15:50:00 -0700
Subject: [PATCH 20/35] making the serve script executable

---
 model_docker_images/inference/serve | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 model_docker_images/inference/serve

diff --git a/model_docker_images/inference/serve b/model_docker_images/inference/serve
old mode 100644
new mode 100755

From c2f8a37f10083864dd9ec0787d89c77efe71095e Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sun, 2 Mar 2025 08:40:42 -0700
Subject: [PATCH 21/35] refactoring the training and inference containers

---
 model_docker_images/inference/main.py         | 189 ++++++++++--------
 model_docker_images/tests/test_inference.py   |  41 +++-
 .../training/sagemaker_entrypoint.py          |  58 +++---
 3 files changed, 171 insertions(+), 117 deletions(-)

diff --git a/model_docker_images/inference/main.py b/model_docker_images/inference/main.py
index 1cb104f9c..5e74ad277 100644
--- a/model_docker_images/inference/main.py
+++ b/model_docker_images/inference/main.py
@@ -1,124 +1,145 @@
 from fastapi import FastAPI, Request, Response
 from contextlib import asynccontextmanager
 import os
+import sys
 import json
-import pandas as pd
-from io import StringIO
-import joblib
+import importlib.util
 import logging
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-# Global variables for model and metadata
+# Global variables
 model = None
-model_metadata = None
+inference_module = None
+
+
+def get_inference_script():
+    """Retrieve the entry point script name for SageMaker inference."""
+    # Check SAGEMAKER_PROGRAM first
+    if "SAGEMAKER_PROGRAM" in os.environ:
+        return os.environ["SAGEMAKER_PROGRAM"]
+
+    # For inference containers, check these common locations
+    model_server_config = "/opt/ml/model/model-config.json"
+    if os.path.exists(model_server_config):
+        try:
+            with open(model_server_config, "r") as f:
+                config = json.load(f)
+                if "inference_script" in config:
+                    return config["inference_script"]
+        except Exception as e:
+            print(f"Error reading model-config.json: {e}")
+
+    # Debug available environment variables
+    print("Available environment variables:")
+    for key in os.environ:
+        print(f"  {key}: {os.environ[key]}")
+
+    # Recursively list out all files in /opt/ml
+    print("Contents of /opt/ml:")
+    for root, dirs, files in os.walk("/opt/ml"):
+        for file in files:
+            print(f"  {root}/{file}")
+
+
+def get_model_script():
+    """Retrieve the SAGEMAKER_PROGRAM from environment variable or hyperparameters.json."""
+    if "SAGEMAKER_PROGRAM" in os.environ:
+        return os.environ["SAGEMAKER_PROGRAM"]
+
+    # Look for hyperparameters.json
+    hyperparams_path = "/opt/ml/input/config/hyperparameters.json"
+    if os.path.exists(hyperparams_path):
+        try:
+            with open(hyperparams_path, "r") as f:
+                hyperparams = json.load(f)
+                if "sagemaker_program" in hyperparams:
+                    return hyperparams["sagemaker_program"]
+        except Exception as e:
+            print(f"Error reading hyperparameters.json: {e}")
+
+    # If no program is found, raise an error
+    raise ValueError("SAGEMAKER_PROGRAM not found in environment variables or hyperparameters.json")
 
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """Handle model loading on startup and cleanup on shutdown."""
-    global model, model_metadata
-    model_path = os.environ.get('MODEL_PATH', '/opt/ml/model')
-    model_file = os.path.join(model_path, 'model.joblib')
+    global model, inference_module
+
+    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
+    code_dir = os.environ.get("SM_MODULE_DIR", "/opt/ml/code")
+
+    # Add code_dir to sys.path so that any local utilities can be imported
+    if code_dir not in sys.path:
+        sys.path.insert(0, code_dir)
+    model_script = get_inference_script()
 
     try:
-        logger.info(f"Loading model from {model_path}")
-
-        # Check if model file exists
-        if os.path.exists(model_file):
-            model = joblib.load(model_file)
-            logger.info(f"Model loaded successfully: {type(model)}")
-        else:
-            # Log the error and available files
-            logger.error(f"Model file not found at {model_file}")
-            if os.path.exists(model_path):
-                logger.error(f"Contents of {model_path}: {os.listdir(model_path)}")
-            else:
-                logger.error(f"Model directory {model_path} does not exist")
-
-            # Fail fast - no fallback for production
-            raise FileNotFoundError(f"Required model file not found: {model_file}")
-
-        # Load metadata if available
-        metadata_file = os.path.join(model_path, 'metadata.json')
-        if os.path.exists(metadata_file):
-            with open(metadata_file, 'r') as f:
-                model_metadata = json.load(f)
-            logger.info(f"Loaded model metadata")
-        else:
-            logger.info(f"No metadata found, using default")
-            model_metadata = {'feature_names': None}
+        logger.info(f"Loading model from {model_dir}")
+        logger.info(f"Loading inference code from {code_dir}")
+
+        # Ensure directories exist
+        if not os.path.exists(model_dir):
+            raise FileNotFoundError(f"Model directory not found: {model_dir}")
+        if not os.path.exists(code_dir):
+            raise FileNotFoundError(f"Code directory not found: {code_dir}")
+
+        # List directory contents for debugging
+        logger.info(f"Contents of {model_dir}: {os.listdir(model_dir)}")
+        logger.info(f"Contents of {code_dir}: {os.listdir(code_dir)}")
+
+        # Load the inference module from source_dir
+        entry_point_path = os.path.join(code_dir, model_script)
+        if not os.path.exists(entry_point_path):
+            raise FileNotFoundError(f"Entry point script {model_script} not found in {code_dir}")
+
+        logger.info(f"Importing inference module from {entry_point_path}")
+        spec = importlib.util.spec_from_file_location("inference_module", entry_point_path)
+        inference_module = importlib.util.module_from_spec(spec)
+        sys.modules["inference_module"] = inference_module
+        spec.loader.exec_module(inference_module)
+
+        if not hasattr(inference_module, "model_fn"):
+            raise ImportError(f"Inference module {model_script} does not define model_fn")
+
+        # Load the model using model_fn
+        logger.info("Calling model_fn to load the model")
+        model = inference_module.model_fn(model_dir)
+        logger.info(f"Model loaded successfully: {type(model)}")
 
     except Exception as e:
-        logger.error(f"Error loading model: {e}", exc_info=True)
-        # In production, we don't want to create fallback models
-        # Let the container fail to start
+        logger.error(f"Error initializing model: {e}", exc_info=True)
         raise
 
-    logger.info("Model initialization complete")
     yield
+
     logger.info("Shutting down model server")
 
 
 app = FastAPI(lifespan=lifespan)
 
 
-@app.get('/ping')
+@app.get("/ping")
 def ping():
     """Health check endpoint for SageMaker."""
-    if model is not None:
-        return Response(status_code=200)
-    return Response(status_code=404)
+    return Response(status_code=200 if model else 404)
 
 
-@app.post('/invocations')
+@app.post("/invocations")
 async def invoke(request: Request):
     """Inference endpoint for SageMaker."""
-    content_type = request.headers.get('Content-Type', '')
-    accept_type = request.headers.get('Accept', '')
+    content_type = request.headers.get("Content-Type", "")
+    accept_type = request.headers.get("Accept", "")
 
     try:
-        # Get request body
         body = await request.body()
-
-        # Parse input data based on content type
-        if 'text/csv' in content_type:
-            s = body.decode('utf-8')
-            data = pd.read_csv(StringIO(s), header=None)
-        else:  # Default to JSON
-            json_str = body.decode('utf-8')
-            data_json = json.loads(json_str)
-
-            # Handle different JSON formats
-            if isinstance(data_json, dict) and "instances" in data_json:
-                # Format: {"instances": [[1,2,3], [4,5,6]]}
-                data = pd.DataFrame(data_json["instances"])
-            elif isinstance(data_json, list) and all(isinstance(item, list) for item in data_json):
-                # Format: [[1,2,3], [4,5,6]]
-                data = pd.DataFrame(data_json)
-            else:
-                # Try to convert to DataFrame
-                data = pd.DataFrame(data_json)
-
-        # Make prediction
-        predictions = model.predict(data)
-
-        # Format response based on accept type
-        if 'text/csv' in accept_type:
-            result = pd.DataFrame(predictions).to_csv(header=False, index=False)
-            return Response(content=result, media_type='text/csv')
-        else:  # Default to JSON
-            result = json.dumps({
-                'predictions': predictions.tolist() if hasattr(predictions, 'tolist') else float(predictions)
-            })
-            return Response(content=result, media_type='application/json')
-
+        data = inference_module.input_fn(body, content_type)
+        result = inference_module.predict_fn(data, model)
+        output_data, output_content_type = inference_module.output_fn(result, accept_type)
+        return Response(content=output_data, media_type=output_content_type)
     except Exception as e:
         logger.error(f"Error during inference: {e}", exc_info=True)
-        return Response(
-            content=json.dumps({"error": str(e)}),
-            status_code=500,
-            media_type="application/json"
-        )
+        return Response(content=json.dumps({"error": str(e)}), status_code=500, media_type="application/json")
diff --git a/model_docker_images/tests/test_inference.py b/model_docker_images/tests/test_inference.py
index 30262b4e2..fdef54fd9 100644
--- a/model_docker_images/tests/test_inference.py
+++ b/model_docker_images/tests/test_inference.py
@@ -101,12 +101,22 @@ def deploy(self, instance_type=None, initial_instance_count=1, endpoint_name=Non
 
         # Add the image URI
         cmd.append(self.image_uri)
-
         print(f"Starting inference container: {' '.join(cmd)}")
         self.container_id = subprocess.check_output(cmd).decode('utf-8').strip()
 
-        print(f"Waiting for container to initialize...")
-        time.sleep(5)  # Give it time to start
+        # Print the container ID and its startup logs so early failures are visible
+        print(f"Container ID: {self.container_id}")
+        try:
+            # Give it a moment to start or fail
+            time.sleep(1)
+
+            # Get container logs
+            logs = subprocess.check_output(
+                ["docker", "logs", self.container_id], stderr=subprocess.STDOUT
+            ).decode('utf-8')
+            print(f"Container startup logs:\n{logs}")
+        except Exception as e:
+            print(f"Error getting container logs: {e}")
 
         self.endpoint_url = 'http://localhost:8080'
         return MockEndpoint(self)
@@ -120,6 +130,25 @@ def __init__(self, model):
         self.model = model
         self.url = model.endpoint_url
 
+        # Check container status and logs
+        try:
+            # Get container state
+            inspect_output = subprocess.check_output(
+                ["docker", "inspect", "--format", "{{.State.Status}}", model.container_id]
+            ).decode('utf-8').strip()
+
+            print(f"Container status: {inspect_output}")
+
+            # If not running, get the logs
+            if inspect_output != "running":
+                logs = subprocess.check_output(
+                    ["docker", "logs", model.container_id], stderr=subprocess.STDOUT
+                ).decode('utf-8')
+                print(f"Container logs:\n{logs}")
+                raise RuntimeError("Container failed to start properly")
+        except Exception as e:
+            print(f"Error checking container: {e}")
+
     def predict(self, data, initial_args=None):
         """
         Makes a prediction using the deployed model.
@@ -179,8 +208,10 @@ def delete_endpoint(self):
         """Clean up resources by stopping the container"""
         print(f"Deleting endpoint (stopping container {self.model.container_id})")
         if self.model.container_id:
-            subprocess.run(["docker", "stop", self.model.container_id], check=True)
-            self.model.container_id = None
+            try:
+                subprocess.run(["docker", "stop", self.model.container_id], check=False)
+            except Exception as e:
+                print(f"Error stopping container: {e}")
 
         # Clean up temp directory if needed
         if self.model.temp_dir and os.path.exists(self.model.temp_dir):
diff --git a/model_docker_images/training/sagemaker_entrypoint.py b/model_docker_images/training/sagemaker_entrypoint.py
index 807a82ee3..21bd7f919 100644
--- a/model_docker_images/training/sagemaker_entrypoint.py
+++ b/model_docker_images/training/sagemaker_entrypoint.py
@@ -9,11 +9,8 @@
 from urllib.parse import urlparse
 
 # Set up logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-)
 logger = logging.getLogger('sagemaker-entry-point')
+logger.setLevel(logging.INFO)
 
 
 def download_and_extract_s3(s3_uri, target_dir="/opt/ml/code"):
@@ -31,8 +28,7 @@ def download_and_extract_s3(s3_uri, target_dir="/opt/ml/code"):
 
         os.makedirs(target_dir, exist_ok=True)
         with tarfile.open(local_tar, "r:gz") as tar:
-            tar.extractall(path=target_dir)
-
+            tar.extractall(path=target_dir, numeric_owner=True)
         return target_dir
     except Exception as e:
         logger.error(f"Error downloading from S3: {e}")
@@ -74,7 +70,18 @@ def setup_environment():
 
 
 def main():
-    logger.info("Starting SageMaker container entry point")
+    logger.info("Starting Workbench training container...")
+
+    # Debug available environment variables
+    logger.info("Available environment variables:")
+    for key in os.environ:
+        logger.info(f"  {key}: {os.environ[key]}")
+
+    # Recursively list out all files in /opt/ml
+    logger.info("Contents of /opt/ml:")
+    for root, dirs, files in os.walk("/opt/ml"):
+        for file in files:
+            logger.info(f"  {root}/{file}")
 
     # Load hyperparameters
     hyperparams_path = '/opt/ml/input/config/hyperparameters.json'
@@ -84,46 +91,41 @@ def main():
 
     with open(hyperparams_path, 'r') as f:
         hyperparams = json.load(f)
+    logger.info(f"Hyperparameters: {hyperparams}")
 
-    # Get program name from hyperparameters or environment
+    # Get program name from hyperparameters
     if 'sagemaker_program' in hyperparams:
-        program = hyperparams['sagemaker_program'].strip('"\'')
-        os.environ['SAGEMAKER_PROGRAM'] = program
-    elif 'SAGEMAKER_PROGRAM' in os.environ:
-        program = os.environ['SAGEMAKER_PROGRAM']
+        training_script = hyperparams['sagemaker_program'].strip('"\'')
     else:
-        logger.error("sagemaker_program not found in hyperparameters or environment!")
+        logger.error("sagemaker_program not found in hyperparameters!")
         sys.exit(1)
 
-    logger.info(f"Using program: {program}")
+    logger.info(f"Using training_script: {training_script}")
 
-    # Get source directory
-    submit_dir = "/opt/ml/code"
+    # Get source directory from hyperparameters
     if 'sagemaker_submit_directory' in hyperparams:
-        submit_dir_value = hyperparams['sagemaker_submit_directory'].strip('"\'')
+        code_directory = hyperparams['sagemaker_submit_directory'].strip('"\'')
 
         # Handle S3 vs local path
-        if submit_dir_value.startswith('s3://'):
-            submit_dir = download_and_extract_s3(submit_dir_value)
-        else:
-            submit_dir = submit_dir_value
-            if not os.path.exists(submit_dir):
-                logger.error(f"Local directory not found: {submit_dir}")
-                sys.exit(1)
+        if code_directory.startswith('s3://'):
+            code_directory = download_and_extract_s3(code_directory)
+        elif not os.path.exists(code_directory):
+            logger.error(f"Local code directory not found: {code_directory}")
+            sys.exit(1)
 
     # Install requirements if present
-    install_requirements(os.path.join(submit_dir, "requirements.txt"))
+    install_requirements(os.path.join(code_directory, "requirements.txt"))
 
     # Set up environment variables
     setup_environment()
 
-    # Find entry point script
-    entry_point = os.path.join(submit_dir, program)
+    # Find training script (entry point)
+    entry_point = os.path.join(code_directory, training_script)
     if not os.path.exists(entry_point):
         logger.error(f"Entry point not found: {entry_point}")
         sys.exit(1)
 
-    logger.info(f"Executing: {program}")
+    logger.info(f"Executing: {entry_point}")
 
     # Execute the training script with SageMaker arguments
     cmd = [

From 5f230e38e0d3c6604068771d57d48f49ab10dad4 Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sun, 2 Mar 2025 10:22:04 -0700
Subject: [PATCH 22/35] simplifying the inference entry point

---
 model_docker_images/inference/main.py | 106 ++++++++------------------
 1 file changed, 33 insertions(+), 73 deletions(-)

diff --git a/model_docker_images/inference/main.py b/model_docker_images/inference/main.py
index 5e74ad277..e40ab5a36 100644
--- a/model_docker_images/inference/main.py
+++ b/model_docker_images/inference/main.py
@@ -15,53 +15,21 @@
 inference_module = None
 
 
-def get_inference_script():
-    """Retrieve the entry point script name for SageMaker inference."""
-    # Check SAGEMAKER_PROGRAM first
-    if "SAGEMAKER_PROGRAM" in os.environ:
-        return os.environ["SAGEMAKER_PROGRAM"]
-
-    # For inference containers, check these common locations
-    model_server_config = "/opt/ml/model/model-config.json"
-    if os.path.exists(model_server_config):
-        try:
-            with open(model_server_config, "r") as f:
-                config = json.load(f)
-                if "inference_script" in config:
-                    return config["inference_script"]
-        except Exception as e:
-            print(f"Error reading model-config.json: {e}")
-
-    # Debug available environment variables
-    print("Available environment variables:")
-    for key in os.environ:
-        print(f"  {key}: {os.environ[key]}")
-
-    # Recursively list out all files in /opt/ml
-    print("Contents of /opt/ml:")
-    for root, dirs, files in os.walk("/opt/ml"):
-        for file in files:
-            print(f"  {root}/{file}")
-
-
-def get_model_script():
-    """Retrieve the SAGEMAKER_PROGRAM from environment variable or hyperparameters.json."""
-    if "SAGEMAKER_PROGRAM" in os.environ:
-        return os.environ["SAGEMAKER_PROGRAM"]
-
-    # Look for hyperparameters.json
-    hyperparams_path = "/opt/ml/input/config/hyperparameters.json"
-    if os.path.exists(hyperparams_path):
-        try:
-            with open(hyperparams_path, "r") as f:
-                hyperparams = json.load(f)
-                if "sagemaker_program" in hyperparams:
-                    return hyperparams["sagemaker_program"]
-        except Exception as e:
-            print(f"Error reading hyperparameters.json: {e}")
-
-    # If no program is found, raise an error
-    raise ValueError("SAGEMAKER_PROGRAM not found in environment variables or hyperparameters.json")
+def get_inference_script(model_dir: str) -> str:
+    """Retrieve the inference script name
+
+    Args:
+        model_dir (str): The directory containing the model artifacts
+
+    Returns:
+        str: The name of the inference script
+    """
+
+    # Get the path to the inference-metadata.json file
+    inference_meta_path = os.path.join(model_dir, "inference-metadata.json")
+    with open(inference_meta_path, "r") as f:
+        config = json.load(f)
+        return config["inference_script"]
 
 
 @asynccontextmanager
@@ -69,41 +37,33 @@ async def lifespan(app: FastAPI):
     """Handle model loading on startup and cleanup on shutdown."""
     global model, inference_module
 
+    # Note: SageMaker extracts model.tar.gz into /opt/ml/model, so that
+    #       directory holds both the model artifacts and the inference code
     model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
-    code_dir = os.environ.get("SM_MODULE_DIR", "/opt/ml/code")
+    inference_script = get_inference_script(model_dir)
 
-    # Add code_dir to sys.path so that any local utilities can be imported
-    if code_dir not in sys.path:
-        sys.path.insert(0, code_dir)
-    model_script = get_inference_script()
+    # List directory contents for debugging
+    logger.info(f"Contents of {model_dir}: {os.listdir(model_dir)}")
 
     try:
-        logger.info(f"Loading model from {model_dir}")
-        logger.info(f"Loading inference code from {code_dir}")
-
-        # Ensure directories exist
-        if not os.path.exists(model_dir):
-            raise FileNotFoundError(f"Model directory not found: {model_dir}")
-        if not os.path.exists(code_dir):
-            raise FileNotFoundError(f"Code directory not found: {code_dir}")
-
-        # List directory contents for debugging
-        logger.info(f"Contents of {model_dir}: {os.listdir(model_dir)}")
-        logger.info(f"Contents of {code_dir}: {os.listdir(code_dir)}")
-
-        # Load the inference module from source_dir
-        entry_point_path = os.path.join(code_dir, model_script)
-        if not os.path.exists(entry_point_path):
-            raise FileNotFoundError(f"Entry point script {model_script} not found in {code_dir}")
-
-        logger.info(f"Importing inference module from {entry_point_path}")
-        spec = importlib.util.spec_from_file_location("inference_module", entry_point_path)
+        # Locate the inference script inside model_dir
+        inference_script_path = os.path.join(model_dir, inference_script)
+        if not os.path.exists(inference_script_path):
+            raise FileNotFoundError(f"Inference script not found: {inference_script_path}")
+
+        # Export model_dir via PYTHONPATH (NOTE: only child processes see this; sys.path is unchanged)
+        os.environ["PYTHONPATH"] = f"{model_dir}:{os.environ.get('PYTHONPATH', '')}"
+
+        # Import the inference module
+        logger.info(f"Importing inference module from {inference_script_path}")
+        spec = importlib.util.spec_from_file_location("inference_module", inference_script_path)
         inference_module = importlib.util.module_from_spec(spec)
         sys.modules["inference_module"] = inference_module
         spec.loader.exec_module(inference_module)
 
+        # Check if model_fn is defined
         if not hasattr(inference_module, "model_fn"):
-            raise ImportError(f"Inference module {model_script} does not define model_fn")
+            raise ImportError(f"Inference module {inference_script_path} does not define model_fn")
 
         # Load the model using model_fn
         logger.info("Calling model_fn to load the model")

From cf008ecbafa694a207dff0da264e6d48af30dc4c Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sun, 2 Mar 2025 10:23:37 -0700
Subject: [PATCH 23/35] adding code and metadata to model dir (for pick up by
 inference container)

---
 .../training/sagemaker_entrypoint.py          | 85 +++++++++----------
 1 file changed, 40 insertions(+), 45 deletions(-)

diff --git a/model_docker_images/training/sagemaker_entrypoint.py b/model_docker_images/training/sagemaker_entrypoint.py
index 21bd7f919..19371b01b 100644
--- a/model_docker_images/training/sagemaker_entrypoint.py
+++ b/model_docker_images/training/sagemaker_entrypoint.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 import os
 import sys
+import shutil
 import json
 import tarfile
 import subprocess
@@ -9,8 +10,8 @@
 from urllib.parse import urlparse
 
 # Set up logging
-logger = logging.getLogger('sagemaker-entry-point')
-logger.setLevel(logging.INFO)
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 
 def download_and_extract_s3(s3_uri, target_dir="/opt/ml/code"):
@@ -51,38 +52,26 @@ def install_requirements(requirements_path):
         logger.info(f"No requirements file found at {requirements_path}")
 
 
-def setup_environment():
-    """Set up SageMaker environment variables."""
-    env_vars = {
-        "SM_MODEL_DIR": "/opt/ml/model",
-        "SM_OUTPUT_DATA_DIR": "/opt/ml/output/data",
-        "SM_CHANNEL_TRAIN": "/opt/ml/input/data/train",
-        "SM_OUTPUT_DIR": "/opt/ml/output",
-        "SM_INPUT_DIR": "/opt/ml/input",
-        "SM_INPUT_CONFIG_DIR": "/opt/ml/input/config"
-    }
+def include_code_and_meta_for_inference(model_dir, code_dir, entry_point):
+    """Include code and some metadata for the inference container"""
+    logger.info("Including code and metadata for inference...")
 
-    for key, value in env_vars.items():
-        os.environ[key] = str(value)
-        os.makedirs(value, exist_ok=True)
+    # Create inference metadata file
+    inference_metadata = {"inference_script": entry_point}
 
-    logger.info(f"SageMaker environment initialized.")
+    # Write metadata to model directory
+    metadata_path = os.path.join(model_dir, "inference-metadata.json")
+    with open(metadata_path, "w") as fp:
+        json.dump(inference_metadata, fp)
+
+    # Copy code to model directory
+    for file in os.listdir(code_dir):
+        shutil.copy2(os.path.join(code_dir, file), model_dir)
 
 
 def main():
     logger.info("Starting Workbench training container...")
 
-    # Debug available environment variables
-    logger.info("Available environment variables:")
-    for key in os.environ:
-        logger.info(f"  {key}: {os.environ[key]}")
-
-    # Recursively list out all files in /opt/ml
-    logger.info("Contents of /opt/ml:")
-    for root, dirs, files in os.walk("/opt/ml"):
-        for file in files:
-            logger.info(f"  {root}/{file}")
-
     # Load hyperparameters
     hyperparams_path = '/opt/ml/input/config/hyperparameters.json'
     if not os.path.exists(hyperparams_path):
@@ -116,29 +105,35 @@ def main():
     # Install requirements if present
     install_requirements(os.path.join(code_directory, "requirements.txt"))
 
-    # Set up environment variables
-    setup_environment()
-
-    # Find training script (entry point)
-    entry_point = os.path.join(code_directory, training_script)
-    if not os.path.exists(entry_point):
-        logger.error(f"Entry point not found: {entry_point}")
+    # Find training script
+    training_script_path = os.path.join(code_directory, training_script)
+    if not os.path.exists(training_script_path):
+        logger.error(f"Training script not found: {training_script_path}")
         sys.exit(1)
 
-    logger.info(f"Executing: {entry_point}")
+    logger.info(f"Executing: {training_script_path}")
 
-    # Execute the training script with SageMaker arguments
-    cmd = [
-        sys.executable, entry_point,
-        "--model-dir", os.environ["SM_MODEL_DIR"],
-        "--output-data-dir", os.environ["SM_OUTPUT_DATA_DIR"],
-        "--train", os.environ["SM_CHANNEL_TRAIN"]
-    ]
+    # Add the code directory to the Python path
+    os.environ["PYTHONPATH"] = f"{code_directory}:{os.environ.get('PYTHONPATH', '')}"
 
+    # Call the training script and then include code and meta for inference
     try:
-        os.execv(sys.executable, cmd)
-    except Exception as e:
-        logger.error(f"Failed to execute entry point: {e}")
+        subprocess.check_call([
+            sys.executable, training_script_path,
+            "--model-dir", os.environ.get("SM_MODEL_DIR", "/opt/ml/model"),
+            "--output-data-dir", os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data"),
+            "--train", os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"),
+        ])
+
+        # After training completes, include code and meta in the model.tar.gz
+        include_code_and_meta_for_inference(
+            model_dir=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"),
+            code_dir=code_directory,
+            entry_point=training_script
+        )
+
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Failed to execute training script: {e}")
         sys.exit(1)
 
 

From fefca7805fecf7421c5240fd989d89f0064eee74 Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sun, 2 Mar 2025 10:24:41 -0700
Subject: [PATCH 24/35] changing script args so they don't fail if ENV vars
 aren't set

---
 .../custom_models/chem_info/molecular_descriptors.py   |  9 ++++++---
 .../custom_models/chem_info/morgan_fingerprints.py     |  9 ++++++---
 .../custom_models/chem_info/tautomerize.py             |  9 ++++++---
 .../custom_script_example/custom_model_script.py       |  9 ++++++---
 .../light_quant_regression/quant_regression.template   |  8 ++++----
 .../light_scikit_learn/scikit_learn.template           | 10 ++++++----
 .../model_scripts/light_xgb_model/xgb_model.template   |  9 +++++----
 7 files changed, 39 insertions(+), 24 deletions(-)

diff --git a/src/workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py b/src/workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py
index c71e81934..8a6c248b5 100644
--- a/src/workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py
+++ b/src/workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py
@@ -22,10 +22,13 @@
 # and save the model artifacts to the model directory.
 #
 if __name__ == "__main__":
+    # Script arguments for input/output directories
     parser = argparse.ArgumentParser()
-    parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
-    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
-    parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
+    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+    parser.add_argument(
+        "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
+    )
     args = parser.parse_args()
 
     # This model doesn't get trained, it just a feature creation 'model'
diff --git a/src/workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py b/src/workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py
index 4fede9442..a3889715a 100644
--- a/src/workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py
+++ b/src/workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py
@@ -24,10 +24,13 @@
 # and save the model artifacts to the model directory.
 #
 if __name__ == "__main__":
+    # Script arguments for input/output directories
     parser = argparse.ArgumentParser()
-    parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
-    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
-    parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
+    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+    parser.add_argument(
+        "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
+    )
     args = parser.parse_args()
 
     # This model doesn't get trained, it just a feature creation 'model'
diff --git a/src/workbench/model_scripts/custom_models/chem_info/tautomerize.py b/src/workbench/model_scripts/custom_models/chem_info/tautomerize.py
index 72c2afe34..16e479a61 100644
--- a/src/workbench/model_scripts/custom_models/chem_info/tautomerize.py
+++ b/src/workbench/model_scripts/custom_models/chem_info/tautomerize.py
@@ -23,10 +23,13 @@
 # This section (__main__) is where SageMaker will execute the job and save the model artifacts.
 #
 if __name__ == "__main__":
+    # Script arguments for input/output directories
     parser = argparse.ArgumentParser()
-    parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
-    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
-    parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
+    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+    parser.add_argument(
+        "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
+    )
     args = parser.parse_args()
 
     # This model doesn't get trained; it's a feature processing 'model'
diff --git a/src/workbench/model_scripts/custom_script_example/custom_model_script.py b/src/workbench/model_scripts/custom_script_example/custom_model_script.py
index 3e2a8db0a..e6492ba3d 100644
--- a/src/workbench/model_scripts/custom_script_example/custom_model_script.py
+++ b/src/workbench/model_scripts/custom_script_example/custom_model_script.py
@@ -48,10 +48,13 @@ def expand_proba_column(df: pd.DataFrame, class_labels: list) -> pd.DataFrame:
 # and save the model artifacts to the model directory.
 #
 if __name__ == "__main__":
+    # Script arguments for input/output directories
     parser = argparse.ArgumentParser()
-    parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
-    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
-    parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
+    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+    parser.add_argument(
+        "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
+    )
     args = parser.parse_args()
 
     # Load the training data
diff --git a/src/workbench/model_scripts/light_quant_regression/quant_regression.template b/src/workbench/model_scripts/light_quant_regression/quant_regression.template
index 8ea2a6e6d..109ca190a 100644
--- a/src/workbench/model_scripts/light_quant_regression/quant_regression.template
+++ b/src/workbench/model_scripts/light_quant_regression/quant_regression.template
@@ -86,13 +86,13 @@ if __name__ == "__main__":
     quantiles = [0.05, 0.25, 0.50, 0.75, 0.95]
     q_models = {}
 
-    # Sagemaker specific arguments. Defaults are set in the environment variables.
+    # Script arguments for input/output directories
     parser = argparse.ArgumentParser()
+    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
     parser.add_argument(
-        "--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]
+        "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
     )
-    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
-    parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
     args = parser.parse_args()
 
     # Read the training data into DataFrames
diff --git a/src/workbench/model_scripts/light_scikit_learn/scikit_learn.template b/src/workbench/model_scripts/light_scikit_learn/scikit_learn.template
index e98f752ce..f0deaf1d4 100644
--- a/src/workbench/model_scripts/light_scikit_learn/scikit_learn.template
+++ b/src/workbench/model_scripts/light_scikit_learn/scikit_learn.template
@@ -89,11 +89,13 @@ if __name__ == "__main__":
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
     validation_split = 0.2
 
-    # SageMaker arguments for input/output directories
+    # Script arguments for input/output directories
     parser = argparse.ArgumentParser()
-    parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
-    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
-    parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
+    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
+    parser.add_argument(
+        "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
+    )
     args = parser.parse_args()
 
     # Load training data from the specified directory
diff --git a/src/workbench/model_scripts/light_xgb_model/xgb_model.template b/src/workbench/model_scripts/light_xgb_model/xgb_model.template
index 9c53a4d90..e0a7fc9c0 100644
--- a/src/workbench/model_scripts/light_xgb_model/xgb_model.template
+++ b/src/workbench/model_scripts/light_xgb_model/xgb_model.template
@@ -131,15 +131,16 @@ if __name__ == "__main__":
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
     validation_split = 0.2
 
-    # Sagemaker specific arguments. Defaults are set in the environment variables.
+    # Script arguments for input/output directories
     parser = argparse.ArgumentParser()
+    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
     parser.add_argument(
-        "--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]
+        "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
     )
-    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
-    parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
     args = parser.parse_args()
 
+
     # Read the training data into DataFrames
     training_files = [
         os.path.join(args.train, file)

From 951511ba4dd58c684e5ddb92bd81d59c9c7a8ec7 Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sun, 2 Mar 2025 10:41:15 -0700
Subject: [PATCH 25/35] changing script args so they don't fail if ENV vars
 aren't set

---
 model_docker_images/tests/example_model_script.py      | 10 +++++-----
 .../custom_script_example/custom_model_script.py       |  2 +-
 .../light_quant_regression/quant_regression.template   |  2 +-
 .../light_scikit_learn/scikit_learn.template           |  2 +-
 .../model_scripts/light_xgb_model/xgb_model.template   |  2 +-
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/model_docker_images/tests/example_model_script.py b/model_docker_images/tests/example_model_script.py
index bb736ac7c..6a2569c16 100644
--- a/model_docker_images/tests/example_model_script.py
+++ b/model_docker_images/tests/example_model_script.py
@@ -131,13 +131,13 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     train_all_data = TEMPLATE_PARAMS["train_all_data"]
     validation_split = 0.2
 
-    # Sagemaker specific arguments. Defaults are set in the environment variables.
+    # Script arguments for input/output directories
     parser = argparse.ArgumentParser()
+    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"))
     parser.add_argument(
-        "--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]
+        "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data")
     )
-    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
-    parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
     args = parser.parse_args()
 
     # Read the training data into DataFrames
@@ -342,7 +342,7 @@ def predict_fn(df, model) -> pd.DataFrame:
     """
 
     # Grab our feature columns (from training)
-    model_dir = os.environ["SM_MODEL_DIR"]
+    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
         model_features = json.load(fp)
     print(f"Model Features: {model_features}")
diff --git a/src/workbench/model_scripts/custom_script_example/custom_model_script.py b/src/workbench/model_scripts/custom_script_example/custom_model_script.py
index e6492ba3d..c36d4ff15 100644
--- a/src/workbench/model_scripts/custom_script_example/custom_model_script.py
+++ b/src/workbench/model_scripts/custom_script_example/custom_model_script.py
@@ -147,7 +147,7 @@ def output_fn(output_df, accept_type):
 
 # Prediction function
 def predict_fn(df, model):
-    model_dir = os.environ["SM_MODEL_DIR"]
+    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
         model_features = json.load(fp)
 
diff --git a/src/workbench/model_scripts/light_quant_regression/quant_regression.template b/src/workbench/model_scripts/light_quant_regression/quant_regression.template
index 109ca190a..f638c5f75 100644
--- a/src/workbench/model_scripts/light_quant_regression/quant_regression.template
+++ b/src/workbench/model_scripts/light_quant_regression/quant_regression.template
@@ -280,7 +280,7 @@ def predict_fn(df, models) -> pd.DataFrame:
     """
 
     # Grab our feature columns (from training)
-    model_dir = os.environ["SM_MODEL_DIR"]
+    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
         model_features = json.load(fp)
     print(f"Model Features: {model_features}")
diff --git a/src/workbench/model_scripts/light_scikit_learn/scikit_learn.template b/src/workbench/model_scripts/light_scikit_learn/scikit_learn.template
index f0deaf1d4..f79565947 100644
--- a/src/workbench/model_scripts/light_scikit_learn/scikit_learn.template
+++ b/src/workbench/model_scripts/light_scikit_learn/scikit_learn.template
@@ -244,7 +244,7 @@ def output_fn(output_df, accept_type):
 
 def predict_fn(df, model):
     """Make predictions or apply transformations using the model and return the DataFrame with results."""
-    model_dir = os.environ["SM_MODEL_DIR"]
+    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
 
     # Load feature columns from the saved file
     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
diff --git a/src/workbench/model_scripts/light_xgb_model/xgb_model.template b/src/workbench/model_scripts/light_xgb_model/xgb_model.template
index e0a7fc9c0..f02fca231 100644
--- a/src/workbench/model_scripts/light_xgb_model/xgb_model.template
+++ b/src/workbench/model_scripts/light_xgb_model/xgb_model.template
@@ -340,7 +340,7 @@ def predict_fn(df, model) -> pd.DataFrame:
     """
 
     # Grab our feature columns (from training)
-    model_dir = os.environ["SM_MODEL_DIR"]
+    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
     with open(os.path.join(model_dir, "feature_columns.json")) as fp:
         model_features = json.load(fp)
     print(f"Model Features: {model_features}")

From e56fe74ef5a157c2d7b2a6469749c79a7c26752e Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sun, 2 Mar 2025 11:05:49 -0700
Subject: [PATCH 26/35] changing logic for copying code files/directories

---
 model_docker_images/training/sagemaker_entrypoint.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/model_docker_images/training/sagemaker_entrypoint.py b/model_docker_images/training/sagemaker_entrypoint.py
index 19371b01b..2fa251bbc 100644
--- a/model_docker_images/training/sagemaker_entrypoint.py
+++ b/model_docker_images/training/sagemaker_entrypoint.py
@@ -64,9 +64,10 @@ def include_code_and_meta_for_inference(model_dir, code_dir, entry_point):
     with open(metadata_path, "w") as fp:
         json.dump(inference_metadata, fp)
 
-    # Copy code to model directory
-    for file in os.listdir(code_dir):
-        shutil.copy2(os.path.join(code_dir, file), model_dir)
+    # Copy code to model directory, copy ALL files and directories recursively (except __pycache__)
+    for item in os.listdir(code_dir):
+        if item != "__pycache__":
+            shutil.copytree(os.path.join(code_dir, item), os.path.join(model_dir, item))
 
 
 def main():

From f4242221a4f745e8f8f68de79c2593ebcba2ad5a Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sun, 2 Mar 2025 11:27:20 -0700
Subject: [PATCH 27/35] PYTHONPATH is only read at startup, so use sys.path

---
 model_docker_images/inference/main.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/model_docker_images/inference/main.py b/model_docker_images/inference/main.py
index e40ab5a36..849d79fd1 100644
--- a/model_docker_images/inference/main.py
+++ b/model_docker_images/inference/main.py
@@ -51,8 +51,8 @@ async def lifespan(app: FastAPI):
         if not os.path.exists(inference_script_path):
             raise FileNotFoundError(f"Inference script not found: {inference_script_path}")
 
-        # Add the code directory to the Python path
-        os.environ["PYTHONPATH"] = f"{model_dir}:{os.environ.get('PYTHONPATH', '')}"
+        # Ensure the model directory is in the Python path
+        sys.path.insert(0, model_dir)
 
         # Import the inference module
         logger.info(f"Importing inference module from {inference_script_path}")

From 56a59ae9371afbe23b64d835b2d0dad0c03bdebd Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sun, 2 Mar 2025 11:28:51 -0700
Subject: [PATCH 28/35] fixing the file/dir copy from code to model dir

---
 model_docker_images/training/sagemaker_entrypoint.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/model_docker_images/training/sagemaker_entrypoint.py b/model_docker_images/training/sagemaker_entrypoint.py
index 2fa251bbc..bca57544f 100644
--- a/model_docker_images/training/sagemaker_entrypoint.py
+++ b/model_docker_images/training/sagemaker_entrypoint.py
@@ -65,9 +65,13 @@ def include_code_and_meta_for_inference(model_dir, code_dir, entry_point):
         json.dump(inference_metadata, fp)
 
     # Copy code to model directory, copy ALL files and directories recursively (except __pycache__)
+    # Also list all files/directories that are being copied
     for item in os.listdir(code_dir):
-        if item != "__pycache__":
-            shutil.copytree(os.path.join(code_dir, item), os.path.join(model_dir, item))
+        if item == "__pycache__":
+            continue
+        src, dst = os.path.join(code_dir, item), os.path.join(model_dir, item)
+        shutil.copytree(src, dst, dirs_exist_ok=True) if os.path.isdir(src) else shutil.copy2(src, dst)
+        logger.info(f"Copied: {src} -> {dst}")
 
 
 def main():

From 8b7af3351aa68e3cb02782b0cee89e08517b8b7f Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sun, 2 Mar 2025 11:38:53 -0700
Subject: [PATCH 29/35] flake8/linter cleanup

---
 .../tests/example_model_script.py             | 31 +++---
 model_docker_images/tests/test_inference.py   | 95 ++++++++-----------
 model_docker_images/tests/test_training.py    | 53 ++++++-----
 .../training/sagemaker_entrypoint.py          | 40 ++++----
 src/workbench/core/artifacts/model_core.py    |  2 -
 5 files changed, 110 insertions(+), 111 deletions(-)

diff --git a/model_docker_images/tests/example_model_script.py b/model_docker_images/tests/example_model_script.py
index 6a2569c16..11a1d0767 100644
--- a/model_docker_images/tests/example_model_script.py
+++ b/model_docker_images/tests/example_model_script.py
@@ -2,9 +2,18 @@
 TEMPLATE_PARAMS = {
     "model_type": "regressor",
     "target_column": "class_number_of_rings",
-    "feature_list": ['length', 'diameter', 'height', 'whole_weight', 'shucked_weight', 'viscera_weight', 'shell_weight', 'auto_id'],
+    "feature_list": [
+        "length",
+        "diameter",
+        "height",
+        "whole_weight",
+        "shucked_weight",
+        "viscera_weight",
+        "shell_weight",
+        "auto_id",
+    ],
     "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/training/abalone-regression",
-    "train_all_data": False
+    "train_all_data": False,
 }
 
 # Imports for XGB Model
@@ -141,11 +150,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     args = parser.parse_args()
 
     # Read the training data into DataFrames
-    training_files = [
-        os.path.join(args.train, file)
-        for file in os.listdir(args.train)
-        if file.endswith(".csv")
-    ]
+    training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")]
     print(f"Training Files: {training_files}")
 
     # Combine files and read them all into a single pandas dataframe
@@ -172,9 +177,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
     else:
         # Just do a random training Split
         print("WARNING: No training column found, splitting data with random state=42")
-        df_train, df_val = train_test_split(
-            all_df, test_size=validation_split, random_state=42
-        )
+        df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42)
     print(f"FIT/TRAIN: {df_train.shape}")
     print(f"VALIDATION: {df_val.shape}")
 
@@ -233,9 +236,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p
         label_names = label_encoder.classes_
 
         # Calculate various model performance metrics
-        scores = precision_recall_fscore_support(
-            df_val[target], preds, average=None, labels=label_names
-        )
+        scores = precision_recall_fscore_support(df_val[target], preds, average=None, labels=label_names)
 
         # Put the scores into a dataframe
         score_df = pd.DataFrame(
@@ -289,7 +290,9 @@ def model_fn(model_dir):
     model_path = os.path.join(model_dir, "xgb_model.json")
     with open(model_path, "r") as f:
         model_json = json.load(f)
-    saved_model_type = json.loads(model_json.get('learner').get('attributes').get('scikit_learn')).get('_estimator_type')
+    saved_model_type = json.loads(model_json.get("learner").get("attributes").get("scikit_learn")).get(
+        "_estimator_type"
+    )
     if saved_model_type == "classifier":
         model = xgb.XGBClassifier()
     elif saved_model_type == "regressor":
diff --git a/model_docker_images/tests/test_inference.py b/model_docker_images/tests/test_inference.py
index fdef54fd9..8520f1b6e 100644
--- a/model_docker_images/tests/test_inference.py
+++ b/model_docker_images/tests/test_inference.py
@@ -66,20 +66,17 @@ def deploy(self, instance_type=None, initial_instance_count=1, endpoint_name=Non
             import xgboost as xgb
 
             # Train a simple model
-            model = xgb.XGBRegressor(objective='reg:squarederror')
+            model = xgb.XGBRegressor(objective="reg:squarederror")
             X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
             y = np.array([10, 20, 30])
             model.fit(X, y)
 
             # Save the model
-            joblib.dump(model, os.path.join(model_dir, 'model.joblib'))
+            joblib.dump(model, os.path.join(model_dir, "model.joblib"))
 
             # Save metadata
-            with open(os.path.join(model_dir, 'metadata.json'), 'w') as f:
-                json.dump({
-                    'feature_names': ['feature1', 'feature2', 'feature3'],
-                    'model_type': 'regression'
-                }, f)
+            with open(os.path.join(model_dir, "metadata.json"), "w") as f:
+                json.dump({"feature_names": ["feature1", "feature2", "feature3"], "model_type": "regression"}, f)
 
             self.model_data = model_dir
         else:
@@ -88,21 +85,27 @@ def deploy(self, instance_type=None, initial_instance_count=1, endpoint_name=Non
 
         # Start the container
         cmd = [
-            "docker", "run", "-d", "--rm",
-            "-p", "8080:8080",
-            "-v", f"{model_dir}:/opt/ml/model",
-            "-e", "MODEL_PATH=/opt/ml/model",
+            "docker",
+            "run",
+            "-d",
+            "--rm",
+            "-p",
+            "8080:8080",
+            "-v",
+            f"{model_dir}:/opt/ml/model",
+            "-e",
+            "MODEL_PATH=/opt/ml/model",
         ]
 
         # Add platform flag for Mac M1/M2/M3 users
-        if os.uname().machine == 'arm64':
+        if os.uname().machine == "arm64":
             cmd.insert(2, "--platform")
             cmd.insert(3, "linux/amd64")
 
         # Add the image URI
         cmd.append(self.image_uri)
         print(f"Starting inference container: {' '.join(cmd)}")
-        self.container_id = subprocess.check_output(cmd).decode('utf-8').strip()
+        self.container_id = subprocess.check_output(cmd).decode("utf-8").strip()
 
         # Add this block immediately after starting the container
         print(f"Container ID: {self.container_id}")
@@ -111,14 +114,14 @@ def deploy(self, instance_type=None, initial_instance_count=1, endpoint_name=Non
             time.sleep(1)
 
             # Get container logs
-            logs = subprocess.check_output(
-                ["docker", "logs", self.container_id], stderr=subprocess.STDOUT
-            ).decode('utf-8')
+            logs = subprocess.check_output(["docker", "logs", self.container_id], stderr=subprocess.STDOUT).decode(
+                "utf-8"
+            )
             print(f"Container startup logs:\n{logs}")
         except Exception as e:
             print(f"Error getting container logs: {e}")
 
-        self.endpoint_url = 'http://localhost:8080'
+        self.endpoint_url = "http://localhost:8080"
         return MockEndpoint(self)
 
 
@@ -133,17 +136,19 @@ def __init__(self, model):
         # Check container status and logs
         try:
             # Get container state
-            inspect_output = subprocess.check_output(
-                ["docker", "inspect", "--format", "{{.State.Status}}", model.container_id]
-            ).decode('utf-8').strip()
+            inspect_output = (
+                subprocess.check_output(["docker", "inspect", "--format", "{{.State.Status}}", model.container_id])
+                .decode("utf-8")
+                .strip()
+            )
 
             print(f"Container status: {inspect_output}")
 
             # If not running, get the logs
             if inspect_output != "running":
-                logs = subprocess.check_output(
-                    ["docker", "logs", model.container_id], stderr=subprocess.STDOUT
-                ).decode('utf-8')
+                logs = subprocess.check_output(["docker", "logs", model.container_id], stderr=subprocess.STDOUT).decode(
+                    "utf-8"
+                )
                 print(f"Container logs:\n{logs}")
                 raise RuntimeError("Container failed to start properly")
         except Exception as e:
@@ -161,10 +166,10 @@ def predict(self, data, initial_args=None):
             The prediction result
         """
         # Default to first registered content type
-        content_type = self.model.content_types[0] if hasattr(self.model, 'content_types') else 'application/json'
+        content_type = self.model.content_types[0] if hasattr(self.model, "content_types") else "application/json"
 
         # Format the data according to content type
-        if content_type == 'text/csv':
+        if content_type == "text/csv":
             if isinstance(data, pd.DataFrame):
                 payload = data.to_csv(header=False, index=False)
             elif isinstance(data, (list, np.ndarray)):
@@ -174,26 +179,22 @@ def predict(self, data, initial_args=None):
         else:
             # Default to JSON
             if isinstance(data, pd.DataFrame):
-                payload = data.to_json(orient='records')
+                payload = data.to_json(orient="records")
             elif isinstance(data, (list, np.ndarray)):
-                payload = json.dumps({"instances": data.tolist() if hasattr(data, 'tolist') else data})
+                payload = json.dumps({"instances": data.tolist() if hasattr(data, "tolist") else data})
             else:
                 payload = json.dumps(data)
 
         # Send the request to the container
         try:
-            response = requests.post(
-                f"{self.url}/invocations",
-                data=payload,
-                headers={"Content-Type": content_type}
-            )
+            response = requests.post(f"{self.url}/invocations", data=payload, headers={"Content-Type": content_type})
 
             # Check for errors
             if response.status_code != 200:
                 raise Exception(f"Prediction failed with status code {response.status_code}: {response.text}")
 
             # Parse response based on response type
-            if hasattr(self.model, 'response_types') and 'text/csv' in self.model.response_types:
+            if hasattr(self.model, "response_types") and "text/csv" in self.model.response_types:
                 # Parse CSV response
                 return pd.read_csv(StringIO(response.text), header=None)
             else:
@@ -226,10 +227,7 @@ def test_csv_inference(endpoint, test_data=None):
 
     if test_data is None:
         # Create sample test data
-        test_data = pd.DataFrame([
-            [1.0, 2.0, 3.0],
-            [4.0, 5.0, 6.0]
-        ])
+        test_data = pd.DataFrame([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
 
     try:
         response = endpoint.predict(test_data)
@@ -247,10 +245,7 @@ def test_json_inference(endpoint, test_data=None):
 
     if test_data is None:
         # Create sample test data - use list of lists of floats
-        test_data = [
-            [1.0, 2.0, 3.0],
-            [4.0, 5.0, 6.0]
-        ]
+        test_data = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
 
     try:
         response = endpoint.predict(test_data)
@@ -282,7 +277,9 @@ def test_ping_endpoint(url):
 def main():
     """Run the test using MockModel and MockEndpoint"""
     parser = argparse.ArgumentParser(description="Test SageMaker inference container")
-    parser.add_argument("--image", type=str, default="aws-ml-images/py312-sklearn-xgb-inference:0.1", help="Inference image name:tag")
+    parser.add_argument(
+        "--image", type=str, default="aws-ml-images/py312-sklearn-xgb-inference:0.1", help="Inference image name:tag"
+    )
     parser.add_argument("--model-dir", type=str, default=None, help="Path to model directory (optional)")
     args = parser.parse_args()
 
@@ -295,11 +292,7 @@ def main():
 
     try:
         # Create and deploy the model
-        model = MockModel(
-            image_uri=args.image,
-            model_data=args.model_dir,
-            role="mock-role"
-        )
+        model = MockModel(image_uri=args.image, model_data=args.model_dir, role="mock-role")
 
         # Register the model
         model.register(
@@ -307,15 +300,11 @@ def main():
             response_types=["text/csv", "application/json"],
             inference_instances=["ml.t2.medium"],
             transform_instances=["ml.m5.large"],
-            description="Test model"
+            description="Test model",
         )
 
         # Deploy the model
-        endpoint = model.deploy(
-            instance_type="local",
-            initial_instance_count=1,
-            endpoint_name="test-endpoint"
-        )
+        endpoint = model.deploy(instance_type="local", initial_instance_count=1, endpoint_name="test-endpoint")
 
         # Test the /ping endpoint
         ping_success = test_ping_endpoint(endpoint.url)
diff --git a/model_docker_images/tests/test_training.py b/model_docker_images/tests/test_training.py
index 74562bf03..f6c64c3ff 100644
--- a/model_docker_images/tests/test_training.py
+++ b/model_docker_images/tests/test_training.py
@@ -30,7 +30,7 @@ def fit(self, inputs, job_name=None, logs=True):
             print(f"Created test environment at: {self.temp_dir}")
 
             # Create directories
-            for path in ['input/data/train', 'input/config', 'model', 'output/data', 'code']:
+            for path in ["input/data/train", "input/config", "model", "output/data", "code"]:
                 os.makedirs(f"{self.temp_dir}/{path}", exist_ok=True)
 
             # Copy data files
@@ -57,7 +57,7 @@ def fit(self, inputs, job_name=None, logs=True):
             all_hyperparams = {
                 **self.hyperparameters,
                 "sagemaker_program": self.entry_point,
-                "sagemaker_submit_directory": "/opt/ml/code"
+                "sagemaker_submit_directory": "/opt/ml/code",
             }
 
             with open(f"{self.temp_dir}/input/config/hyperparameters.json", "w") as f:
@@ -65,20 +65,30 @@ def fit(self, inputs, job_name=None, logs=True):
 
             # Run the container
             cmd = [
-                "docker", "run", "--rm",
-                "-v", f"{self.temp_dir}/input:/opt/ml/input",
-                "-v", f"{self.temp_dir}/model:/opt/ml/model",
-                "-v", f"{self.temp_dir}/output:/opt/ml/output",
-                "-v", f"{self.temp_dir}/code:/opt/ml/code",
-                "-e", f"SAGEMAKER_PROGRAM={self.entry_point}",
-                "-e", "SM_MODEL_DIR=/opt/ml/model",
-                "-e", "SM_OUTPUT_DATA_DIR=/opt/ml/output/data",
-                "-e", "SM_CHANNEL_TRAIN=/opt/ml/input/data/train",
-                self.image_uri
+                "docker",
+                "run",
+                "--rm",
+                "-v",
+                f"{self.temp_dir}/input:/opt/ml/input",
+                "-v",
+                f"{self.temp_dir}/model:/opt/ml/model",
+                "-v",
+                f"{self.temp_dir}/output:/opt/ml/output",
+                "-v",
+                f"{self.temp_dir}/code:/opt/ml/code",
+                "-e",
+                f"SAGEMAKER_PROGRAM={self.entry_point}",
+                "-e",
+                "SM_MODEL_DIR=/opt/ml/model",
+                "-e",
+                "SM_OUTPUT_DATA_DIR=/opt/ml/output/data",
+                "-e",
+                "SM_CHANNEL_TRAIN=/opt/ml/input/data/train",
+                self.image_uri,
             ]
 
             # Add platform flag for Mac M1/M2/M3 users
-            if os.uname().machine == 'arm64':
+            if os.uname().machine == "arm64":
                 cmd.insert(2, "--platform")
                 cmd.insert(3, "linux/amd64")
 
@@ -119,7 +129,9 @@ def cleanup(self):
 def main():
     """Run the test using a MockEstimator"""
     parser = argparse.ArgumentParser(description="Test SageMaker training container")
-    parser.add_argument("--image", type=str, default="aws-ml-images/py312-sklearn-xgb-training:0.1", help="Training image name:tag")
+    parser.add_argument(
+        "--image", type=str, default="aws-ml-images/py312-sklearn-xgb-training:0.1", help="Training image name:tag"
+    )
     parser.add_argument("--entry-point", type=str, default="example_model_script.py", help="Training script name")
     parser.add_argument("--source-dir", type=str, default="tests/", help="Directory containing training scripts")
     parser.add_argument("--data", type=str, default="tests/data/abalone_sm.csv", help="Training data path")
@@ -135,17 +147,10 @@ def main():
     print(f"Testing with image {args.image}, script {args.entry_point}")
 
     # Create and run the estimator
-    estimator = MockEstimator(
-        image_uri=args.image,
-        entry_point=args.entry_point,
-        source_dir=source_dir
-    )
+    estimator = MockEstimator(image_uri=args.image, entry_point=args.entry_point, source_dir=source_dir)
 
     try:
-        estimator.fit(
-            inputs={"train": data_path},
-            job_name="mock-training-job"
-        )
+        estimator.fit(inputs={"train": data_path}, job_name="mock-training-job")
         print("✅ Training completed successfully")
     except Exception as e:
         print(f"❌ Training failed: {e}")
@@ -155,4 +160,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/model_docker_images/training/sagemaker_entrypoint.py b/model_docker_images/training/sagemaker_entrypoint.py
index bca57544f..ee70e355b 100644
--- a/model_docker_images/training/sagemaker_entrypoint.py
+++ b/model_docker_images/training/sagemaker_entrypoint.py
@@ -41,9 +41,7 @@ def install_requirements(requirements_path):
     if os.path.exists(requirements_path):
         logger.info(f"Installing dependencies from {requirements_path}...")
         try:
-            subprocess.check_call([
-                sys.executable, "-m", "pip", "install", "-r", requirements_path
-            ])
+            subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", requirements_path])
             logger.info("Requirements installed successfully.")
         except subprocess.CalledProcessError as e:
             logger.error(f"Error installing requirements: {e}")
@@ -78,18 +76,18 @@ def main():
     logger.info("Starting Workbench training container...")
 
     # Load hyperparameters
-    hyperparams_path = '/opt/ml/input/config/hyperparameters.json'
+    hyperparams_path = "/opt/ml/input/config/hyperparameters.json"
     if not os.path.exists(hyperparams_path):
         logger.error("hyperparameters.json not found!")
         sys.exit(1)
 
-    with open(hyperparams_path, 'r') as f:
+    with open(hyperparams_path, "r") as f:
         hyperparams = json.load(f)
     logger.info(f"Hyperparameters: {hyperparams}")
 
     # Get program name from hyperparameters
-    if 'sagemaker_program' in hyperparams:
-        training_script = hyperparams['sagemaker_program'].strip('"\'')
+    if "sagemaker_program" in hyperparams:
+        training_script = hyperparams["sagemaker_program"].strip("\"'")
     else:
         logger.error("sagemaker_program not found in hyperparameters!")
         sys.exit(1)
@@ -97,11 +95,11 @@ def main():
     logger.info(f"Using training_script: {training_script}")
 
     # Get source directory from hyperparameters
-    if 'sagemaker_submit_directory' in hyperparams:
-        code_directory = hyperparams['sagemaker_submit_directory'].strip('"\'')
+    if "sagemaker_submit_directory" in hyperparams:
+        code_directory = hyperparams["sagemaker_submit_directory"].strip("\"'")
 
         # Handle S3 vs local path
-        if code_directory.startswith('s3://'):
+        if code_directory.startswith("s3://"):
             code_directory = download_and_extract_s3(code_directory)
         elif not os.path.exists(code_directory):
             logger.error(f"Local code directory not found: {code_directory}")
@@ -123,18 +121,24 @@ def main():
 
     # Call the training script and then include code and meta for inference
     try:
-        subprocess.check_call([
-            sys.executable, training_script_path,
-            "--model-dir", os.environ.get("SM_MODEL_DIR", "/opt/ml/model"),
-            "--output-data-dir", os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data"),
-            "--train", os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"),
-        ])
+        subprocess.check_call(
+            [
+                sys.executable,
+                training_script_path,
+                "--model-dir",
+                os.environ.get("SM_MODEL_DIR", "/opt/ml/model"),
+                "--output-data-dir",
+                os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data"),
+                "--train",
+                os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"),
+            ]
+        )
 
         # After training completes, include code and meta in the model.tar.gz
         include_code_and_meta_for_inference(
             model_dir=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"),
             code_dir=code_directory,
-            entry_point=training_script
+            entry_point=training_script,
         )
 
     except subprocess.CalledProcessError as e:
@@ -142,5 +146,5 @@ def main():
         sys.exit(1)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/src/workbench/core/artifacts/model_core.py b/src/workbench/core/artifacts/model_core.py
index 50779831a..c29d864ba 100644
--- a/src/workbench/core/artifacts/model_core.py
+++ b/src/workbench/core/artifacts/model_core.py
@@ -51,7 +51,6 @@ class ModelImages:
         ("us-west-2", "inference", "0.1"): (
             "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1"
         ),
-
         # These are the OLD locked SKLearn images
         ("us-east-1", "sklearn", "1.2.1"): (
             "683313688378.dkr.ecr.us-east-1.amazonaws.com/"
@@ -69,7 +68,6 @@ class ModelImages:
             "246618743249.dkr.ecr.us-west-2.amazonaws.com/"
             "sagemaker-scikit-learn@sha256:ed242e33af079f334972acd2a7ddf74d13310d3c9a0ef3a0e9b0429ccc104dcd"
         ),
-
     }
 
     @classmethod

From 8868c88165926efc76dd2ef764cac48369f5a914 Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sun, 2 Mar 2025 11:57:15 -0700
Subject: [PATCH 30/35] adding install requirements.txt for inference entry
 point

---
 model_docker_images/inference/main.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/model_docker_images/inference/main.py b/model_docker_images/inference/main.py
index 849d79fd1..fb30829d0 100644
--- a/model_docker_images/inference/main.py
+++ b/model_docker_images/inference/main.py
@@ -5,6 +5,7 @@
 import json
 import importlib.util
 import logging
+import subprocess
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -32,6 +33,20 @@ def get_inference_script(model_dir: str) -> str:
         return config["inference_script"]
 
 
+def install_requirements(requirements_path):
+    """Install Python dependencies from requirements file."""
+    if os.path.exists(requirements_path):
+        logger.info(f"Installing dependencies from {requirements_path}...")
+        try:
+            subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", requirements_path])
+            logger.info("Requirements installed successfully.")
+        except subprocess.CalledProcessError as e:
+            logger.error(f"Error installing requirements: {e}")
+            sys.exit(1)
+    else:
+        logger.info(f"No requirements file found at {requirements_path}")
+
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """Handle model loading on startup and cleanup on shutdown."""
@@ -51,6 +66,9 @@ async def lifespan(app: FastAPI):
         if not os.path.exists(inference_script_path):
             raise FileNotFoundError(f"Inference script not found: {inference_script_path}")
 
+        # Install requirements if present
+        install_requirements(os.path.join(model_dir, "requirements.txt"))
+
         # Ensure the model directory is in the Python path
         sys.path.insert(0, model_dir)
 

From f9b0db8e4a38d3ba631ca8c9309a616f2b321cee Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sun, 2 Mar 2025 12:34:44 -0700
Subject: [PATCH 31/35] putting in a better pip install (with cache) and better
 ping response

---
 model_docker_images/inference/main.py | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/model_docker_images/inference/main.py b/model_docker_images/inference/main.py
index fb30829d0..6949d002c 100644
--- a/model_docker_images/inference/main.py
+++ b/model_docker_images/inference/main.py
@@ -6,6 +6,7 @@
 import importlib.util
 import logging
 import subprocess
+import site
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -34,11 +35,29 @@ def get_inference_script(model_dir: str) -> str:
 
 
 def install_requirements(requirements_path):
-    """Install Python dependencies from requirements file."""
+    """Install Python dependencies from requirements file.
+       Uses a persistent cache to speed up container cold starts.
+       Note: Inference containers don't have root access, so we
+             use the --user flag and add the user package path manually.
+    """
     if os.path.exists(requirements_path):
         logger.info(f"Installing dependencies from {requirements_path}...")
+
+        # Define a persistent cache location
+        pip_cache_dir = "/opt/ml/model/.cache/pip"
+        os.environ["PIP_CACHE_DIR"] = pip_cache_dir
+
         try:
-            subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", requirements_path])
+            subprocess.check_call([
+                sys.executable, "-m", "pip", "install",
+                "--cache-dir", pip_cache_dir,  # Enable caching
+                "--disable-pip-version-check",
+                "--no-warn-script-location",
+                "--user",
+                "-r", requirements_path
+            ])
+            # Ensure Python can find user-installed packages
+            sys.path.append(site.getusersitepackages())
             logger.info("Requirements installed successfully.")
         except subprocess.CalledProcessError as e:
             logger.error(f"Error installing requirements: {e}")
@@ -103,7 +122,8 @@ async def lifespan(app: FastAPI):
 @app.get("/ping")
 def ping():
     """Health check endpoint for SageMaker."""
-    return Response(status_code=200 if model else 404)
+    # Check if the inference module is loaded
+    return Response(status_code=200 if inference_module else 500)
 
 
 @app.post("/invocations")

From 9246be4f85990c05874930581669b8eff129a979 Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sun, 2 Mar 2025 12:34:59 -0700
Subject: [PATCH 32/35] flake8/linter cleanup

---
 model_docker_images/inference/main.py | 29 +++++++++++++++++----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/model_docker_images/inference/main.py b/model_docker_images/inference/main.py
index 6949d002c..7cf6fe585 100644
--- a/model_docker_images/inference/main.py
+++ b/model_docker_images/inference/main.py
@@ -36,9 +36,9 @@ def get_inference_script(model_dir: str) -> str:
 
 def install_requirements(requirements_path):
     """Install Python dependencies from requirements file.
-       Uses a persistent cache to speed up container cold starts.
-       Note: Inference containers don't have root access, so we
-             use the --user flag and add the user package path manually.
+    Uses a persistent cache to speed up container cold starts.
+    Note: Inference containers don't have root access, so we
+          use the --user flag and add the user package path manually.
     """
     if os.path.exists(requirements_path):
         logger.info(f"Installing dependencies from {requirements_path}...")
@@ -48,14 +48,21 @@ def install_requirements(requirements_path):
         os.environ["PIP_CACHE_DIR"] = pip_cache_dir
 
         try:
-            subprocess.check_call([
-                sys.executable, "-m", "pip", "install",
-                "--cache-dir", pip_cache_dir,  # Enable caching
-                "--disable-pip-version-check",
-                "--no-warn-script-location",
-                "--user",
-                "-r", requirements_path
-            ])
+            subprocess.check_call(
+                [
+                    sys.executable,
+                    "-m",
+                    "pip",
+                    "install",
+                    "--cache-dir",
+                    pip_cache_dir,  # Enable caching
+                    "--disable-pip-version-check",
+                    "--no-warn-script-location",
+                    "--user",
+                    "-r",
+                    requirements_path,
+                ]
+            )
             # Ensure Python can find user-installed packages
             sys.path.append(site.getusersitepackages())
             logger.info("Requirements installed successfully.")

From 9da5875326e0674e5590f56e2dd31402b5bba94d Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sun, 2 Mar 2025 13:09:06 -0700
Subject: [PATCH 33/35] bump minimum rdkit version to 2024.9.5

---
 applications/compound_explorer/requirements.txt | 2 +-
 pyproject.toml                                  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/applications/compound_explorer/requirements.txt b/applications/compound_explorer/requirements.txt
index 6f047826d..1fc2cecb3 100644
--- a/applications/compound_explorer/requirements.txt
+++ b/applications/compound_explorer/requirements.txt
@@ -18,7 +18,7 @@ dash-bootstrap-templates >= 1.3.0
 dash_ag_grid
 tabulate >= 0.9.0
 shap>=0.43.0
-rdkit>=2024.3.2
+rdkit>=2024.9.5
 mordredcommunity>=2.0.6
 networkx>=3.2
 matplotlib>=3.9.2
diff --git a/pyproject.toml b/pyproject.toml
index 85085806e..20a2aca0c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,7 +38,7 @@ dependencies = [
     "scikit-learn >=1.5.2",
     "joblib >= 1.3.2",
     "requests >= 2.26.0",
-    "rdkit>=2024.3.2",
+    "rdkit>=2024.9.5",
     "mordredcommunity>=2.0.6",
 ]
 

From 4911c052135131629553cd55178b8db1dd34cf7b Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sun, 2 Mar 2025 13:10:12 -0700
Subject: [PATCH 34/35] remove scikit-learn upper version bound (now >=1.5.2)

---
 applications/compound_explorer/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/applications/compound_explorer/requirements.txt b/applications/compound_explorer/requirements.txt
index 1fc2cecb3..0171f8e8a 100644
--- a/applications/compound_explorer/requirements.txt
+++ b/applications/compound_explorer/requirements.txt
@@ -8,7 +8,7 @@ sagemaker >= 2.143
 cryptography>=42.0.5
 ipython>=8.17.2
 xgboost>=2.0.3
-scikit-learn >=1.4.2, <= 1.5.2
+scikit-learn >=1.5.2
 joblib>=1.3.2
 requests>=2.32.0
 plotly >= 5.18.0

From 7b44ec883aa6e0ad1b40e2c95db3d56fc3f3bce6 Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Sun, 2 Mar 2025 13:43:16 -0700
Subject: [PATCH 35/35] fix test: use lowercase abalone.csv S3 path

---
 tests/specific/capital_tests.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/specific/capital_tests.py b/tests/specific/capital_tests.py
index fcd64a87f..4ac939f74 100644
--- a/tests/specific/capital_tests.py
+++ b/tests/specific/capital_tests.py
@@ -6,8 +6,7 @@
 @pytest.mark.long
 def test():
     # Create a new Data Source from an S3 Path (or a local file)
-    source_path = "s3://workbench-public-data/common/aBaLone.CSV"
-    # source_path = "/full/path/to/local/file.csv"
+    source_path = "s3://workbench-public-data/common/abalone.csv"
     my_data = DataSource(source_path)
     pprint(my_data.summary())
     pprint(my_data.details())