From 39282be98f9b09fe9bfb8eb39d21ac1a3301c319 Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Fri, 28 Feb 2025 19:30:23 -0700 Subject: [PATCH 01/35] adding new model images for training and inference --- model_docker_images/Readme.md | 0 model_docker_images/inference/Dockerfile | 14 ++ model_docker_images/inference/main.py | 142 ++++++++++++++ .../inference/requirements.txt | 7 + .../inference/run_inference_container.sh | 17 ++ model_docker_images/scripts/build_deploy.sh | 178 ++++++++++++++++++ model_docker_images/scripts/test_inference.py | 91 +++++++++ model_docker_images/scripts/test_training.py | 149 +++++++++++++++ model_docker_images/training/Dockerfile | 14 ++ model_docker_images/training/requirements.txt | 5 + .../training/run_training_container.sh | 17 ++ model_docker_images/training/train.py | 154 +++++++++++++++ 12 files changed, 788 insertions(+) create mode 100644 model_docker_images/Readme.md create mode 100644 model_docker_images/inference/Dockerfile create mode 100644 model_docker_images/inference/main.py create mode 100644 model_docker_images/inference/requirements.txt create mode 100755 model_docker_images/inference/run_inference_container.sh create mode 100755 model_docker_images/scripts/build_deploy.sh create mode 100644 model_docker_images/scripts/test_inference.py create mode 100644 model_docker_images/scripts/test_training.py create mode 100644 model_docker_images/training/Dockerfile create mode 100644 model_docker_images/training/requirements.txt create mode 100644 model_docker_images/training/run_training_container.sh create mode 100644 model_docker_images/training/train.py diff --git a/model_docker_images/Readme.md b/model_docker_images/Readme.md new file mode 100644 index 000000000..e69de29bb diff --git a/model_docker_images/inference/Dockerfile b/model_docker_images/inference/Dockerfile new file mode 100644 index 000000000..5130b7831 --- /dev/null +++ b/model_docker_images/inference/Dockerfile @@ -0,0 +1,14 @@ +FROM python:3.12-slim + 
"""FastAPI inference server for a SageMaker-style model container.

Exposes ``GET /ping`` (health check) and ``POST /invocations`` (prediction).
The model is loaded once at application startup by :func:`lifespan`.
"""
from contextlib import asynccontextmanager
import io
import json
import logging
import os

from fastapi import FastAPI, Request, Response
import joblib
import numpy as np
import pandas as pd

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Populated by lifespan() at startup and read by the request handlers.
model = None
model_metadata = None


def _build_dummy_model():
    """Return a tiny fitted XGBRegressor used when no real model can be loaded.

    For testing only: it lets the server start and answer requests without a
    trained artifact. It is fitted on a single three-feature row.
    """
    # Imported here because it is only referenced on this fallback path.
    import xgboost as xgb

    dummy = xgb.XGBRegressor()
    dummy.fit(np.array([[1, 2, 3]]), np.array([1]))
    return dummy


def _load_metadata(model_path):
    """Read ``metadata.json`` from *model_path*.

    Returns:
        The parsed JSON content, or ``{'feature_names': None}`` when the file
        is missing or cannot be read or parsed.
    """
    metadata_file = os.path.join(model_path, 'metadata.json')
    try:
        if os.path.exists(metadata_file):
            with open(metadata_file, 'r') as f:
                metadata = json.load(f)
            logger.info(f"Loaded model metadata: {metadata}")
            return metadata
        logger.warning(f"Metadata file not found at {metadata_file}")
    except Exception as e:
        # Metadata is optional: log and fall back rather than fail startup.
        logger.error(f"Error loading model metadata: {e}")
    return {'feature_names': None}


def _media_type(header_value):
    """Return the bare, lower-cased media type of a header value.

    Parameters such as ``; charset=utf-8`` are dropped, so
    ``"text/csv; charset=utf-8"`` becomes ``"text/csv"``.
    """
    return header_value.split(';', 1)[0].strip().lower()


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the model and its metadata at startup; log on shutdown.

    The model directory is read from the ``MODEL_PATH`` environment variable
    (default ``/opt/ml/model``). If ``model.joblib`` is missing, or anything
    fails while loading, a dummy model is installed so the server still starts.
    """
    global model, model_metadata

    # SageMaker model path
    model_path = os.environ.get('MODEL_PATH', '/opt/ml/model')

    try:
        logger.info(f"Loading model from {model_path}")
        model_file = os.path.join(model_path, 'model.joblib')

        if not os.path.exists(model_file):
            logger.warning(f"Model file not found at {model_file}")
            # List directory contents for debugging
            if os.path.exists(model_path):
                logger.info(f"Contents of {model_path}: {os.listdir(model_path)}")
            else:
                logger.warning(f"Model directory {model_path} not found")

            # For testing only - create a dummy model
            logger.warning("Creating a dummy model for testing")
            model = _build_dummy_model()
        else:
            logger.info(f"Loading model from {model_file}")
            # NOTE(review): joblib.load unpickles the file, which can execute
            # arbitrary code; only load artifacts from a trusted source.
            model = joblib.load(model_file)
            logger.info(f"Model loaded successfully: {type(model)}")

        model_metadata = _load_metadata(model_path)
    except Exception as e:
        logger.error(f"Error loading model: {e}", exc_info=True)
        # Provide a fallback model for testing
        model = _build_dummy_model()
        model_metadata = {'feature_names': None}

    logger.info("Model initialization complete")
    yield

    # Cleanup on shutdown if needed
    logger.info("Cleaning up resources")


app = FastAPI(lifespan=lifespan)


@app.get('/ping')
def ping():
    """Health check: respond 200 when a model is loaded, 404 otherwise."""
    if model is not None:
        return Response(status_code=200)
    return Response(status_code=404)


@app.post('/invocations')
async def invoke(request: Request):
    """Run the model on the request payload and return its predictions.

    The body is parsed as header-less CSV when the Content-Type media type is
    ``text/csv`` and as JSON otherwise. The response is CSV when the Accept
    media type is ``text/csv``; otherwise it is JSON of the form
    ``{"predictions": [...]}``. Any error during parsing or prediction is
    returned as a 500 response with a JSON ``{"error": ...}`` body.
    """
    logger.info("Received inference request")
    content_type = request.headers.get('Content-Type', '')
    accept_type = request.headers.get('Accept', '')

    logger.info(f"Content-Type: {content_type}, Accept: {accept_type}")

    # Get the data
    body = await request.body()

    try:
        # Compare media types only, so "text/csv; charset=utf-8" is still CSV.
        if _media_type(content_type) == 'text/csv':
            s = body.decode('utf-8')
            # pandas has no StringIO attribute; the in-memory text buffer
            # comes from the standard library.
            data = pd.read_csv(io.StringIO(s), header=None)
            logger.info(f"Parsed CSV data with shape: {data.shape}")
        else:
            # Default to JSON
            json_str = body.decode('utf-8')
            logger.info(f"Raw JSON input: {json_str}")
            data_json = json.loads(json_str)
            logger.info(f"Parsed JSON data: {data_json}")
            data = pd.DataFrame(data_json)

        # Make prediction
        logger.info(f"Making prediction with data shape: {data.shape}")
        predictions = model.predict(data)
        result_size = len(predictions) if hasattr(predictions, '__len__') else 'scalar'
        logger.info(f"Prediction successful, result shape: {result_size}")

        # Always return JSON unless explicitly requested as CSV
        if _media_type(accept_type) == 'text/csv':
            result = pd.DataFrame(predictions).to_csv(header=False, index=False)
            logger.info(f"Returning CSV response: {result}")
            return Response(content=result, media_type='text/csv')

        # Default to JSON for everything else
        if hasattr(predictions, 'tolist'):
            payload = predictions.tolist()
        else:
            payload = float(predictions)
        result = json.dumps({'predictions': payload})
        logger.info(f"Returning JSON response: {result}")
        return Response(content=result, media_type='application/json')

    except Exception as e:
        logger.error(f"Error during inference: {e}", exc_info=True)
        return Response(
            content=json.dumps({"error": str(e)}),
            status_code=500,
            media_type="application/json"
        )
#!/bin/bash
# Build the training and inference Docker images and optionally push them to
# AWS ECR.
#
# Usage: build_deploy.sh [VERSION] [--deploy] [--latest]
#   VERSION    Image tag to build (default: 0.1). The first argument that is
#              not a flag is used, so flags may appear in any position.
#   --deploy   Push the images to ECR in every region of REGION_LIST.
#              Requires AWS_PROFILE to be set in the environment.
#   --latest   With --deploy, also tag and push the images as "latest".
#
# Environment:
#   AWS_PROFILE     AWS CLI profile used for the ECR login (deploy only).
#   AWS_ACCOUNT_ID  Account that owns the ECR repositories
#                   (default: 507740646243).
set -e

# Get the directory of this script
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"
# Get the parent directory (project root)
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"

# Configuration
TRAINING_DIR="$PROJECT_ROOT/training"
INFERENCE_DIR="$PROJECT_ROOT/inference"
TRAINING_IMAGE="aws_model_training"
INFERENCE_IMAGE="aws_model_inference"
IMAGE_VERSION="0.1"
AWS_ACCOUNT_ID="${AWS_ACCOUNT_ID:-507740646243}"

# Define the regions to deploy to.
REGION_LIST=("us-east-1" "us-west-2")

# Colors for output
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Parse arguments. Flags are recognised in any position; the first non-flag
# argument is the image version, so "build_deploy.sh --deploy" no longer
# builds an image tagged "--deploy".
DEPLOY=false
LATEST=false
VERSION_SET=false
for arg in "$@"; do
    case "$arg" in
        --deploy)
            DEPLOY=true
            ;;
        --latest)
            LATEST=true
            ;;
        -*)
            # Unknown flags are ignored.
            ;;
        *)
            if [ "$VERSION_SET" = false ]; then
                IMAGE_VERSION="$arg"
                VERSION_SET=true
            fi
            ;;
    esac
done

# Expect AWS_PROFILE to be set in the environment when deploying. Checked
# against the parsed flag so it fires wherever --deploy appears.
if [ "$DEPLOY" = true ]; then
    : "${AWS_PROFILE:?AWS_PROFILE environment variable is not set.}"
fi

# Build a Docker image for linux/amd64.
#   $1 - build context directory (must contain a Dockerfile)
#   $2 - image name
#   $3 - image tag
build_image() {
    local dir=$1
    local image_name=$2
    local tag=$3
    local full_name="${image_name}:${tag}"

    echo -e "${YELLOW}Building image: ${full_name}${NC}"

    # Check if Dockerfile exists
    if [ ! -f "$dir/Dockerfile" ]; then
        echo "❌ Error: Dockerfile not found in $dir"
        return 1
    fi

    # Build the image for AMD64 architecture
    echo "Building local Docker image ${full_name} for linux/amd64..."
    # Quoted so a project path containing spaces does not split the arguments.
    docker build --platform linux/amd64 -t "$full_name" "$dir"

    echo -e "${GREEN}✅ Successfully built: ${full_name}${NC}"
    return 0
}

# Tag a locally built image and push it to ECR in every region of REGION_LIST.
#   $1 - image name
#   $2 - image tag
#   $3 - "true" to also tag and push the image as "latest"
deploy_image() {
    local image_name=$1
    local tag=$2
    local use_latest=$3
    local full_name="${image_name}:${tag}"
    local region registry ecr_repo ecr_image ecr_latest

    for region in "${REGION_LIST[@]}"; do
        echo "Processing region: ${region}"
        registry="${AWS_ACCOUNT_ID}.dkr.ecr.${region}.amazonaws.com"
        ecr_repo="${registry}/model_images/${image_name}"
        ecr_image="${ecr_repo}:${tag}"

        echo "Logging in to AWS ECR in ${region}..."
        aws ecr get-login-password --region "${region}" --profile "${AWS_PROFILE}" | \
            docker login --username AWS --password-stdin "${registry}"

        echo "Tagging image for AWS ECR as ${ecr_image}..."
        docker tag "${full_name}" "${ecr_image}"

        echo "Pushing Docker image to AWS ECR: ${ecr_image}..."
        docker push "${ecr_image}"

        if [ "$use_latest" = true ]; then
            ecr_latest="${ecr_repo}:latest"
            echo "Tagging AWS ECR image as latest: ${ecr_latest}..."
            docker tag "${full_name}" "${ecr_latest}"
            echo "Pushing Docker image to AWS ECR: ${ecr_latest}..."
            docker push "${ecr_latest}"
        fi
    done
}

# Build training image
echo "======================================"
echo "🏗️ Building training container"
echo "======================================"
build_image "$TRAINING_DIR" "$TRAINING_IMAGE" "$IMAGE_VERSION"

# Build inference image
echo "======================================"
echo "🏗️ Building inference container"
echo "======================================"
build_image "$INFERENCE_DIR" "$INFERENCE_IMAGE" "$IMAGE_VERSION"

echo "======================================"
echo -e "${GREEN}✅ All builds completed successfully!${NC}"
echo "======================================"

if [ "$DEPLOY" = true ]; then
    echo "======================================"
    echo "🚀 Deploying containers to ECR"
    echo "======================================"

    # Deploy training image
    echo "Deploying training image..."
    deploy_image "$TRAINING_IMAGE" "$IMAGE_VERSION" "$LATEST"

    # Deploy inference image
    echo "Deploying inference image..."
    deploy_image "$INFERENCE_IMAGE" "$IMAGE_VERSION" "$LATEST"

    echo "======================================"
    echo -e "${GREEN}✅ Deployment complete!${NC}"
    echo "======================================"
else
    echo "Local build complete. Use --deploy to push the images to AWS ECR in regions: ${REGION_LIST[*]}."

    # Print information about the built images
    echo "======================================"
    echo "📋 Image information:"
    echo "Training image: ${TRAINING_IMAGE}:${IMAGE_VERSION}"
    echo "Inference image: ${INFERENCE_IMAGE}:${IMAGE_VERSION}"
    echo "======================================"

    # Ask if user wants to test the containers. When stdin is not a terminal
    # `read` fails; treat that as "no" instead of aborting under `set -e`.
    read -p "Do you want to test the containers? (y/n) " -n 1 -r || REPLY="n"
    echo
    if [[ $REPLY =~ ^[Yy]$ ]]; then
        # Test training container
        echo "======================================"
        echo "🧪 Testing training container"
        echo "======================================"
        python "$SCRIPT_DIR/test_training.py" --image "${TRAINING_IMAGE}:${IMAGE_VERSION}"

        # Test inference container
        echo "======================================"
        echo "🧪 Testing inference container"
        echo "======================================"

        # Start the inference container in the background
        echo "Starting inference container..."
        CONTAINER_ID=$(docker run -d -p 8080:8080 "${INFERENCE_IMAGE}:${IMAGE_VERSION}")
        # Remove the container even if a later command fails under `set -e`,
        # so port 8080 is not left occupied.
        trap 'docker rm -f "$CONTAINER_ID" >/dev/null 2>&1 || true' EXIT

        # Wait for the container to initialize
        echo "Waiting for server to initialize (5 seconds)..."
        sleep 5

        # Run the test
        python "$SCRIPT_DIR/test_inference.py"

        # Stop and remove the container
        echo "Stopping inference container..."
        docker stop "$CONTAINER_ID"
        docker rm "$CONTAINER_ID"
        trap - EXIT

        echo "======================================"
        echo -e "${GREEN}✅ Testing completed!${NC}"
        echo "======================================"
    fi
fi
#!/usr/bin/env python
"""Smoke-test a running inference container via /ping and /invocations."""
import requests
import json
import argparse
import sys
import time


def test_inference_server(host="localhost", port=8080):
    """Test the inference server running in the Docker container.

    Sends a health check to ``/ping`` and a small JSON payload to
    ``/invocations``, printing the outcome of each step.

    Args:
        host: Host where the inference server is listening.
        port: Port where the inference server is listening.

    Returns:
        bool: True when both checks succeed, False otherwise.
    """
    base_url = f"http://{host}:{port}"

    # Test 1: Check the health endpoint
    print("\n🔍 Testing /ping endpoint (health check)...")
    try:
        response = requests.get(f"{base_url}/ping", timeout=5)
    except requests.exceptions.RequestException as e:
        print(f"❌ Health check failed with error: {e}")
        print("Is the Docker container running on the specified port?")
        return False

    if response.status_code != 200:
        print(f"❌ Health check failed with status code: {response.status_code}")
        return False
    print("✅ Health check succeeded")

    # Test 2: Test the invocations endpoint with simple data
    print("\n🔍 Testing /invocations endpoint with sample data...")
    sample_data = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]

    try:
        # Test with JSON data
        response = requests.post(
            f"{base_url}/invocations",
            data=json.dumps(sample_data),
            headers={"Content-Type": "application/json", "Accept": "application/json"},
            timeout=5
        )
    except requests.exceptions.RequestException as e:
        print(f"❌ Inference request failed with error: {e}")
        return False

    if response.status_code != 200:
        print(f"❌ Inference request failed with status code: {response.status_code}")
        print(f"Response text: {response.text}")
        return False
    print("✅ Inference request succeeded")

    try:
        # Parse the JSON response
        result = response.json()
        print(f"📊 Response: {result}")
    except ValueError as e:
        # ValueError is the common base of the JSON decode errors that
        # response.json() can raise, whichever JSON backend requests uses.
        print(f"❌ Error parsing response as JSON: {e}")
        print(f"Raw response: {response.text}")
        # Try parsing as CSV
        try:
            values = [float(line) for line in response.text.strip().split('\n')]
        except ValueError:
            return False
        print(f"📊 CSV Response (converted): {values}")

    print("\n🎉 All tests passed! Your inference server is working correctly.")
    return True


# The name matches pytest's test_* pattern, but this is a script helper that
# needs a live server; keep pytest from collecting it.
test_inference_server.__test__ = False


def run_docker_command():
    """Print the docker run command to help the user start the container."""
    print("\n📋 To run your Docker container, use the following command:")
    print("docker run -p 8080:8080 aws_model_inference:latest")
    print("\nThis maps port 8080 from the container to port 8080 on your host machine.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Test the AWS model inference server")
    parser.add_argument("--host", default="localhost", help="Host where the inference server is running")
    parser.add_argument("--port", type=int, default=8080, help="Port where the inference server is running")
    parser.add_argument("--docker-cmd", action="store_true", help="Print the docker run command")

    args = parser.parse_args()

    if args.docker_cmd:
        run_docker_command()

    # Propagate the result through the exit status so calling shell scripts
    # (which run under `set -e`) can detect a failed test.
    sys.exit(0 if test_inference_server(args.host, args.port) else 1)
"""Run the training container against synthetic data and check its outputs."""
import os
import sys
import json
import argparse
import tempfile
import shutil
import subprocess
import numpy as np
import pandas as pd

# Coefficients of the linear signal used to build the synthetic target; they
# apply to the leading feature columns.
_TARGET_COEFFICIENTS = (2.0, 3.0, -1.5, 0.5, -1.0)


def create_test_data(data_dir, rows=100, cols=5):
    """Create synthetic training data for testing.

    Writes ``<data_dir>/train/train.csv`` with columns ``feature_0`` ..
    ``feature_<cols-1>`` followed by ``target``. The target is a linear
    combination of up to the first five features plus small Gaussian noise;
    any further features are pure noise columns.

    Args:
        data_dir: Directory under which the ``train`` folder is created.
        rows: Number of rows to generate.
        cols: Number of feature columns to generate.

    Returns:
        str: Path of the CSV file that was written.
    """
    print(f"Creating synthetic training data in {data_dir}")

    # Generate synthetic features and target
    X = np.random.randn(rows, cols)
    # Use as many coefficients as there are features, so any column count
    # works (the formula previously indexed columns 0-4 unconditionally).
    weights = np.zeros(cols)
    used = min(cols, len(_TARGET_COEFFICIENTS))
    weights[:used] = _TARGET_COEFFICIENTS[:used]
    y = X @ weights + np.random.randn(rows) * 0.1

    # Create dataframe
    feature_names = [f"feature_{i}" for i in range(cols)]
    df = pd.DataFrame(X, columns=feature_names)
    df['target'] = y

    # Create train directory
    train_dir = os.path.join(data_dir, 'train')
    os.makedirs(train_dir, exist_ok=True)

    # Save to CSV
    train_file = os.path.join(train_dir, 'train.csv')
    df.to_csv(train_file, index=False)
    print(f"Saved {rows} rows of training data to {train_file}")

    return train_file


def create_hyperparameters(config_dir):
    """Create hyperparameters.json file for the training container.

    Values are written as strings.

    Args:
        config_dir: Directory to create the file in (created if missing).

    Returns:
        str: Path of the JSON file that was written.
    """
    print(f"Creating hyperparameters in {config_dir}")

    # Define hyperparameters
    hyperparameters = {
        "max_depth": "6",
        "learning_rate": "0.1",
        "n_estimators": "100",
        "objective": "reg:squarederror"
    }

    # Create config directory
    os.makedirs(config_dir, exist_ok=True)

    # Save hyperparameters
    hyperparameters_file = os.path.join(config_dir, 'hyperparameters.json')
    with open(hyperparameters_file, 'w') as f:
        json.dump(hyperparameters, f)

    print(f"Saved hyperparameters to {hyperparameters_file}")
    return hyperparameters_file


def test_training_container(image_name, temp_dir):
    """Run the training container with test data and verify outputs.

    Builds an ``input/data``, ``input/config``, ``model`` and ``output``
    directory tree under *temp_dir*, mounts *temp_dir* at ``/opt/ml`` and runs
    the image with ``docker run --rm``.

    Args:
        image_name: Docker image (``name:tag``) of the training container.
        temp_dir: Host directory to use as the container's ``/opt/ml``.

    Returns:
        bool: True when the container exits successfully and at least one
        file appears in the model directory, False otherwise.
    """
    print(f"\n🔬 Testing training container: {image_name}")

    # Create directory structure to mimic SageMaker
    input_dir = os.path.join(temp_dir, 'input')
    data_dir = os.path.join(input_dir, 'data')
    config_dir = os.path.join(input_dir, 'config')
    model_dir = os.path.join(temp_dir, 'model')
    output_dir = os.path.join(temp_dir, 'output')

    for directory in (data_dir, config_dir, model_dir, output_dir):
        os.makedirs(directory, exist_ok=True)

    # Create test data and hyperparameters
    create_test_data(data_dir)
    create_hyperparameters(config_dir)

    # Run the container
    print("\n📦 Running training container...")

    cmd = [
        "docker", "run",
        "--rm",
        "-v", f"{temp_dir}:/opt/ml",
        image_name
    ]

    try:
        # Execute the training container
        subprocess.run(cmd, check=True)
    except FileNotFoundError:
        # Raised when the docker executable itself cannot be found.
        print("❌ Training failed: the 'docker' command was not found")
        return False
    except subprocess.CalledProcessError as e:
        print(f"❌ Training failed with error code {e.returncode}")

        # Check if there's a failure file with more details
        failure_file = os.path.join(output_dir, 'failure')
        if os.path.exists(failure_file):
            with open(failure_file, 'r') as f:
                failure_content = f.read()
            print(f"Error details:\n{failure_content}")

        return False

    # Check if model files were created
    model_files = os.listdir(model_dir)
    if not model_files:
        print("❌ Training failed: No model files created")
        return False

    print(f"✅ Training succeeded! Model files created: {', '.join(model_files)}")

    # Check for specific expected files
    expected_files = ['model.joblib', 'metadata.json']
    missing_files = [f for f in expected_files if f not in model_files]

    if missing_files:
        print(f"⚠️ Warning: Some expected files are missing: {', '.join(missing_files)}")
    else:
        print("✅ All expected model files were created")

    return True


# The name matches pytest's test_* pattern, but the parameters are not
# fixtures; keep pytest from collecting this helper and erroring on it.
test_training_container.__test__ = False


def run_training_test(image_name="aws_model_training:latest"):
    """Run the training container test with a temporary directory.

    Args:
        image_name: Docker image (``name:tag``) of the training container.

    Returns:
        bool: Result of :func:`test_training_container`.
    """
    print("🚀 Starting training container test")

    # Create temporary directory for training data
    with tempfile.TemporaryDirectory() as temp_dir:
        print(f"Using temporary directory: {temp_dir}")
        success = test_training_container(image_name, temp_dir)

    if success:
        print("\n🎉 Training container test passed!")
    else:
        print("\n❌ Training container test failed!")

    return success


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Test the AWS model training container")
    parser.add_argument("--image", default="aws_model_training:latest",
                        help="Docker image name for the training container")

    args = parser.parse_args()
    # Propagate the result through the exit status so calling shell scripts
    # (which run under `set -e`) can detect a failed test.
    sys.exit(0 if run_training_test(args.image) else 1)
--no-cache-dir -r /tmp/requirements.txt + +# Set up the program in the image +COPY train.py /opt/program/ +WORKDIR /opt/program + +# Set up the entry point +ENTRYPOINT ["python", "train.py"] diff --git a/model_docker_images/training/requirements.txt b/model_docker_images/training/requirements.txt new file mode 100644 index 000000000..b3b7b18dd --- /dev/null +++ b/model_docker_images/training/requirements.txt @@ -0,0 +1,5 @@ +scikit-learn==1.6.1 +xgboost-cpu==2.1.4 +pandas==2.2.3 +awswrangler==3.11.0 +joblib==1.4.2 \ No newline at end of file diff --git a/model_docker_images/training/run_training_container.sh b/model_docker_images/training/run_training_container.sh new file mode 100644 index 000000000..73383fa79 --- /dev/null +++ b/model_docker_images/training/run_training_container.sh @@ -0,0 +1,17 @@ +#!/bin/bash +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)" +PARENT_DIR="$(dirname "$SCRIPT_DIR")" +SCRIPTS_DIR="$PARENT_DIR/scripts" + +# Make sure test_training.py exists +if [ ! 
"""XGBoost regression training script using the SageMaker /opt/ml layout."""
import os
import json
import math
import sys
import traceback
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
import joblib

# SageMaker paths
prefix = '/opt/ml/'
input_path = prefix + 'input/data'
model_path = os.path.join(prefix, 'model')
param_path = os.path.join(prefix, 'input/config/hyperparameters.json')
output_path = os.path.join(prefix, 'output')

# Channel names for training and validation data
training_channel_name = 'train'
eval_channel_name = 'validation'


def _coerce_hyperparameter(value):
    """Convert one hyperparameter string to int, float or bool when it parses.

    Non-string values and strings that match none of those types are returned
    unchanged. Non-finite floats (``nan``, ``inf``) are deliberately left as
    strings.
    """
    if not isinstance(value, str):
        return value

    try:
        return int(value)
    except ValueError:
        pass

    try:
        number = float(value)
    except ValueError:
        number = None
    # float() also accepts exponent notation such as "1e-3".
    if number is not None and math.isfinite(number):
        return number

    lowered = value.strip().lower()
    if lowered in ('true', 'false'):
        return lowered == 'true'
    return value


# Load hyperparameters
def load_hyperparameters():
    """Read the hyperparameters JSON file and convert values to native types.

    Reads the file at ``param_path``. String values are converted to int,
    float or bool where they parse as such; everything else is kept as is.

    Returns:
        dict: Hyperparameter names mapped to converted values.
    """
    with open(param_path, 'r') as tc:
        hyperparameters = json.load(tc)

    return {key: _coerce_hyperparameter(value) for key, value in hyperparameters.items()}


# Load training data
def load_data():
    """Load and concatenate every CSV file in the training channel directory.

    Returns:
        pandas.DataFrame: All rows of all ``*.csv`` files, in file-name order.

    Raises:
        ValueError: If the training directory contains no CSV files.
    """
    train_path = os.path.join(input_path, training_channel_name)

    # Sorted so the row order (and therefore the seeded train/validation
    # split) does not depend on the directory listing order.
    train_files = sorted(
        os.path.join(train_path, file)
        for file in os.listdir(train_path)
        if file.endswith('.csv')
    )

    if not train_files:
        raise ValueError(f"No CSV files found in {train_path}")

    # Read and concatenate all training files
    dfs = [pd.read_csv(file) for file in train_files]

    return pd.concat(dfs, ignore_index=True)


# Train the model
def train():
    """Train an XGBoost regressor and save it with its metadata.

    The last column of the training data is used as the target. The data is
    split 80/20 with a fixed seed, the model is fitted on the larger part and
    scored on the rest. ``model.joblib`` and ``metadata.json`` are written to
    ``model_path``. On any error the message and traceback are written to
    ``<output_path>/failure`` and to stderr, and the process exits with
    status 255.
    """
    print("Starting the training process")

    try:
        # Load hyperparameters
        hyperparameters = load_hyperparameters()
        print(f"Loaded hyperparameters: {hyperparameters}")

        # Load training data
        train_data = load_data()
        print(f"Loaded training data with shape: {train_data.shape}")

        # Extract features and target
        # Assumes last column is the target
        X = train_data.iloc[:, :-1]
        y = train_data.iloc[:, -1]

        # Train/test split
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Configure model parameters from hyperparameters or use defaults
        max_depth = hyperparameters.get('max_depth', 6)
        learning_rate = hyperparameters.get('learning_rate', 0.1)
        n_estimators = hyperparameters.get('n_estimators', 100)

        # Create and train model with a simpler approach
        # Removed early stopping and eval_set to ensure compatibility
        model = xgb.XGBRegressor(
            max_depth=max_depth,
            learning_rate=learning_rate,
            n_estimators=n_estimators
        )

        print("Training model...")
        model.fit(X_train, y_train)

        # Evaluate on validation set; cast so the score is a plain float in
        # the JSON metadata.
        val_score = float(model.score(X_val, y_val))
        print(f"Validation R² score: {val_score:.4f}")

        # Save the model
        os.makedirs(model_path, exist_ok=True)
        model_file = os.path.join(model_path, 'model.joblib')

        # Save additional metadata about the model
        feature_names = X.columns.tolist()
        model_metadata = {
            'feature_names': feature_names,
            'hyperparameters': hyperparameters,
            'validation_score': val_score
        }
        metadata_file = os.path.join(model_path, 'metadata.json')

        print(f"Saving model to {model_file}")
        joblib.dump(model, model_file)

        print(f"Saving metadata to {metadata_file}")
        with open(metadata_file, 'w') as f:
            json.dump(model_metadata, f)

        print("Training completed successfully")

    except Exception as e:
        trc = traceback.format_exc()
        message = 'Exception during training: ' + str(e) + '\n' + trc
        # Printing this causes the exception to be in the training job logs
        print(message, file=sys.stderr)
        # Write out an error file. The directory may not exist yet, and a
        # failure to write it must not hide the original error or exit code.
        try:
            os.makedirs(output_path, exist_ok=True)
            with open(os.path.join(output_path, 'failure'), 'w') as s:
                s.write(message)
        except OSError as write_error:
            print(f"Could not write failure file: {write_error}", file=sys.stderr)
        # A non-zero exit code causes the training job to be marked as Failed
        sys.exit(255)


if __name__ == '__main__':
    train()
insertions(+), 40 deletions(-) delete mode 100755 model_docker_images/inference/run_inference_container.sh create mode 100755 model_docker_images/inference/test_container.sh delete mode 100644 model_docker_images/training/run_training_container.sh create mode 100755 model_docker_images/training/test_container.sh diff --git a/model_docker_images/inference/run_inference_container.sh b/model_docker_images/inference/run_inference_container.sh deleted file mode 100755 index e643f3260..000000000 --- a/model_docker_images/inference/run_inference_container.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -set -e - -echo "๐Ÿš€ Starting AWS Model Inference Container..." -docker run -d -p 8080:8080 --name aws_model_test aws_model_image:0.1 - -echo "โณ Waiting for server to initialize (5 seconds)..." -sleep 5 - -echo "๐Ÿงช Running tests against the server..." -python test_inference.py - -echo "๐Ÿงน Cleaning up - stopping and removing container..." -docker stop aws_model_test -docker rm aws_model_test - -echo "โœ… Done!" \ No newline at end of file diff --git a/model_docker_images/inference/test_container.sh b/model_docker_images/inference/test_container.sh new file mode 100755 index 000000000..3157b3df7 --- /dev/null +++ b/model_docker_images/inference/test_container.sh @@ -0,0 +1,50 @@ +#!/bin/bash +set -e + +# Determine script and project directories +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)" +PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" +SCRIPTS_DIR="$PROJECT_ROOT/scripts" + +# Default image name +DEFAULT_IMAGE="aws_model_inference:0.1" +IMAGE_NAME=${1:-$DEFAULT_IMAGE} + +# Port to use for testing +PORT=8080 + +echo "๐Ÿ“‹ Inference Container Test Script" +echo "======================================" + +# Make sure test script exists +if [ ! 
-f "$SCRIPTS_DIR/test_inference.py" ]; then + echo "โŒ Error: test_inference.py not found in $SCRIPTS_DIR" + exit 1 +fi + +# Start the inference container with proper log settings +echo "๐Ÿš€ Starting inference container: $IMAGE_NAME" +CONTAINER_ID=$(docker run -d -p $PORT:$PORT -e PYTHONUNBUFFERED=1 "$IMAGE_NAME") + +# Follow logs in the background +docker logs -f $CONTAINER_ID & +LOGS_PID=$! + +# Ensure container and log process are stopped on script exit +function cleanup { + echo "๐Ÿงน Stopping log process and container..." + kill $LOGS_PID 2>/dev/null || true + docker stop $CONTAINER_ID >/dev/null 2>&1 + docker rm $CONTAINER_ID >/dev/null 2>&1 +} +trap cleanup EXIT + +# Wait for container to initialize +echo "โณ Waiting for server to initialize (5 seconds)..." +sleep 5 + +# Run the test +echo "๐Ÿงช Testing inference container..." +python "$SCRIPTS_DIR/test_inference.py" --host localhost --port $PORT + +echo "======================================" \ No newline at end of file diff --git a/model_docker_images/scripts/test_training.py b/model_docker_images/scripts/test_training.py index 20956e73a..ecba030a1 100644 --- a/model_docker_images/scripts/test_training.py +++ b/model_docker_images/scripts/test_training.py @@ -2,7 +2,6 @@ import json import argparse import tempfile -import shutil import subprocess import numpy as np import pandas as pd diff --git a/model_docker_images/training/Dockerfile b/model_docker_images/training/Dockerfile index 99f6a16f9..74f6caf44 100644 --- a/model_docker_images/training/Dockerfile +++ b/model_docker_images/training/Dockerfile @@ -6,9 +6,12 @@ COPY requirements.txt /tmp/ # Install dependencies RUN pip install --no-cache-dir -r /tmp/requirements.txt -# Set up the program in the image -COPY train.py /opt/program/ +# Copy the SageMaker entrypoint script +COPY sagemaker_entrypoint.py /opt/program/ WORKDIR /opt/program -# Set up the entry point -ENTRYPOINT ["python", "train.py"] +# Make the entrypoint executable +RUN chmod +x 
/opt/program/sagemaker_entrypoint.py + +# Set the entrypoint +ENTRYPOINT ["/opt/program/sagemaker_entrypoint.py"] \ No newline at end of file diff --git a/model_docker_images/training/run_training_container.sh b/model_docker_images/training/run_training_container.sh deleted file mode 100644 index 73383fa79..000000000 --- a/model_docker_images/training/run_training_container.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)" -PARENT_DIR="$(dirname "$SCRIPT_DIR")" -SCRIPTS_DIR="$PARENT_DIR/scripts" - -# Make sure test_training.py exists -if [ ! -f "$SCRIPTS_DIR/test_training.py" ]; then - echo "โŒ Error: test_training.py not found in $SCRIPTS_DIR" - exit 1 -fi - -IMAGE_NAME=${1:-aws_model_training:latest} - -echo "๐Ÿš€ Testing Training Container: $IMAGE_NAME" -python "$SCRIPTS_DIR/test_training.py" --image "$IMAGE_NAME" \ No newline at end of file diff --git a/model_docker_images/training/test_container.sh b/model_docker_images/training/test_container.sh new file mode 100755 index 000000000..cdc1382b4 --- /dev/null +++ b/model_docker_images/training/test_container.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -e + +# Determine script and project directories +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)" +PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" +SCRIPTS_DIR="$PROJECT_ROOT/scripts" + +# Default image name with latest tag +DEFAULT_IMAGE="aws_model_training:0.1" +IMAGE_NAME=${1:-$DEFAULT_IMAGE} + +echo "๐Ÿ“‹ Training Container Test Script" +echo "======================================" + +# Make sure test_training.py exists +if [ ! 
-f "$SCRIPTS_DIR/test_training.py" ]; then + echo "โŒ Error: test_training.py not found in $SCRIPTS_DIR" + exit 1 +fi + +echo "๐Ÿš€ Testing Training Container: $IMAGE_NAME" +python "$SCRIPTS_DIR/test_training.py" --image "$IMAGE_NAME" + +echo "======================================" \ No newline at end of file diff --git a/model_docker_images/training/train.py b/model_docker_images/training/train.py index 98d97be0b..d88c2482c 100644 --- a/model_docker_images/training/train.py +++ b/model_docker_images/training/train.py @@ -3,7 +3,6 @@ import sys import traceback import pandas as pd -import numpy as np from sklearn.model_selection import train_test_split import xgboost as xgb import joblib From d25531586893706ab4b653d3df54c8bbdc63031f Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Fri, 28 Feb 2025 20:51:13 -0700 Subject: [PATCH 03/35] adding the code/docker for the workbench model image generation (WIP) --- .../training/sagemaker_entrypoint.py | 162 ++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 model_docker_images/training/sagemaker_entrypoint.py diff --git a/model_docker_images/training/sagemaker_entrypoint.py b/model_docker_images/training/sagemaker_entrypoint.py new file mode 100644 index 000000000..50f3acc4f --- /dev/null +++ b/model_docker_images/training/sagemaker_entrypoint.py @@ -0,0 +1,162 @@ +import os +import sys +import json +import tarfile +import subprocess +import logging +import boto3 +from urllib.parse import urlparse + +# Set up logging +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger('sagemaker-entry-point') + + +def download_and_extract_s3(s3_uri, target_dir="/opt/ml/code"): + """Download and extract code package from S3.""" + logger.info(f"Downloading source package from {s3_uri}...") + parsed = urlparse(s3_uri) + bucket = parsed.netloc + key = parsed.path.lstrip("/") + local_tar = "/tmp/code_package.tar.gz" + + try: + s3 = 
boto3.client("s3") + s3.download_file(bucket, key, local_tar) + logger.info(f"Download successful, tar file size: {os.path.getsize(local_tar)} bytes") + + os.makedirs(target_dir, exist_ok=True) + with tarfile.open(local_tar, "r:gz") as tar: + tar.extractall(path=target_dir) + + logger.info(f"Files in {target_dir} after extraction: {os.listdir(target_dir)}") + return target_dir + except Exception as e: + logger.error(f"Error downloading from S3: {str(e)}") + sys.exit(1) + + +def install_requirements(requirements_path): + """Install Python dependencies from requirements file.""" + if os.path.exists(requirements_path): + logger.info(f"Installing dependencies from {requirements_path}...") + try: + subprocess.check_call([ + sys.executable, "-m", "pip", "install", "-r", requirements_path + ]) + logger.info("Requirements installation completed successfully.") + except subprocess.CalledProcessError as e: + logger.error(f"Error installing requirements: {str(e)}") + sys.exit(1) + else: + logger.info(f"No requirements file found at {requirements_path}") + + +def setup_sagemaker_environment(): + """Set up SageMaker environment variables based on /opt/ml structure.""" + env_vars = { + "SM_MODEL_DIR": "/opt/ml/model", + "SM_OUTPUT_DATA_DIR": "/opt/ml/output/data", + "SM_CHANNEL_TRAIN": "/opt/ml/input/data/train", + "SM_OUTPUT_DIR": "/opt/ml/output", + "SM_INPUT_DIR": "/opt/ml/input", + "SM_INPUT_CONFIG_DIR": "/opt/ml/input/config" + } + + # Set the environment variables + for key, value in env_vars.items(): + os.environ[key] = str(value) + + logger.info(f"Set SageMaker environment variables: {list(env_vars.keys())}") + + +def main(): + logger.info("Starting SageMaker container entry point") + + # Read hyperparameters + hyperparameters_path = '/opt/ml/input/config/hyperparameters.json' + if not os.path.exists(hyperparameters_path): + logger.error("Error: hyperparameters.json not found!") + sys.exit(1) + + with open(hyperparameters_path, 'r') as f: + hyperparameters = json.load(f) 
+ logger.info(f"Hyperparameters: {hyperparameters}") + + # Set up environment based on hyperparameters + # Get program name from hyperparameters or environment variable + if 'sagemaker_program' in hyperparameters: + program = hyperparameters['sagemaker_program'].strip('"\'') + os.environ['SAGEMAKER_PROGRAM'] = program + elif 'SAGEMAKER_PROGRAM' in os.environ: + program = os.environ['SAGEMAKER_PROGRAM'] + else: + logger.error("Error: sagemaker_program not found in hyperparameters or environment!") + sys.exit(1) + + logger.info(f"Using program: {program}") + + # Get source directory from hyperparameters + if 'sagemaker_submit_directory' in hyperparameters: + s3_source = hyperparameters['sagemaker_submit_directory'].strip('"\'') + logger.info(f"Downloading source from: {s3_source}") + + # Download and extract source code + submit_dir = download_and_extract_s3(s3_source) + + # Install requirements + install_requirements(os.path.join(submit_dir, "requirements.txt")) + else: + logger.info("No sagemaker_submit_directory specified, assuming code is already in /opt/ml/code") + submit_dir = "/opt/ml/code" + + # Check if directory exists + if not os.path.exists(submit_dir): + logger.error(f"Code directory {submit_dir} does not exist!") + sys.exit(1) + + # List code directory contents for debugging + logger.info(f"Contents of {submit_dir}:") + try: + output = subprocess.check_output(['ls', '-la', submit_dir]) + logger.info(output.decode('utf-8')) + except Exception as e: + logger.error(f"Failed to list directory: {e}") + + # Set up SageMaker environment variables + setup_sagemaker_environment() + + # Ensure directories exist + os.makedirs(os.environ["SM_MODEL_DIR"], exist_ok=True) + os.makedirs(os.environ["SM_OUTPUT_DATA_DIR"], exist_ok=True) + + # Locate entry point script + entry_point = os.path.join(submit_dir, program) + if not os.path.exists(entry_point): + logger.error(f"Error: Entry point '{entry_point}' not found!") + sys.exit(1) + + logger.info(f"Running entry point: 
{entry_point}") + sys.stdout.flush() + + # Execute with proper arguments + cmd = [ + sys.executable, entry_point, + "--model-dir", os.environ["SM_MODEL_DIR"], + "--output-data-dir", os.environ["SM_OUTPUT_DATA_DIR"], + "--train", os.environ["SM_CHANNEL_TRAIN"] + ] + + logger.info(f"Executing: {' '.join(cmd)}") + + # Replace current process with the entry point script and arguments + try: + os.execv(sys.executable, cmd) + except Exception as e: + logger.error(f"Failed to execute entry point: {e}") + sys.exit(1) + + +if __name__ == '__main__': + main() From 49f0032b7be3b7e93f812e0b3841d0b77f310626 Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sat, 1 Mar 2025 12:09:45 -0700 Subject: [PATCH 04/35] adding the code/docker for the workbench model image generation (WIP) --- model_docker_images/inference/Dockerfile | 3 + model_docker_images/scripts/build_deploy.sh | 41 +- model_docker_images/scripts/test_training.py | 148 ------- model_docker_images/tests/data/abalone_sm.csv | 100 +++++ .../tests/example_model_script.py | 379 ++++++++++++++++++ model_docker_images/tests/run_tests.sh | 48 +++ .../{scripts => tests}/test_inference.py | 0 model_docker_images/tests/test_training.py | 167 ++++++++ model_docker_images/training/Dockerfile | 3 + .../training/sagemaker_entrypoint.py | 20 +- .../training/test_container.sh | 25 -- model_docker_images/training/train.py | 153 ------- 12 files changed, 719 insertions(+), 368 deletions(-) delete mode 100644 model_docker_images/scripts/test_training.py create mode 100644 model_docker_images/tests/data/abalone_sm.csv create mode 100644 model_docker_images/tests/example_model_script.py create mode 100644 model_docker_images/tests/run_tests.sh rename model_docker_images/{scripts => tests}/test_inference.py (100%) create mode 100644 model_docker_images/tests/test_training.py delete mode 100755 model_docker_images/training/test_container.sh delete mode 100644 model_docker_images/training/train.py diff --git 
a/model_docker_images/inference/Dockerfile b/model_docker_images/inference/Dockerfile index 5130b7831..a09da2460 100644 --- a/model_docker_images/inference/Dockerfile +++ b/model_docker_images/inference/Dockerfile @@ -1,5 +1,8 @@ FROM python:3.12-slim +# Install Vim +RUN apt-get update && apt-get install -y vim + # Copy requirements file COPY requirements.txt /tmp/ diff --git a/model_docker_images/scripts/build_deploy.sh b/model_docker_images/scripts/build_deploy.sh index 236dc3dcf..6ca52bdc2 100755 --- a/model_docker_images/scripts/build_deploy.sh +++ b/model_docker_images/scripts/build_deploy.sh @@ -74,7 +74,7 @@ deploy_image() { for REGION in "${REGION_LIST[@]}"; do echo "Processing region: ${REGION}" - # Construct the ECR repository URL (using your account ID 507740646243) + # Construct the ECR repository URL ECR_REPO="507740646243.dkr.ecr.${REGION}.amazonaws.com/model_images/${image_name}" AWS_ECR_IMAGE="${ECR_REPO}:${tag}" @@ -140,39 +140,6 @@ else echo "Inference image: ${INFERENCE_IMAGE}:${IMAGE_VERSION}" echo "======================================" - # Ask if user wants to test the containers - read -p "Do you want to test the containers? (y/n) " -n 1 -r - echo - if [[ $REPLY =~ ^[Yy]$ ]]; then - # Test training container - echo "======================================" - echo "๐Ÿงช Testing training container" - echo "======================================" - python "$SCRIPT_DIR/test_training.py" --image "${TRAINING_IMAGE}:${IMAGE_VERSION}" - - # Test inference container - echo "======================================" - echo "๐Ÿงช Testing inference container" - echo "======================================" - - # Start the inference container in the background - echo "Starting inference container..." - CONTAINER_ID=$(docker run -d -p 8080:8080 "${INFERENCE_IMAGE}:${IMAGE_VERSION}") - - # Wait for the container to initialize - echo "Waiting for server to initialize (5 seconds)..." 
- sleep 5 - - # Run the test - python "$SCRIPT_DIR/test_inference.py" - - # Stop and remove the container - echo "Stopping inference container..." - docker stop $CONTAINER_ID - docker rm $CONTAINER_ID - - echo "======================================" - echo -e "${GREEN}โœ… Testing completed!${NC}" - echo "======================================" - fi -fi \ No newline at end of file + # Inform about testing option + echo "To test these containers, run: $PROJECT_ROOT/tests/scripts/run_tests.sh ${IMAGE_VERSION}" +fi diff --git a/model_docker_images/scripts/test_training.py b/model_docker_images/scripts/test_training.py deleted file mode 100644 index ecba030a1..000000000 --- a/model_docker_images/scripts/test_training.py +++ /dev/null @@ -1,148 +0,0 @@ -import os -import json -import argparse -import tempfile -import subprocess -import numpy as np -import pandas as pd - - -def create_test_data(data_dir, rows=100, cols=5): - """Create synthetic training data for testing.""" - print(f"Creating synthetic training data in {data_dir}") - - # Generate synthetic features and target - X = np.random.randn(rows, cols) - y = 2 * X[:, 0] + 3 * X[:, 1] - 1.5 * X[:, 2] + 0.5 * X[:, 3] - X[:, 4] + np.random.randn(rows) * 0.1 - - # Create dataframe - cols = [f"feature_{i}" for i in range(cols)] - df = pd.DataFrame(X, columns=cols) - df['target'] = y - - # Create train directory - train_dir = os.path.join(data_dir, 'train') - os.makedirs(train_dir, exist_ok=True) - - # Save to CSV - train_file = os.path.join(train_dir, 'train.csv') - df.to_csv(train_file, index=False) - print(f"Saved {rows} rows of training data to {train_file}") - - return train_file - - -def create_hyperparameters(config_dir): - """Create hyperparameters.json file for the training container.""" - print(f"Creating hyperparameters in {config_dir}") - - # Define hyperparameters - hyperparameters = { - "max_depth": "6", - "learning_rate": "0.1", - "n_estimators": "100", - "objective": "reg:squarederror" - } - - # Create 
config directory - os.makedirs(config_dir, exist_ok=True) - - # Save hyperparameters - hyperparameters_file = os.path.join(config_dir, 'hyperparameters.json') - with open(hyperparameters_file, 'w') as f: - json.dump(hyperparameters, f) - - print(f"Saved hyperparameters to {hyperparameters_file}") - return hyperparameters_file - - -def test_training_container(image_name, temp_dir): - """Run the training container with test data and verify outputs.""" - print(f"\n๐Ÿ”ฌ Testing training container: {image_name}") - - # Create directory structure to mimic SageMaker - input_dir = os.path.join(temp_dir, 'input') - data_dir = os.path.join(input_dir, 'data') - config_dir = os.path.join(input_dir, 'config') - model_dir = os.path.join(temp_dir, 'model') - output_dir = os.path.join(temp_dir, 'output') - - os.makedirs(data_dir, exist_ok=True) - os.makedirs(config_dir, exist_ok=True) - os.makedirs(model_dir, exist_ok=True) - os.makedirs(output_dir, exist_ok=True) - - # Create test data and hyperparameters - create_test_data(data_dir) - create_hyperparameters(config_dir) - - # Run the container - print("\n๐Ÿ“ฆ Running training container...") - - cmd = [ - "docker", "run", - "--rm", - "-v", f"{temp_dir}:/opt/ml", - image_name - ] - - try: - # Execute the training container - subprocess.run(cmd, check=True) - - # Check if model files were created - model_files = os.listdir(model_dir) - if not model_files: - print("โŒ Training failed: No model files created") - return False - - print(f"โœ… Training succeeded! 
Model files created: {', '.join(model_files)}") - - # Check for specific expected files - expected_files = ['model.joblib', 'metadata.json'] - missing_files = [f for f in expected_files if f not in model_files] - - if missing_files: - print(f"โš ๏ธ Warning: Some expected files are missing: {', '.join(missing_files)}") - else: - print("โœ… All expected model files were created") - - return True - - except subprocess.CalledProcessError as e: - print(f"โŒ Training failed with error code {e.returncode}") - - # Check if there's a failure file with more details - failure_file = os.path.join(output_dir, 'failure') - if os.path.exists(failure_file): - with open(failure_file, 'r') as f: - failure_content = f.read() - print(f"Error details:\n{failure_content}") - - return False - - -def run_training_test(image_name="aws_model_training:latest"): - """Run the training container test with a temporary directory.""" - print("๐Ÿš€ Starting training container test") - - # Create temporary directory for training data - with tempfile.TemporaryDirectory() as temp_dir: - print(f"Using temporary directory: {temp_dir}") - success = test_training_container(image_name, temp_dir) - - if success: - print("\n๐ŸŽ‰ Training container test passed!") - else: - print("\nโŒ Training container test failed!") - - return success - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Test the AWS model training container") - parser.add_argument("--image", default="aws_model_training:latest", - help="Docker image name for the training container") - - args = parser.parse_args() - run_training_test(args.image) \ No newline at end of file diff --git a/model_docker_images/tests/data/abalone_sm.csv b/model_docker_images/tests/data/abalone_sm.csv new file mode 100644 index 000000000..0198e6bc8 --- /dev/null +++ b/model_docker_images/tests/data/abalone_sm.csv @@ -0,0 +1,100 @@ 
+sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,class_number_of_rings,auto_id +M,0.53,0.43,0.135,0.879,0.28,0.2165,0.25,10,3400 +M,0.645,0.49,0.16,1.251,0.5355,0.3345,0.3165,9,2614 +F,0.69,0.545,0.205,1.933,0.7855,0.429,0.498,13,2618 +I,0.55,0.4,0.135,0.717,0.3315,0.1495,0.221,9,3663 +I,0.415,0.33,0.09,0.3595,0.17,0.081,0.09,6,920 +M,0.465,0.36,0.105,0.488,0.188,0.0845,0.19,10,3137 +M,0.59,0.475,0.155,0.857,0.356,0.174,0.28,13,2225 +M,0.52,0.425,0.155,0.7735,0.297,0.123,0.255,17,3271 +M,0.695,0.525,0.175,1.742,0.696,0.389,0.505,12,2621 +F,0.585,0.42,0.155,1.034,0.437,0.2225,0.32,11,3667 +I,0.525,0.385,0.13,0.607,0.2355,0.125,0.195,8,2754 +F,0.675,0.51,0.185,1.473,0.6295,0.3025,0.4245,11,1971 +I,0.435,0.335,0.105,0.3535,0.156,0.05,0.1135,7,3016 +I,0.435,0.345,0.12,0.3215,0.13,0.056,0.1185,7,1844 +I,0.525,0.4,0.125,0.5655,0.2435,0.119,0.175,8,3810 +F,0.52,0.395,0.18,0.64,0.158,0.11,0.245,22,675 +I,0.415,0.315,0.105,0.33,0.1405,0.0705,0.095,6,2508 +I,0.415,0.325,0.115,0.3285,0.1405,0.051,0.106,12,2378 +I,0.575,0.44,0.15,0.983,0.486,0.215,0.239,8,3666 +I,0.55,0.435,0.14,0.7535,0.3285,0.1555,0.2325,10,1314 +M,0.675,0.515,0.15,1.312,0.556,0.2845,0.4115,11,1970 +I,0.43,0.325,0.09,0.425,0.217,0.087,0.095,7,926 +F,0.67,0.54,0.165,1.5015,0.518,0.358,0.505,14,420 +M,0.745,0.565,0.215,1.931,0.896,0.4585,0.5,11,1205 +M,0.57,0.45,0.14,0.9275,0.477,0.1605,0.2515,8,3819 +F,0.605,0.48,0.175,1.1685,0.4815,0.2305,0.356,9,3822 +M,0.48,0.375,0.115,0.6765,0.3205,0.1065,0.17,6,949 +F,0.58,0.45,0.17,0.9705,0.4615,0.232,0.248,9,2908 +I,0.42,0.31,0.095,0.279,0.1255,0.051,0.088,6,1078 +M,0.705,0.56,0.22,1.981,0.8175,0.3085,0.76,14,168 +F,0.59,0.465,0.16,1.1005,0.506,0.2525,0.295,13,2259 +I,0.33,0.25,0.095,0.2085,0.102,0.0395,0.052,7,1220 +F,0.595,0.465,0.155,1.026,0.4645,0.112,0.305,12,1351 +I,0.36,0.275,0.11,0.2335,0.095,0.0525,0.085,10,440 +I,0.46,0.35,0.115,0.4155,0.18,0.098,0.1175,7,1092 +F,0.675,0.52,0.175,1.494,0.7365,0.3055,0.37,9,4100 
+F,0.575,0.46,0.165,1.065,0.4985,0.2145,0.2815,8,3454 +F,0.395,0.3,0.105,0.3375,0.1435,0.0755,0.098,12,3323 +M,0.665,0.525,0.165,1.338,0.5515,0.3575,0.35,18,32 +I,0.52,0.395,0.125,0.5805,0.2445,0.146,0.165,9,1864 +I,0.585,0.475,0.16,1.0505,0.48,0.234,0.285,10,1342 +M,0.5,0.375,0.15,0.636,0.2535,0.145,0.19,10,690 +I,0.51,0.395,0.155,0.5395,0.2465,0.1085,0.167,8,2650 +I,0.315,0.23,0.08,0.1375,0.0545,0.031,0.0445,5,1217 +F,0.47,0.355,0.13,0.5465,0.2005,0.126,0.185,14,564 +M,0.58,0.47,0.165,1.041,0.54,0.166,0.279,9,3570 +F,0.55,0.425,0.135,0.8515,0.362,0.196,0.27,14,41 +F,0.47,0.36,0.13,0.472,0.182,0.114,0.15,10,304 +I,0.505,0.39,0.15,0.685,0.362,0.131,0.156,8,962 +F,0.55,0.44,0.135,0.8435,0.434,0.1995,0.185,8,2659 +I,0.45,0.345,0.135,0.443,0.1975,0.0875,0.1175,14,571 +I,0.44,0.355,0.165,0.435,0.159,0.105,0.14,16,2402 +M,0.4,0.32,0.095,0.303,0.1335,0.06,0.1,7,51 +I,0.295,0.225,0.09,0.1105,0.0405,0.0245,0.032,7,709 +I,0.445,0.355,0.095,0.3615,0.1415,0.0785,0.12,8,3540 +I,0.47,0.345,0.14,0.4615,0.229,0.1105,0.116,9,1452 +M,0.635,0.525,0.205,1.484,0.55,0.3115,0.43,20,278 +I,0.415,0.315,0.1,0.3645,0.1765,0.0795,0.095,8,2632 +I,0.435,0.335,0.11,0.383,0.1555,0.0675,0.135,12,2374 +F,0.525,0.415,0.15,0.7155,0.2355,0.171,0.27,13,3949 +I,0.55,0.445,0.145,0.783,0.3045,0.157,0.265,11,3036 +F,0.57,0.46,0.17,1.1,0.4125,0.2205,0.38,14,2252 +M,0.515,0.4,0.14,0.6335,0.288,0.145,0.168,9,2020 +F,0.525,0.405,0.115,0.72,0.3105,0.1915,0.2,14,3192 +F,0.565,0.4,0.13,0.6975,0.3075,0.1665,0.18,8,983 +M,0.675,0.515,0.145,1.265,0.6025,0.299,0.325,10,3596 +F,0.37,0.29,0.115,0.25,0.111,0.057,0.075,9,591 +F,0.475,0.365,0.13,0.4805,0.1905,0.114,0.1475,12,2422 +F,0.55,0.415,0.18,1.1655,0.502,0.301,0.311,9,3731 +M,0.6,0.475,0.19,1.0875,0.403,0.2655,0.325,14,336 +F,0.44,0.34,0.14,0.482,0.186,0.1085,0.16,9,205 +I,0.315,0.23,0.0,0.134,0.0575,0.0285,0.3505,6,3996 +I,0.325,0.24,0.07,0.152,0.0565,0.0305,0.054,8,2041 +I,0.47,0.345,0.115,0.4885,0.2005,0.108,0.166,11,603 
+F,0.505,0.375,0.18,0.568,0.2325,0.1495,0.17,12,343 +M,0.635,0.49,0.16,1.101,0.534,0.1865,0.3455,10,1389 +M,0.535,0.41,0.135,0.862,0.2855,0.1525,0.32,14,738 +F,0.595,0.435,0.15,0.9,0.4175,0.17,0.265,8,1651 +M,0.515,0.4,0.16,0.8175,0.2515,0.156,0.3,23,2436 +M,0.455,0.35,0.11,0.458,0.2,0.111,0.1305,8,3089 +I,0.42,0.315,0.115,0.355,0.1895,0.065,0.087,6,2047 +M,0.465,0.34,0.105,0.486,0.231,0.1035,0.1225,9,2571 +M,0.72,0.565,0.2,2.1055,1.017,0.363,0.494,12,1527 +F,0.54,0.415,0.15,0.8115,0.3875,0.1875,0.2035,9,2833 +F,0.655,0.455,0.17,1.275,0.583,0.303,0.333,8,3621 +M,0.675,0.525,0.185,1.587,0.6935,0.336,0.395,13,356 +F,0.555,0.43,0.135,0.812,0.4055,0.163,0.2215,9,3494 +M,0.41,0.3,0.1,0.301,0.124,0.069,0.09,9,3362 +I,0.4,0.31,0.1,0.2875,0.1145,0.0635,0.095,10,2320 +I,0.32,0.215,0.095,0.305,0.14,0.067,0.0885,6,2975 +I,0.27,0.205,0.05,0.084,0.03,0.0185,0.029,6,3629 +F,0.625,0.5,0.15,0.953,0.3445,0.2235,0.305,15,495 +M,0.59,0.47,0.15,0.9955,0.481,0.232,0.24,8,1152 +M,0.59,0.465,0.14,1.046,0.4695,0.263,0.263,7,2592 +F,0.54,0.42,0.14,0.805,0.369,0.1725,0.21,11,846 +I,0.28,0.2,0.075,0.1225,0.0545,0.0115,0.035,5,2153 +M,0.575,0.47,0.185,0.985,0.3745,0.2175,0.355,10,1636 +M,0.72,0.6,0.235,2.2385,0.984,0.411,0.621,12,3993 +M,0.655,0.53,0.195,1.388,0.567,0.2735,0.41,13,467 diff --git a/model_docker_images/tests/example_model_script.py b/model_docker_images/tests/example_model_script.py new file mode 100644 index 000000000..bb736ac7c --- /dev/null +++ b/model_docker_images/tests/example_model_script.py @@ -0,0 +1,379 @@ +# Template Placeholders +TEMPLATE_PARAMS = { + "model_type": "regressor", + "target_column": "class_number_of_rings", + "feature_list": ['length', 'diameter', 'height', 'whole_weight', 'shucked_weight', 'viscera_weight', 'shell_weight', 'auto_id'], + "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/training/abalone-regression", + "train_all_data": False +} + +# Imports for XGB Model +import xgboost as xgb +import awswrangler as wr + +# Model 
Performance Scores +from sklearn.metrics import ( + mean_absolute_error, + r2_score, + root_mean_squared_error, + precision_recall_fscore_support, + confusion_matrix, +) + +# Classification Encoder +from sklearn.preprocessing import LabelEncoder + +# Scikit Learn Imports +from sklearn.model_selection import train_test_split + +from io import StringIO +import json +import argparse +import joblib +import os +import pandas as pd +from typing import List + + +# Function to check if dataframe is empty +def check_dataframe(df: pd.DataFrame, df_name: str) -> None: + """ + Check if the provided dataframe is empty and raise an exception if it is. + + Args: + df (pd.DataFrame): DataFrame to check + df_name (str): Name of the DataFrame + """ + if df.empty: + msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***" + print(msg) + raise ValueError(msg) + + +def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame: + """ + Expands a column in a DataFrame containing a list of probabilities into separate columns. 
+ + Args: + df (pd.DataFrame): DataFrame containing a "pred_proba" column + class_labels (List[str]): List of class labels + + Returns: + pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns + """ + + # Sanity check + proba_column = "pred_proba" + if proba_column not in df.columns: + raise ValueError('DataFrame does not contain a "pred_proba" column') + + # Construct new column names with '_proba' suffix + new_col_names = [f"{label}_proba" for label in class_labels] + + # Expand the proba_column into separate columns for each probability + proba_df = pd.DataFrame(df[proba_column].tolist(), columns=new_col_names) + + # Drop the original proba_column and reset the index in prep for the concat + df = df.drop(columns=[proba_column]) + df = df.reset_index(drop=True) + + # Concatenate the new columns with the original DataFrame + df = pd.concat([df, proba_df], axis=1) + print(df) + return df + + +def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame: + """ + Matches and renames the DataFrame's column names to match the model's feature names (case-insensitive). + Prioritizes exact case matches first, then falls back to case-insensitive matching if no exact match exists. + + Args: + df (pd.DataFrame): The DataFrame with the original columns. + model_features (list): The desired list of feature names (mixed case allowed). + + Returns: + pd.DataFrame: The DataFrame with renamed columns to match the model's feature names. 
+ """ + # Create a mapping for exact and case-insensitive matching + exact_match_set = set(df.columns) + column_map = {} + + # Build the case-insensitive map (if we have any duplicate columns, the first one wins) + for col in df.columns: + lower_col = col.lower() + if lower_col not in column_map: + column_map[lower_col] = col + + # Create a dictionary for renaming + rename_dict = {} + for feature in model_features: + # Check for an exact match first + if feature in exact_match_set: + rename_dict[feature] = feature + + # If not an exact match, fall back to case-insensitive matching + elif feature.lower() in column_map: + rename_dict[column_map[feature.lower()]] = feature + + # Rename the columns in the DataFrame to match the model's feature names + return df.rename(columns=rename_dict) + + +if __name__ == "__main__": + """The main function is for training the XGBoost model""" + + # Harness Template Parameters + target = TEMPLATE_PARAMS["target_column"] + feature_list = TEMPLATE_PARAMS["feature_list"] + model_type = TEMPLATE_PARAMS["model_type"] + model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"] + train_all_data = TEMPLATE_PARAMS["train_all_data"] + validation_split = 0.2 + + # Sagemaker specific arguments. Defaults are set in the environment variables. 
+ parser = argparse.ArgumentParser() + parser.add_argument( + "--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"] + ) + parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) + parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) + args = parser.parse_args() + + # Read the training data into DataFrames + training_files = [ + os.path.join(args.train, file) + for file in os.listdir(args.train) + if file.endswith(".csv") + ] + print(f"Training Files: {training_files}") + + # Combine files and read them all into a single pandas dataframe + all_df = pd.concat([pd.read_csv(file, engine="python") for file in training_files]) + + # Check if the dataframe is empty + check_dataframe(all_df, "training_df") + + # Features/Target output + print(f"Target: {target}") + print(f"Features: {str(feature_list)}") + + # Do we want to train on all the data? + if train_all_data: + print("Training on ALL of the data") + df_train = all_df.copy() + df_val = all_df.copy() + + # Does the dataframe have a training column? 
+ elif "training" in all_df.columns: + print("Found training column, splitting data based on training column") + df_train = all_df[all_df["training"]].copy() + df_val = all_df[~all_df["training"]].copy() + else: + # Just do a random training Split + print("WARNING: No training column found, splitting data with random state=42") + df_train, df_val = train_test_split( + all_df, test_size=validation_split, random_state=42 + ) + print(f"FIT/TRAIN: {df_train.shape}") + print(f"VALIDATION: {df_val.shape}") + + # Now spin up our XGB Model + if model_type == "classifier": + xgb_model = xgb.XGBClassifier() + + # Encode the target column + label_encoder = LabelEncoder() + df_train[target] = label_encoder.fit_transform(df_train[target]) + df_val[target] = label_encoder.transform(df_val[target]) + + else: + xgb_model = xgb.XGBRegressor() + label_encoder = None # We don't need this for regression + + # Grab our Features, Target and Train the Model + y = df_train[target] + X = df_train[feature_list] + xgb_model.fit(X, y) + + # Make Predictions on the Validation Set + print(f"Making Predictions on Validation Set...") + preds = xgb_model.predict(df_val[feature_list]) + if model_type == "classifier": + # Also get the probabilities for each class + print("Processing Probabilities...") + probs = xgb_model.predict_proba(df_val[feature_list]) + df_val["pred_proba"] = [p.tolist() for p in probs] + + # Expand the pred_proba column into separate columns for each class + print(df_val.columns) + df_val = expand_proba_column(df_val, label_encoder.classes_) + print(df_val.columns) + + # Decode the target and prediction labels + df_val[target] = label_encoder.inverse_transform(df_val[target]) + preds = label_encoder.inverse_transform(preds) + + # Save predictions to S3 (just the target, prediction, and '_proba' columns) + # Note: Skipping this for our test script + """ + df_val["prediction"] = preds + output_columns = [target, "prediction"] + output_columns += [col for col in df_val.columns if 
col.endswith("_proba")] + wr.s3.to_csv( + df_val[output_columns], + path=f"{model_metrics_s3_path}/validation_predictions.csv", + index=False, + ) + """ + + # Report Performance Metrics + if model_type == "classifier": + # Get the label names and their integer mapping + label_names = label_encoder.classes_ + + # Calculate various model performance metrics + scores = precision_recall_fscore_support( + df_val[target], preds, average=None, labels=label_names + ) + + # Put the scores into a dataframe + score_df = pd.DataFrame( + { + target: label_names, + "precision": scores[0], + "recall": scores[1], + "fscore": scores[2], + "support": scores[3], + } + ) + + # We need to get creative with the Classification Metrics + metrics = ["precision", "recall", "fscore", "support"] + for t in label_names: + for m in metrics: + value = score_df.loc[score_df[target] == t, m].iloc[0] + print(f"Metrics:{t}:{m} {value}") + + # Compute and output the confusion matrix + conf_mtx = confusion_matrix(df_val[target], preds, labels=label_names) + for i, row_name in enumerate(label_names): + for j, col_name in enumerate(label_names): + value = conf_mtx[i, j] + print(f"ConfusionMatrix:{row_name}:{col_name} {value}") + + else: + # Calculate various model performance metrics (regression) + rmse = root_mean_squared_error(df_val[target], preds) + mae = mean_absolute_error(df_val[target], preds) + r2 = r2_score(df_val[target], preds) + print(f"RMSE: {rmse:.3f}") + print(f"MAE: {mae:.3f}") + print(f"R2: {r2:.3f}") + print(f"NumRows: {len(df_val)}") + + # Now save the model to the standard place/name + xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json")) + if label_encoder: + joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib")) + + # Also save the features (this will validate input during predictions) + with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp: + json.dump(feature_list, fp) + + +def model_fn(model_dir): + """Deserialized 
and return fitted model""" + + # Load our XGBoost model from the model directory + model_path = os.path.join(model_dir, "xgb_model.json") + with open(model_path, "r") as f: + model_json = json.load(f) + saved_model_type = json.loads(model_json.get('learner').get('attributes').get('scikit_learn')).get('_estimator_type') + if saved_model_type == "classifier": + model = xgb.XGBClassifier() + elif saved_model_type == "regressor": + model = xgb.XGBRegressor() + else: + msg = f"Model type ({saved_model_type}) not recognized. Expected 'classifier' or 'regressor'" + raise ValueError(msg) + + model.load_model(model_path) + return model + + +def input_fn(input_data, content_type): + """Parse input data and return a DataFrame.""" + if not input_data: + raise ValueError("Empty input data is not supported!") + + # Decode bytes to string if necessary + if isinstance(input_data, bytes): + input_data = input_data.decode("utf-8") + + if "text/csv" in content_type: + return pd.read_csv(StringIO(input_data)) + elif "application/json" in content_type: + return pd.DataFrame(json.loads(input_data)) # Assumes JSON array of records + else: + raise ValueError(f"{content_type} not supported!") + + +def output_fn(output_df, accept_type): + """Supports both CSV and JSON output formats.""" + if "text/csv" in accept_type: + csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values + return csv_output, "text/csv" + elif "application/json" in accept_type: + return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null) + else: + raise RuntimeError(f"{accept_type} accept type is not supported by this script.") + + +def predict_fn(df, model) -> pd.DataFrame: + """Make Predictions with our XGB Model + + Args: + df (pd.DataFrame): The input DataFrame + model: The model use for predictions + + Returns: + pd.DataFrame: The DataFrame with the predictions added + """ + + # Grab our feature columns (from training) + model_dir = 
os.environ["SM_MODEL_DIR"] + with open(os.path.join(model_dir, "feature_columns.json")) as fp: + model_features = json.load(fp) + print(f"Model Features: {model_features}") + + # Load our Label Encoder if we have one + label_encoder = None + if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")): + label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib")) + + # We're going match features in a case-insensitive manner, accounting for all the permutations + # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos") + # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos") + matched_df = match_features_case_insensitive(df, model_features) + + # Predict the features against our XGB Model + predictions = model.predict(matched_df[model_features]) + + # If we have a label encoder, decode the predictions + if label_encoder: + predictions = label_encoder.inverse_transform(predictions) + + # Set the predictions on the DataFrame + df["prediction"] = predictions + + # Does our model have a 'predict_proba' method? 
If so we will call it and add the results to the DataFrame + if getattr(model, "predict_proba", None): + probs = model.predict_proba(matched_df[model_features]) + df["pred_proba"] = [p.tolist() for p in probs] + + # Expand the pred_proba column into separate columns for each class + df = expand_proba_column(df, label_encoder.classes_) + + # All done, return the DataFrame with new columns for the predictions + return df diff --git a/model_docker_images/tests/run_tests.sh b/model_docker_images/tests/run_tests.sh new file mode 100644 index 000000000..65cbd1514 --- /dev/null +++ b/model_docker_images/tests/run_tests.sh @@ -0,0 +1,48 @@ +#!/bin/bash +set -e + +# Get the directory of this script +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)" +# Get the project root directory +PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" + +# Configuration +TRAINING_IMAGE="aws_model_training" +INFERENCE_IMAGE="aws_model_inference" +IMAGE_VERSION=${1:-"0.1"} + +# Colors for output +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Test training container +echo "======================================" +echo "๐Ÿงช Testing training container" +echo "======================================" +python "$SCRIPT_DIR/test_training.py" --image "${TRAINING_IMAGE}:${IMAGE_VERSION}" + +# Test inference container +echo "======================================" +echo "๐Ÿงช Testing inference container" +echo "======================================" + +# Start the inference container in the background +echo "Starting inference container..." +CONTAINER_ID=$(docker run -d -p 8080:8080 "${INFERENCE_IMAGE}:${IMAGE_VERSION}") + +# Wait for the container to initialize +echo "Waiting for server to initialize (5 seconds)..." +sleep 5 + +# Run the test +python "$SCRIPT_DIR/test_inference.py" + +# Stop and remove the container +echo "Stopping inference container..." 
+docker stop $CONTAINER_ID +docker rm $CONTAINER_ID + +echo "======================================" +echo -e "${GREEN}โœ… Testing completed!${NC}" +echo "======================================" \ No newline at end of file diff --git a/model_docker_images/scripts/test_inference.py b/model_docker_images/tests/test_inference.py similarity index 100% rename from model_docker_images/scripts/test_inference.py rename to model_docker_images/tests/test_inference.py diff --git a/model_docker_images/tests/test_training.py b/model_docker_images/tests/test_training.py new file mode 100644 index 000000000..f451439d7 --- /dev/null +++ b/model_docker_images/tests/test_training.py @@ -0,0 +1,167 @@ +import os +import json +import shutil +import argparse +import subprocess +import tempfile +import time +from pathlib import Path + + +def setup_sagemaker_directories(): + """Create a temporary directory structure that mimics SageMaker's layout.""" + base_dir = tempfile.mkdtemp(prefix="sagemaker-test-") + + # Create the SageMaker directory structure + os.makedirs(f"{base_dir}/input/data/train", exist_ok=True) + os.makedirs(f"{base_dir}/input/config", exist_ok=True) + os.makedirs(f"{base_dir}/model", exist_ok=True) + os.makedirs(f"{base_dir}/output/data", exist_ok=True) + os.makedirs(f"{base_dir}/code", exist_ok=True) + + return base_dir + + +def copy_sample_data(base_dir, data_file): + """Copy sample data to the training directory.""" + if not os.path.exists(data_file): + raise FileNotFoundError(f"Sample data file not found: {data_file}") + + shutil.copy2(data_file, f"{base_dir}/input/data/train/") + print(f"Copied sample data: {data_file} to {base_dir}/input/data/train/") + + +def copy_model_script(base_dir, script_file): + """Copy the model script to the code directory.""" + if not os.path.exists(script_file): + raise FileNotFoundError(f"Model script not found: {script_file}") + + shutil.copy2(script_file, f"{base_dir}/code/") + print(f"Copied model script: {script_file} to 
{base_dir}/code/") + + return os.path.basename(script_file) + + +def create_hyperparameters(base_dir, script_name, hyperparams=None): + """Create a hyperparameters.json file with SageMaker-specific entries.""" + if hyperparams is None: + hyperparams = {} + + # Add required SageMaker hyperparameters + hyperparams["sagemaker_program"] = script_name + hyperparams["sagemaker_submit_directory"] = "/opt/ml/code" + + # Write the hyperparameters to a JSON file + with open(f"{base_dir}/input/config/hyperparameters.json", "w") as f: + json.dump(hyperparams, f) + + print(f"Created hyperparameters.json with script: {script_name}") + + +def run_training_container(base_dir, image_name, script_name): + """Run the training container with the proper volume mounts and environment variables.""" + # Build the Docker command + cmd = [ + "docker", "run", "--rm", + "-v", f"{base_dir}/input:/opt/ml/input", + "-v", f"{base_dir}/model:/opt/ml/model", + "-v", f"{base_dir}/output:/opt/ml/output", + "-v", f"{base_dir}/code:/opt/ml/code", + "-e", f"SAGEMAKER_PROGRAM={script_name}", + "-e", "SM_MODEL_DIR=/opt/ml/model", + "-e", "SM_OUTPUT_DATA_DIR=/opt/ml/output/data", + "-e", "SM_CHANNEL_TRAIN=/opt/ml/input/data/train", + image_name + ] + + print(f"Running training container with command: {' '.join(cmd)}") + + start_time = time.time() + try: + subprocess.run(cmd, check=True) + end_time = time.time() + print(f"Training completed in {end_time - start_time:.2f} seconds") + return True + except subprocess.CalledProcessError as e: + print(f"Error running training container: {e}") + return False + + +def check_training_output(base_dir): + """Check if the training produced the expected output files.""" + model_dir = f"{base_dir}/model" + output_dir = f"{base_dir}/output" + + # Check if model files were created + model_files = os.listdir(model_dir) + print(f"Files in model directory: {model_files}") + + # Check for xgb_model.json which should be created by our example script + if "xgb_model.json" in 
model_files and "feature_columns.json" in model_files: + print("โœ… Training successful! Model files were created.") + return True + else: + print("โŒ Training failed! Expected model files were not created.") + return False + + +def main(): + parser = argparse.ArgumentParser(description="Test SageMaker training container") + parser.add_argument("--image", type=str, required=True, help="Training image name:tag") + parser.add_argument("--script", type=str, default="example_model_script.py", + help="Path to the model script to test") + parser.add_argument("--data", type=str, default="tests/data/abalone_sm.csv", + help="Path to sample data file") + args = parser.parse_args() + + # Resolve paths relative to script location + script_dir = Path(__file__).parent.absolute() + project_root = script_dir.parent + + if not os.path.isabs(args.script): + args.script = os.path.join(script_dir, args.script) + + if not os.path.isabs(args.data): + args.data = os.path.join(project_root, args.data) + + try: + # Setup the SageMaker-like directory structure + base_dir = setup_sagemaker_directories() + print(f"Created SageMaker test environment at: {base_dir}") + + # Copy the sample data + copy_sample_data(base_dir, args.data) + + # Copy the model script and get its basename + script_name = copy_model_script(base_dir, args.script) + + # Create hyperparameters.json + # You could add more hyperparameters here specific to your model + hyperparams = { + "model_type": "regressor", + "target_column": "rings", + "feature_list": '["length", "diameter", "height", "whole_weight", "shucked_weight", "viscera_weight", "shell_weight"]', + "train_all_data": "False" + } + create_hyperparameters(base_dir, script_name, hyperparams) + + # Run the training container + success = run_training_container(base_dir, args.image, script_name) + + if success: + # Check if training produced expected output + check_training_output(base_dir) + + # Cleanup + print(f"Temporary files are in: {base_dir}") + print("Not 
removing temporary files for debugging purposes.") + # If you want to auto-cleanup, uncomment the following line: + # shutil.rmtree(base_dir) + + except Exception as e: + print(f"Error during test: {e}") + raise + + +if __name__ == "__main__": + main() diff --git a/model_docker_images/training/Dockerfile b/model_docker_images/training/Dockerfile index 74f6caf44..02f1c96cd 100644 --- a/model_docker_images/training/Dockerfile +++ b/model_docker_images/training/Dockerfile @@ -1,5 +1,8 @@ FROM python:3.12-slim +# Install Vim +RUN apt-get update && apt-get install -y vim + # Copy requirements file COPY requirements.txt /tmp/ diff --git a/model_docker_images/training/sagemaker_entrypoint.py b/model_docker_images/training/sagemaker_entrypoint.py index 50f3acc4f..671e32319 100644 --- a/model_docker_images/training/sagemaker_entrypoint.py +++ b/model_docker_images/training/sagemaker_entrypoint.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python import os import sys import json @@ -99,11 +100,20 @@ def main(): # Get source directory from hyperparameters if 'sagemaker_submit_directory' in hyperparameters: - s3_source = hyperparameters['sagemaker_submit_directory'].strip('"\'') - logger.info(f"Downloading source from: {s3_source}") - - # Download and extract source code - submit_dir = download_and_extract_s3(s3_source) + submit_dir_value = hyperparameters['sagemaker_submit_directory'].strip('"\'') + logger.info(f"Source directory: {submit_dir_value}") + + # Check if it's an S3 URI or a local path + if submit_dir_value.startswith('s3://'): + logger.info(f"Downloading source from S3: {submit_dir_value}") + submit_dir = download_and_extract_s3(submit_dir_value) + else: + logger.info(f"Using local source directory: {submit_dir_value}") + submit_dir = submit_dir_value + # Verify the directory exists + if not os.path.exists(submit_dir): + logger.error(f"Local directory not found: {submit_dir}") + sys.exit(1) # Install requirements install_requirements(os.path.join(submit_dir, 
"requirements.txt")) diff --git a/model_docker_images/training/test_container.sh b/model_docker_images/training/test_container.sh deleted file mode 100755 index cdc1382b4..000000000 --- a/model_docker_images/training/test_container.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -set -e - -# Determine script and project directories -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)" -PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" -SCRIPTS_DIR="$PROJECT_ROOT/scripts" - -# Default image name with latest tag -DEFAULT_IMAGE="aws_model_training:0.1" -IMAGE_NAME=${1:-$DEFAULT_IMAGE} - -echo "๐Ÿ“‹ Training Container Test Script" -echo "======================================" - -# Make sure test_training.py exists -if [ ! -f "$SCRIPTS_DIR/test_training.py" ]; then - echo "โŒ Error: test_training.py not found in $SCRIPTS_DIR" - exit 1 -fi - -echo "๐Ÿš€ Testing Training Container: $IMAGE_NAME" -python "$SCRIPTS_DIR/test_training.py" --image "$IMAGE_NAME" - -echo "======================================" \ No newline at end of file diff --git a/model_docker_images/training/train.py b/model_docker_images/training/train.py deleted file mode 100644 index d88c2482c..000000000 --- a/model_docker_images/training/train.py +++ /dev/null @@ -1,153 +0,0 @@ -import os -import json -import sys -import traceback -import pandas as pd -from sklearn.model_selection import train_test_split -import xgboost as xgb -import joblib - -# SageMaker paths -prefix = '/opt/ml/' -input_path = prefix + 'input/data' -model_path = os.path.join(prefix, 'model') -param_path = os.path.join(prefix, 'input/config/hyperparameters.json') -output_path = os.path.join(prefix, 'output') - -# Channel names for training and validation data -training_channel_name = 'train' -eval_channel_name = 'validation' - - -# Load hyperparameters -def load_hyperparameters(): - with open(param_path, 'r') as tc: - hyperparameters = json.load(tc) - - # Convert hyperparameters from strings to appropriate types - 
processed_params = {} - for key, value in hyperparameters.items(): - # Try to convert to int, float, or bool as appropriate - try: - # Convert to int if it looks like an int - if value.isdigit() or (value.startswith('-') and value[1:].isdigit()): - processed_params[key] = int(value) - # Convert to float if it has a decimal point - elif '.' in value: - try: - processed_params[key] = float(value) - except ValueError: - processed_params[key] = value - # Handle boolean values - elif value.lower() in ['true', 'false']: - processed_params[key] = value.lower() == 'true' - else: - processed_params[key] = value - except (AttributeError, ValueError): - # If conversion fails, keep as string - processed_params[key] = value - - return processed_params - - -# Load training data -def load_data(): - train_path = os.path.join(input_path, training_channel_name) - - # Get all CSV files in training directory - train_files = [os.path.join(train_path, file) for file in os.listdir(train_path) - if file.endswith('.csv')] - - if not train_files: - raise ValueError(f"No CSV files found in {train_path}") - - # Read and concatenate all training files - dfs = [] - for file in train_files: - df = pd.read_csv(file) - dfs.append(df) - - if not dfs: - raise ValueError("No valid data found in training files") - - return pd.concat(dfs, ignore_index=True) - - -# Train the model -def train(): - print("Starting the training process") - - try: - # Load hyperparameters - hyperparameters = load_hyperparameters() - print(f"Loaded hyperparameters: {hyperparameters}") - - # Load training data - train_data = load_data() - print(f"Loaded training data with shape: {train_data.shape}") - - # Extract features and target - # Assumes last column is the target - X = train_data.iloc[:, :-1] - y = train_data.iloc[:, -1] - - # Train/test split - X_train, X_val, y_train, y_val = train_test_split( - X, y, test_size=0.2, random_state=42 - ) - - # Configure model parameters from hyperparameters or use defaults - max_depth 
= hyperparameters.get('max_depth', 6) - learning_rate = hyperparameters.get('learning_rate', 0.1) - n_estimators = hyperparameters.get('n_estimators', 100) - - # Create and train model with a simpler approach - # Removed early stopping and eval_set to ensure compatibility - model = xgb.XGBRegressor( - max_depth=max_depth, - learning_rate=learning_rate, - n_estimators=n_estimators - ) - - print("Training model...") - model.fit(X_train, y_train) - - # Evaluate on validation set - val_score = model.score(X_val, y_val) - print(f"Validation Rยฒ score: {val_score:.4f}") - - # Save the model - os.makedirs(model_path, exist_ok=True) - model_file = os.path.join(model_path, 'model.joblib') - - # Save additional metadata about the model - feature_names = X.columns.tolist() - model_metadata = { - 'feature_names': feature_names, - 'hyperparameters': hyperparameters, - 'validation_score': val_score - } - metadata_file = os.path.join(model_path, 'metadata.json') - - print(f"Saving model to {model_file}") - joblib.dump(model, model_file) - - print(f"Saving metadata to {metadata_file}") - with open(metadata_file, 'w') as f: - json.dump(model_metadata, f) - - print("Training completed successfully") - - except Exception as e: - # Write out an error file - trc = traceback.format_exc() - with open(os.path.join(output_path, 'failure'), 'w') as s: - s.write('Exception during training: ' + str(e) + '\n' + trc) - # Printing this causes the exception to be in the training job logs - print('Exception during training: ' + str(e) + '\n' + trc, file=sys.stderr) - # A non-zero exit code causes the training job to be marked as Failed - sys.exit(255) - - -if __name__ == '__main__': - train() \ No newline at end of file From 8874c4348a4eccc211e2ac46ccb1fd98c70a92fe Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sat, 1 Mar 2025 12:17:18 -0700 Subject: [PATCH 05/35] making a mock_estimator class for testing the training image --- model_docker_images/tests/test_training.py | 353 
+++++++++++++-------- 1 file changed, 217 insertions(+), 136 deletions(-) diff --git a/model_docker_images/tests/test_training.py b/model_docker_images/tests/test_training.py index f451439d7..baa7783d3 100644 --- a/model_docker_images/tests/test_training.py +++ b/model_docker_images/tests/test_training.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python import os import json import shutil @@ -8,160 +9,240 @@ from pathlib import Path -def setup_sagemaker_directories(): - """Create a temporary directory structure that mimics SageMaker's layout.""" - base_dir = tempfile.mkdtemp(prefix="sagemaker-test-") - - # Create the SageMaker directory structure - os.makedirs(f"{base_dir}/input/data/train", exist_ok=True) - os.makedirs(f"{base_dir}/input/config", exist_ok=True) - os.makedirs(f"{base_dir}/model", exist_ok=True) - os.makedirs(f"{base_dir}/output/data", exist_ok=True) - os.makedirs(f"{base_dir}/code", exist_ok=True) - - return base_dir - - -def copy_sample_data(base_dir, data_file): - """Copy sample data to the training directory.""" - if not os.path.exists(data_file): - raise FileNotFoundError(f"Sample data file not found: {data_file}") - - shutil.copy2(data_file, f"{base_dir}/input/data/train/") - print(f"Copied sample data: {data_file} to {base_dir}/input/data/train/") - - -def copy_model_script(base_dir, script_file): - """Copy the model script to the code directory.""" - if not os.path.exists(script_file): - raise FileNotFoundError(f"Model script not found: {script_file}") - - shutil.copy2(script_file, f"{base_dir}/code/") - print(f"Copied model script: {script_file} to {base_dir}/code/") - - return os.path.basename(script_file) - - -def create_hyperparameters(base_dir, script_name, hyperparams=None): - """Create a hyperparameters.json file with SageMaker-specific entries.""" - if hyperparams is None: - hyperparams = {} - - # Add required SageMaker hyperparameters - hyperparams["sagemaker_program"] = script_name - hyperparams["sagemaker_submit_directory"] = "/opt/ml/code" 
- - # Write the hyperparameters to a JSON file - with open(f"{base_dir}/input/config/hyperparameters.json", "w") as f: - json.dump(hyperparams, f) - - print(f"Created hyperparameters.json with script: {script_name}") - - -def run_training_container(base_dir, image_name, script_name): - """Run the training container with the proper volume mounts and environment variables.""" - # Build the Docker command - cmd = [ - "docker", "run", "--rm", - "-v", f"{base_dir}/input:/opt/ml/input", - "-v", f"{base_dir}/model:/opt/ml/model", - "-v", f"{base_dir}/output:/opt/ml/output", - "-v", f"{base_dir}/code:/opt/ml/code", - "-e", f"SAGEMAKER_PROGRAM={script_name}", - "-e", "SM_MODEL_DIR=/opt/ml/model", - "-e", "SM_OUTPUT_DATA_DIR=/opt/ml/output/data", - "-e", "SM_CHANNEL_TRAIN=/opt/ml/input/data/train", - image_name - ] - - print(f"Running training container with command: {' '.join(cmd)}") - - start_time = time.time() - try: - subprocess.run(cmd, check=True) - end_time = time.time() - print(f"Training completed in {end_time - start_time:.2f} seconds") - return True - except subprocess.CalledProcessError as e: - print(f"Error running training container: {e}") - return False - - -def check_training_output(base_dir): - """Check if the training produced the expected output files.""" - model_dir = f"{base_dir}/model" - output_dir = f"{base_dir}/output" - - # Check if model files were created - model_files = os.listdir(model_dir) - print(f"Files in model directory: {model_files}") - - # Check for xgb_model.json which should be created by our example script - if "xgb_model.json" in model_files and "feature_columns.json" in model_files: - print("โœ… Training successful! Model files were created.") - return True - else: - print("โŒ Training failed! Expected model files were not created.") - return False +class MockEstimator: + """ + Mock SageMaker Estimator class that simulates the behavior of sagemaker.estimator.Estimator + for local testing purposes. 
+ """ + + def __init__(self, + image_uri, + entry_point=None, + source_dir=None, + hyperparameters=None, + role=None, + instance_type=None, + **kwargs): + """ + Initialize a MockEstimator with the same parameters as a real SageMaker Estimator. + + Args: + image_uri (str): The Docker image URI to use for training + entry_point (str): The name of the training script + source_dir (str): Directory with the training script and any additional files + hyperparameters (dict): Hyperparameters for the training job + role (str): AWS IAM role (not used in mock) + instance_type (str): EC2 instance type (not used in mock) + **kwargs: Additional arguments + """ + self.image_uri = image_uri + self.entry_point = entry_point + self.source_dir = source_dir + self.hyperparameters = hyperparameters or {} + self.role = role # Not used in mock + self.instance_type = instance_type # Not used in mock + self.kwargs = kwargs + self.temp_dir = None + self.model_data = None + + def fit(self, inputs, job_name=None, wait=True, logs=True): + """ + Train the model using the input data. 
+ + Args: + inputs (dict): Dictionary of input data channels + job_name (str): Name for the training job + wait (bool): Whether to wait for the job to complete + logs (bool): Whether to show the logs + + Returns: + self: The estimator itself + """ + print(f"Starting mock training job: {job_name or 'unnamed-job'}") + + try: + # Create SageMaker directory structure + self.temp_dir = tempfile.mkdtemp(prefix="sagemaker-test-") + print(f"Created SageMaker test environment at: {self.temp_dir}") + + # Create the SageMaker directory structure + os.makedirs(f"{self.temp_dir}/input/data/train", exist_ok=True) + os.makedirs(f"{self.temp_dir}/input/config", exist_ok=True) + os.makedirs(f"{self.temp_dir}/model", exist_ok=True) + os.makedirs(f"{self.temp_dir}/output/data", exist_ok=True) + os.makedirs(f"{self.temp_dir}/code", exist_ok=True) + + # Process input channels and copy data + for channel_name, channel_data in inputs.items(): + channel_dir = f"{self.temp_dir}/input/data/{channel_name}" + os.makedirs(channel_dir, exist_ok=True) + + # Assuming channel_data is a local file path for this mock implementation + if os.path.isfile(channel_data): + shutil.copy2(channel_data, channel_dir) + print(f"Copied data file: {channel_data} to {channel_dir}") + elif os.path.isdir(channel_data): + for file in os.listdir(channel_data): + if file.endswith(".csv"): + shutil.copy2(os.path.join(channel_data, file), channel_dir) + print(f"Copied data file: {os.path.join(channel_data, file)} to {channel_dir}") + + # Copy source files to code directory + if self.source_dir and os.path.exists(self.source_dir): + # Copy all Python files from source_dir + for file in os.listdir(self.source_dir): + if file.endswith(".py"): + shutil.copy2(os.path.join(self.source_dir, file), f"{self.temp_dir}/code") + print(f"Copied source file: {os.path.join(self.source_dir, file)} to {self.temp_dir}/code") + + # Prepare hyperparameters.json + # The key SageMaker parameters + sagemaker_params = { + "sagemaker_program": 
self.entry_point, + "sagemaker_submit_directory": "/opt/ml/code" # Container path + } + + # Combine with user hyperparameters + all_hyperparams = {**self.hyperparameters, **sagemaker_params} + + # Write the hyperparameters to a JSON file + with open(f"{self.temp_dir}/input/config/hyperparameters.json", "w") as f: + json.dump(all_hyperparams, f) + + print(f"Created hyperparameters.json with entry point: {self.entry_point}") + + # Build the Docker command + cmd = [ + "docker", "run", "--rm", + "-v", f"{self.temp_dir}/input:/opt/ml/input", + "-v", f"{self.temp_dir}/model:/opt/ml/model", + "-v", f"{self.temp_dir}/output:/opt/ml/output", + "-v", f"{self.temp_dir}/code:/opt/ml/code", + "-e", f"SAGEMAKER_PROGRAM={self.entry_point}", + "-e", "SM_MODEL_DIR=/opt/ml/model", + "-e", "SM_OUTPUT_DATA_DIR=/opt/ml/output/data", + "-e", "SM_CHANNEL_TRAIN=/opt/ml/input/data/train" + ] + + # Add platform flag for Mac M1/M2/M3 users + if os.uname().machine == 'arm64': + cmd.insert(2, "--platform") + cmd.insert(3, "linux/amd64") + + # Add the image URI + cmd.append(self.image_uri) + + print(f"Running training container with command: {' '.join(cmd)}") + + # Run the container + start_time = time.time() + try: + if logs: + # Run with output visible + subprocess.run(cmd, check=True) + else: + # Run silently + subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + end_time = time.time() + print(f"Training completed in {end_time - start_time:.2f} seconds") + + # Check the output + self._check_training_output() + + # Set the model data path (like SageMaker would) + self.model_data = f"{self.temp_dir}/model" + + return self + + except subprocess.CalledProcessError as e: + print(f"Error running training container: {e}") + if e.stdout: + print(f"STDOUT: {e.stdout.decode('utf-8')}") + if e.stderr: + print(f"STDERR: {e.stderr.decode('utf-8')}") + raise + + except Exception as e: + print(f"Error during fit: {e}") + raise + + def _check_training_output(self): + """Check 
if the training produced output files in the model directory.""" + model_dir = f"{self.temp_dir}/model" + model_files = os.listdir(model_dir) + + if not model_files: + print("โŒ Warning: No files found in model directory after training") + else: + print(f"โœ… Found model files: {', '.join(model_files)}") + + def cleanup(self): + """Remove temporary directories.""" + if self.temp_dir and os.path.exists(self.temp_dir): + print(f"Cleaning up temporary directory: {self.temp_dir}") + shutil.rmtree(self.temp_dir) + self.temp_dir = None def main(): + """Run the test using a MockEstimator.""" parser = argparse.ArgumentParser(description="Test SageMaker training container") - parser.add_argument("--image", type=str, required=True, help="Training image name:tag") - parser.add_argument("--script", type=str, default="example_model_script.py", - help="Path to the model script to test") + parser.add_argument("--image", type=str, default="aws_model_training:0.1", help="Training image name:tag") + parser.add_argument("--entry-point", type=str, default="example_model_script.py", + help="Name of the training script") + parser.add_argument("--source-dir", type=str, default="tests/", + help="Directory containing the training script") parser.add_argument("--data", type=str, default="tests/data/abalone_sm.csv", - help="Path to sample data file") + help="Path to training data file or directory") + parser.add_argument("--cleanup", action="store_true", help="Clean up temporary files after test") args = parser.parse_args() - # Resolve paths relative to script location + # Handle relative paths script_dir = Path(__file__).parent.absolute() project_root = script_dir.parent - if not os.path.isabs(args.script): - args.script = os.path.join(script_dir, args.script) + if not os.path.isabs(args.source_dir): + args.source_dir = os.path.join(project_root, args.source_dir) if not os.path.isabs(args.data): args.data = os.path.join(project_root, args.data) + print(f"Testing with:") + print(f" Image: 
{args.image}") + print(f" Entry point: {args.entry_point}") + print(f" Source directory: {args.source_dir}") + print(f" Training data: {args.data}") + + # Create the estimator + estimator = MockEstimator( + image_uri=args.image, + entry_point=args.entry_point, + source_dir=args.source_dir, + # Common SageMaker instance type for training + instance_type="ml.m5.large" + ) + try: - # Setup the SageMaker-like directory structure - base_dir = setup_sagemaker_directories() - print(f"Created SageMaker test environment at: {base_dir}") - - # Copy the sample data - copy_sample_data(base_dir, args.data) - - # Copy the model script and get its basename - script_name = copy_model_script(base_dir, args.script) - - # Create hyperparameters.json - # You could add more hyperparameters here specific to your model - hyperparams = { - "model_type": "regressor", - "target_column": "rings", - "feature_list": '["length", "diameter", "height", "whole_weight", "shucked_weight", "viscera_weight", "shell_weight"]', - "train_all_data": "False" - } - create_hyperparameters(base_dir, script_name, hyperparams) - - # Run the training container - success = run_training_container(base_dir, args.image, script_name) - - if success: - # Check if training produced expected output - check_training_output(base_dir) - - # Cleanup - print(f"Temporary files are in: {base_dir}") - print("Not removing temporary files for debugging purposes.") - # If you want to auto-cleanup, uncomment the following line: - # shutil.rmtree(base_dir) + # Run training + estimator.fit( + inputs={"train": args.data}, + job_name="mock-training-job" + ) + print("๐Ÿ“‹ MockEstimator training completed successfully") except Exception as e: - print(f"Error during test: {e}") + print(f"โŒ MockEstimator training failed: {e}") raise + finally: + # Clean up if requested + if args.cleanup: + estimator.cleanup() + else: + print(f"Temporary files are in: {estimator.temp_dir}") + print("Not removing temporary files for debugging purposes.") + 
if __name__ == "__main__": - main() + main() \ No newline at end of file From 115d86a6c958df1441a7cdd110855ffc9ba26784 Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sat, 1 Mar 2025 12:21:36 -0700 Subject: [PATCH 06/35] just some cleanup --- model_docker_images/tests/test_training.py | 25 ++++++---------------- 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/model_docker_images/tests/test_training.py b/model_docker_images/tests/test_training.py index baa7783d3..88d6ac3af 100644 --- a/model_docker_images/tests/test_training.py +++ b/model_docker_images/tests/test_training.py @@ -121,17 +121,10 @@ def fit(self, inputs, job_name=None, wait=True, logs=True): "-e", f"SAGEMAKER_PROGRAM={self.entry_point}", "-e", "SM_MODEL_DIR=/opt/ml/model", "-e", "SM_OUTPUT_DATA_DIR=/opt/ml/output/data", - "-e", "SM_CHANNEL_TRAIN=/opt/ml/input/data/train" + "-e", "SM_CHANNEL_TRAIN=/opt/ml/input/data/train", + self.image_uri ] - # Add platform flag for Mac M1/M2/M3 users - if os.uname().machine == 'arm64': - cmd.insert(2, "--platform") - cmd.insert(3, "linux/amd64") - - # Add the image URI - cmd.append(self.image_uri) - print(f"Running training container with command: {' '.join(cmd)}") # Run the container @@ -195,7 +188,7 @@ def main(): help="Directory containing the training script") parser.add_argument("--data", type=str, default="tests/data/abalone_sm.csv", help="Path to training data file or directory") - parser.add_argument("--cleanup", action="store_true", help="Clean up temporary files after test") + # Removed cleanup argument since we always clean up args = parser.parse_args() # Handle relative paths @@ -219,7 +212,6 @@ def main(): image_uri=args.image, entry_point=args.entry_point, source_dir=args.source_dir, - # Common SageMaker instance type for training instance_type="ml.m5.large" ) @@ -236,13 +228,10 @@ def main(): raise finally: - # Clean up if requested - if args.cleanup: - estimator.cleanup() - else: - print(f"Temporary files are in: {estimator.temp_dir}") 
- print("Not removing temporary files for debugging purposes.") + # Always clean up temporary files + estimator.cleanup() + print("Temporary files have been cleaned up.") if __name__ == "__main__": - main() \ No newline at end of file + main() From 72dfacf818b1c44a4b110cb0fead603344a4572c Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sat, 1 Mar 2025 12:22:34 -0700 Subject: [PATCH 07/35] cleanup and simplification --- model_docker_images/tests/test_training.py | 199 +++++++-------------- 1 file changed, 60 insertions(+), 139 deletions(-) diff --git a/model_docker_images/tests/test_training.py b/model_docker_images/tests/test_training.py index 88d6ac3af..c7f3b8593 100644 --- a/model_docker_images/tests/test_training.py +++ b/model_docker_images/tests/test_training.py @@ -10,108 +10,60 @@ class MockEstimator: - """ - Mock SageMaker Estimator class that simulates the behavior of sagemaker.estimator.Estimator - for local testing purposes. - """ - - def __init__(self, - image_uri, - entry_point=None, - source_dir=None, - hyperparameters=None, - role=None, - instance_type=None, - **kwargs): - """ - Initialize a MockEstimator with the same parameters as a real SageMaker Estimator. 
- - Args: - image_uri (str): The Docker image URI to use for training - entry_point (str): The name of the training script - source_dir (str): Directory with the training script and any additional files - hyperparameters (dict): Hyperparameters for the training job - role (str): AWS IAM role (not used in mock) - instance_type (str): EC2 instance type (not used in mock) - **kwargs: Additional arguments - """ + """Mock SageMaker Estimator for local container testing""" + + def __init__(self, image_uri, entry_point=None, source_dir=None, hyperparameters=None, **kwargs): self.image_uri = image_uri self.entry_point = entry_point self.source_dir = source_dir self.hyperparameters = hyperparameters or {} - self.role = role # Not used in mock - self.instance_type = instance_type # Not used in mock - self.kwargs = kwargs self.temp_dir = None self.model_data = None - def fit(self, inputs, job_name=None, wait=True, logs=True): - """ - Train the model using the input data. - - Args: - inputs (dict): Dictionary of input data channels - job_name (str): Name for the training job - wait (bool): Whether to wait for the job to complete - logs (bool): Whether to show the logs - - Returns: - self: The estimator itself - """ + def fit(self, inputs, job_name=None, logs=True): + """Train the model using the input data""" print(f"Starting mock training job: {job_name or 'unnamed-job'}") try: - # Create SageMaker directory structure + # Set up SageMaker directory structure self.temp_dir = tempfile.mkdtemp(prefix="sagemaker-test-") - print(f"Created SageMaker test environment at: {self.temp_dir}") + print(f"Created test environment at: {self.temp_dir}") - # Create the SageMaker directory structure - os.makedirs(f"{self.temp_dir}/input/data/train", exist_ok=True) - os.makedirs(f"{self.temp_dir}/input/config", exist_ok=True) - os.makedirs(f"{self.temp_dir}/model", exist_ok=True) - os.makedirs(f"{self.temp_dir}/output/data", exist_ok=True) - os.makedirs(f"{self.temp_dir}/code", exist_ok=True) + 
# Create directories + for path in ['input/data/train', 'input/config', 'model', 'output/data', 'code']: + os.makedirs(f"{self.temp_dir}/{path}", exist_ok=True) - # Process input channels and copy data + # Copy data files for channel_name, channel_data in inputs.items(): channel_dir = f"{self.temp_dir}/input/data/{channel_name}" os.makedirs(channel_dir, exist_ok=True) - # Assuming channel_data is a local file path for this mock implementation if os.path.isfile(channel_data): shutil.copy2(channel_data, channel_dir) - print(f"Copied data file: {channel_data} to {channel_dir}") + print(f"Copied data: {os.path.basename(channel_data)} to {channel_name} channel") elif os.path.isdir(channel_data): for file in os.listdir(channel_data): if file.endswith(".csv"): shutil.copy2(os.path.join(channel_data, file), channel_dir) - print(f"Copied data file: {os.path.join(channel_data, file)} to {channel_dir}") # Copy source files to code directory if self.source_dir and os.path.exists(self.source_dir): - # Copy all Python files from source_dir for file in os.listdir(self.source_dir): if file.endswith(".py"): shutil.copy2(os.path.join(self.source_dir, file), f"{self.temp_dir}/code") - print(f"Copied source file: {os.path.join(self.source_dir, file)} to {self.temp_dir}/code") + print(f"Copied source files to code directory") - # Prepare hyperparameters.json - # The key SageMaker parameters - sagemaker_params = { + # Create hyperparameters.json + all_hyperparams = { + **self.hyperparameters, "sagemaker_program": self.entry_point, - "sagemaker_submit_directory": "/opt/ml/code" # Container path + "sagemaker_submit_directory": "/opt/ml/code" } - # Combine with user hyperparameters - all_hyperparams = {**self.hyperparameters, **sagemaker_params} - - # Write the hyperparameters to a JSON file with open(f"{self.temp_dir}/input/config/hyperparameters.json", "w") as f: json.dump(all_hyperparams, f) - print(f"Created hyperparameters.json with entry point: {self.entry_point}") - - # Build the 
Docker command + # Run the container cmd = [ "docker", "run", "--rm", "-v", f"{self.temp_dir}/input:/opt/ml/input", @@ -125,113 +77,82 @@ def fit(self, inputs, job_name=None, wait=True, logs=True): self.image_uri ] - print(f"Running training container with command: {' '.join(cmd)}") - - # Run the container - start_time = time.time() - try: - if logs: - # Run with output visible - subprocess.run(cmd, check=True) - else: - # Run silently - subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - - end_time = time.time() - print(f"Training completed in {end_time - start_time:.2f} seconds") + # Add platform flag for Mac M1/M2/M3 users + if os.uname().machine == 'arm64': + cmd.insert(2, "--platform") + cmd.insert(3, "linux/amd64") - # Check the output - self._check_training_output() - - # Set the model data path (like SageMaker would) - self.model_data = f"{self.temp_dir}/model" - - return self - - except subprocess.CalledProcessError as e: - print(f"Error running training container: {e}") - if e.stdout: - print(f"STDOUT: {e.stdout.decode('utf-8')}") - if e.stderr: - print(f"STDERR: {e.stderr.decode('utf-8')}") - raise + print(f"Running training container...") + start_time = time.time() + result = subprocess.run(cmd, check=True, capture_output=not logs) + training_time = time.time() - start_time + print(f"Training completed in {training_time:.2f} seconds") + + # Check output + model_files = os.listdir(f"{self.temp_dir}/model") + if model_files: + print(f"โœ… Model created successfully with files: {', '.join(model_files)}") + else: + print("โš ๏ธ No model files were created during training") + + return self + + except subprocess.CalledProcessError as e: + print(f"โŒ Training failed with exit code {e.returncode}") + if e.stdout: + print(f"STDOUT: {e.stdout.decode('utf-8')}") + if e.stderr: + print(f"STDERR: {e.stderr.decode('utf-8')}") + raise except Exception as e: - print(f"Error during fit: {e}") + print(f"โŒ Error during training: {e}") 
raise - def _check_training_output(self): - """Check if the training produced output files in the model directory.""" - model_dir = f"{self.temp_dir}/model" - model_files = os.listdir(model_dir) - - if not model_files: - print("โŒ Warning: No files found in model directory after training") - else: - print(f"โœ… Found model files: {', '.join(model_files)}") - def cleanup(self): - """Remove temporary directories.""" + """Remove temporary directories""" if self.temp_dir and os.path.exists(self.temp_dir): - print(f"Cleaning up temporary directory: {self.temp_dir}") shutil.rmtree(self.temp_dir) self.temp_dir = None def main(): - """Run the test using a MockEstimator.""" + """Run the test using a MockEstimator""" parser = argparse.ArgumentParser(description="Test SageMaker training container") parser.add_argument("--image", type=str, default="aws_model_training:0.1", help="Training image name:tag") - parser.add_argument("--entry-point", type=str, default="example_model_script.py", - help="Name of the training script") - parser.add_argument("--source-dir", type=str, default="tests/", - help="Directory containing the training script") - parser.add_argument("--data", type=str, default="tests/data/abalone_sm.csv", - help="Path to training data file or directory") - # Removed cleanup argument since we always clean up + parser.add_argument("--entry-point", type=str, default="example_model_script.py", help="Training script name") + parser.add_argument("--source-dir", type=str, default="tests/", help="Directory containing training scripts") + parser.add_argument("--data", type=str, default="tests/data/abalone_sm.csv", help="Training data path") args = parser.parse_args() - # Handle relative paths + # Resolve relative paths script_dir = Path(__file__).parent.absolute() project_root = script_dir.parent - if not os.path.isabs(args.source_dir): - args.source_dir = os.path.join(project_root, args.source_dir) - - if not os.path.isabs(args.data): - args.data = 
os.path.join(project_root, args.data) + source_dir = os.path.join(project_root, args.source_dir) if not os.path.isabs(args.source_dir) else args.source_dir + data_path = os.path.join(project_root, args.data) if not os.path.isabs(args.data) else args.data - print(f"Testing with:") - print(f" Image: {args.image}") - print(f" Entry point: {args.entry_point}") - print(f" Source directory: {args.source_dir}") - print(f" Training data: {args.data}") + print(f"Testing with image {args.image}, script {args.entry_point}") - # Create the estimator + # Create and run the estimator estimator = MockEstimator( image_uri=args.image, entry_point=args.entry_point, - source_dir=args.source_dir, - instance_type="ml.m5.large" + source_dir=source_dir ) try: - # Run training estimator.fit( - inputs={"train": args.data}, + inputs={"train": data_path}, job_name="mock-training-job" ) - print("๐Ÿ“‹ MockEstimator training completed successfully") - + print("โœ… Training completed successfully") except Exception as e: - print(f"โŒ MockEstimator training failed: {e}") + print(f"โŒ Training failed: {e}") raise - finally: - # Always clean up temporary files estimator.cleanup() - print("Temporary files have been cleaned up.") if __name__ == "__main__": - main() + main() \ No newline at end of file From 99a6630ec12b9f4f2826db6e923571adfb8b0550 Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sat, 1 Mar 2025 12:25:48 -0700 Subject: [PATCH 08/35] cleanup and simplification --- .../training/sagemaker_entrypoint.py | 100 +++++++----------- 1 file changed, 36 insertions(+), 64 deletions(-) diff --git a/model_docker_images/training/sagemaker_entrypoint.py b/model_docker_images/training/sagemaker_entrypoint.py index 671e32319..807a82ee3 100644 --- a/model_docker_images/training/sagemaker_entrypoint.py +++ b/model_docker_images/training/sagemaker_entrypoint.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import os import sys import json @@ -9,8 +9,10 @@ from urllib.parse import 
urlparse # Set up logging -logging.basicConfig(level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) logger = logging.getLogger('sagemaker-entry-point') @@ -25,16 +27,15 @@ def download_and_extract_s3(s3_uri, target_dir="/opt/ml/code"): try: s3 = boto3.client("s3") s3.download_file(bucket, key, local_tar) - logger.info(f"Download successful, tar file size: {os.path.getsize(local_tar)} bytes") + logger.info(f"Download successful: {os.path.getsize(local_tar)} bytes") os.makedirs(target_dir, exist_ok=True) with tarfile.open(local_tar, "r:gz") as tar: tar.extractall(path=target_dir) - logger.info(f"Files in {target_dir} after extraction: {os.listdir(target_dir)}") return target_dir except Exception as e: - logger.error(f"Error downloading from S3: {str(e)}") + logger.error(f"Error downloading from S3: {e}") sys.exit(1) @@ -46,16 +47,16 @@ def install_requirements(requirements_path): subprocess.check_call([ sys.executable, "-m", "pip", "install", "-r", requirements_path ]) - logger.info("Requirements installation completed successfully.") + logger.info("Requirements installed successfully.") except subprocess.CalledProcessError as e: - logger.error(f"Error installing requirements: {str(e)}") + logger.error(f"Error installing requirements: {e}") sys.exit(1) else: logger.info(f"No requirements file found at {requirements_path}") -def setup_sagemaker_environment(): - """Set up SageMaker environment variables based on /opt/ml structure.""" +def setup_environment(): + """Set up SageMaker environment variables.""" env_vars = { "SM_MODEL_DIR": "/opt/ml/model", "SM_OUTPUT_DATA_DIR": "/opt/ml/output/data", @@ -65,92 +66,66 @@ def setup_sagemaker_environment(): "SM_INPUT_CONFIG_DIR": "/opt/ml/input/config" } - # Set the environment variables for key, value in env_vars.items(): os.environ[key] = str(value) + os.makedirs(value, exist_ok=True) 
- logger.info(f"Set SageMaker environment variables: {list(env_vars.keys())}") + logger.info(f"SageMaker environment initialized.") def main(): logger.info("Starting SageMaker container entry point") - # Read hyperparameters - hyperparameters_path = '/opt/ml/input/config/hyperparameters.json' - if not os.path.exists(hyperparameters_path): - logger.error("Error: hyperparameters.json not found!") + # Load hyperparameters + hyperparams_path = '/opt/ml/input/config/hyperparameters.json' + if not os.path.exists(hyperparams_path): + logger.error("hyperparameters.json not found!") sys.exit(1) - with open(hyperparameters_path, 'r') as f: - hyperparameters = json.load(f) - logger.info(f"Hyperparameters: {hyperparameters}") + with open(hyperparams_path, 'r') as f: + hyperparams = json.load(f) - # Set up environment based on hyperparameters - # Get program name from hyperparameters or environment variable - if 'sagemaker_program' in hyperparameters: - program = hyperparameters['sagemaker_program'].strip('"\'') + # Get program name from hyperparameters or environment + if 'sagemaker_program' in hyperparams: + program = hyperparams['sagemaker_program'].strip('"\'') os.environ['SAGEMAKER_PROGRAM'] = program elif 'SAGEMAKER_PROGRAM' in os.environ: program = os.environ['SAGEMAKER_PROGRAM'] else: - logger.error("Error: sagemaker_program not found in hyperparameters or environment!") + logger.error("sagemaker_program not found in hyperparameters or environment!") sys.exit(1) logger.info(f"Using program: {program}") - # Get source directory from hyperparameters - if 'sagemaker_submit_directory' in hyperparameters: - submit_dir_value = hyperparameters['sagemaker_submit_directory'].strip('"\'') - logger.info(f"Source directory: {submit_dir_value}") + # Get source directory + submit_dir = "/opt/ml/code" + if 'sagemaker_submit_directory' in hyperparams: + submit_dir_value = hyperparams['sagemaker_submit_directory'].strip('"\'') - # Check if it's an S3 URI or a local path + # Handle S3 vs 
local path if submit_dir_value.startswith('s3://'): - logger.info(f"Downloading source from S3: {submit_dir_value}") submit_dir = download_and_extract_s3(submit_dir_value) else: - logger.info(f"Using local source directory: {submit_dir_value}") submit_dir = submit_dir_value - # Verify the directory exists if not os.path.exists(submit_dir): logger.error(f"Local directory not found: {submit_dir}") sys.exit(1) - # Install requirements - install_requirements(os.path.join(submit_dir, "requirements.txt")) - else: - logger.info("No sagemaker_submit_directory specified, assuming code is already in /opt/ml/code") - submit_dir = "/opt/ml/code" - - # Check if directory exists - if not os.path.exists(submit_dir): - logger.error(f"Code directory {submit_dir} does not exist!") - sys.exit(1) + # Install requirements if present + install_requirements(os.path.join(submit_dir, "requirements.txt")) - # List code directory contents for debugging - logger.info(f"Contents of {submit_dir}:") - try: - output = subprocess.check_output(['ls', '-la', submit_dir]) - logger.info(output.decode('utf-8')) - except Exception as e: - logger.error(f"Failed to list directory: {e}") + # Set up environment variables + setup_environment() - # Set up SageMaker environment variables - setup_sagemaker_environment() - - # Ensure directories exist - os.makedirs(os.environ["SM_MODEL_DIR"], exist_ok=True) - os.makedirs(os.environ["SM_OUTPUT_DATA_DIR"], exist_ok=True) - - # Locate entry point script + # Find entry point script entry_point = os.path.join(submit_dir, program) if not os.path.exists(entry_point): - logger.error(f"Error: Entry point '{entry_point}' not found!") + logger.error(f"Entry point not found: {entry_point}") sys.exit(1) - logger.info(f"Running entry point: {entry_point}") - sys.stdout.flush() + logger.info(f"Executing: {program}") - # Execute with proper arguments + # Execute the training script with SageMaker arguments cmd = [ sys.executable, entry_point, "--model-dir", 
os.environ["SM_MODEL_DIR"], @@ -158,9 +133,6 @@ def main(): "--train", os.environ["SM_CHANNEL_TRAIN"] ] - logger.info(f"Executing: {' '.join(cmd)}") - - # Replace current process with the entry point script and arguments try: os.execv(sys.executable, cmd) except Exception as e: From 0a060669619431437a0ce98c993a13e3e3b2290b Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sat, 1 Mar 2025 12:37:14 -0700 Subject: [PATCH 09/35] removing some old test code --- .../inference/test_container.sh | 50 ------------------- 1 file changed, 50 deletions(-) delete mode 100755 model_docker_images/inference/test_container.sh diff --git a/model_docker_images/inference/test_container.sh b/model_docker_images/inference/test_container.sh deleted file mode 100755 index 3157b3df7..000000000 --- a/model_docker_images/inference/test_container.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash -set -e - -# Determine script and project directories -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)" -PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" -SCRIPTS_DIR="$PROJECT_ROOT/scripts" - -# Default image name -DEFAULT_IMAGE="aws_model_inference:0.1" -IMAGE_NAME=${1:-$DEFAULT_IMAGE} - -# Port to use for testing -PORT=8080 - -echo "๐Ÿ“‹ Inference Container Test Script" -echo "======================================" - -# Make sure test script exists -if [ ! -f "$SCRIPTS_DIR/test_inference.py" ]; then - echo "โŒ Error: test_inference.py not found in $SCRIPTS_DIR" - exit 1 -fi - -# Start the inference container with proper log settings -echo "๐Ÿš€ Starting inference container: $IMAGE_NAME" -CONTAINER_ID=$(docker run -d -p $PORT:$PORT -e PYTHONUNBUFFERED=1 "$IMAGE_NAME") - -# Follow logs in the background -docker logs -f $CONTAINER_ID & -LOGS_PID=$! - -# Ensure container and log process are stopped on script exit -function cleanup { - echo "๐Ÿงน Stopping log process and container..." 
- kill $LOGS_PID 2>/dev/null || true - docker stop $CONTAINER_ID >/dev/null 2>&1 - docker rm $CONTAINER_ID >/dev/null 2>&1 -} -trap cleanup EXIT - -# Wait for container to initialize -echo "โณ Waiting for server to initialize (5 seconds)..." -sleep 5 - -# Run the test -echo "๐Ÿงช Testing inference container..." -python "$SCRIPTS_DIR/test_inference.py" --host localhost --port $PORT - -echo "======================================" \ No newline at end of file From 84d1db8289d1080086623c4d2c1ea7f730c4886f Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sat, 1 Mar 2025 12:47:24 -0700 Subject: [PATCH 10/35] refactoring inference entry_point and test harness --- model_docker_images/inference/main.py | 109 +++--- model_docker_images/tests/test_inference.py | 361 ++++++++++++++++---- 2 files changed, 334 insertions(+), 136 deletions(-) diff --git a/model_docker_images/inference/main.py b/model_docker_images/inference/main.py index 75b369396..0430d93d6 100644 --- a/model_docker_images/inference/main.py +++ b/model_docker_images/inference/main.py @@ -2,7 +2,6 @@ from contextlib import asynccontextmanager import os import json -import numpy as np import pandas as pd import joblib import logging @@ -11,69 +10,55 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -# Model will be accessible globally +# Global variables for model and metadata model = None model_metadata = None @asynccontextmanager async def lifespan(app: FastAPI): - # Load model on startup + """Handle model loading on startup and cleanup on shutdown.""" global model, model_metadata - - # SageMaker model path model_path = os.environ.get('MODEL_PATH', '/opt/ml/model') + model_file = os.path.join(model_path, 'model.joblib') try: logger.info(f"Loading model from {model_path}") - model_file = os.path.join(model_path, 'model.joblib') # Check if model file exists - if not os.path.exists(model_file): - logger.warning(f"Model file not found at {model_file}") - # List directory contents for 
debugging + if os.path.exists(model_file): + model = joblib.load(model_file) + logger.info(f"Model loaded successfully: {type(model)}") + else: + # Log the error and available files + logger.error(f"Model file not found at {model_file}") if os.path.exists(model_path): - logger.info(f"Contents of {model_path}: {os.listdir(model_path)}") + logger.error(f"Contents of {model_path}: {os.listdir(model_path)}") else: - logger.warning(f"Model directory {model_path} not found") + logger.error(f"Model directory {model_path} does not exist") - # For testing only - create a dummy model - logger.warning("Creating a dummy model for testing") - import xgboost as xgb - model = xgb.XGBRegressor() - model.fit(np.array([[1, 2, 3]]), np.array([1])) - else: - # Load the actual model - logger.info(f"Loading model from {model_file}") - model = joblib.load(model_file) - logger.info(f"Model loaded successfully: {type(model)}") + # Fail fast - no fallback for production + raise FileNotFoundError(f"Required model file not found: {model_file}") # Load metadata if available - try: - metadata_file = os.path.join(model_path, 'metadata.json') - if os.path.exists(metadata_file): - with open(metadata_file, 'r') as f: - model_metadata = json.load(f) - logger.info(f"Loaded model metadata: {model_metadata}") - else: - logger.warning(f"Metadata file not found at {metadata_file}") - model_metadata = {'feature_names': None} - except Exception as e: - logger.error(f"Error loading model metadata: {e}") + metadata_file = os.path.join(model_path, 'metadata.json') + if os.path.exists(metadata_file): + with open(metadata_file, 'r') as f: + model_metadata = json.load(f) + logger.info(f"Loaded model metadata") + else: + logger.info(f"No metadata found, using default") model_metadata = {'feature_names': None} + except Exception as e: logger.error(f"Error loading model: {e}", exc_info=True) - # Provide a fallback model for testing - import xgboost as xgb - model = xgb.XGBRegressor() - model.fit(np.array([[1, 2, 
3]]), np.array([1])) - model_metadata = {'feature_names': None} + # In production, we don't want to create fallback models + # Let the container fail to start + raise logger.info("Model initialization complete") yield - - # Cleanup on shutdown if needed - logger.info("Cleaning up resources") + logger.info("Shutting down model server") app = FastAPI(lifespan=lifespan) @@ -81,7 +66,7 @@ async def lifespan(app: FastAPI): @app.get('/ping') def ping(): - # SageMaker health check - return 200 if model is loaded + """Health check endpoint for SageMaker.""" if model is not None: return Response(status_code=200) return Response(status_code=404) @@ -89,48 +74,34 @@ def ping(): @app.post('/invocations') async def invoke(request: Request): - logger.info("Received inference request") + """Inference endpoint for SageMaker.""" content_type = request.headers.get('Content-Type', '') accept_type = request.headers.get('Accept', '') - logger.info(f"Content-Type: {content_type}, Accept: {accept_type}") - - # Get the data - body = await request.body() - try: - # Handle different content types - if content_type == 'text/csv': - # Parse CSV data + # Get request body + body = await request.body() + + # Parse input data based on content type + if 'text/csv' in content_type: s = body.decode('utf-8') data = pd.read_csv(pd.StringIO(s), header=None) - logger.info(f"Parsed CSV data with shape: {data.shape}") - else: - # Default to JSON + else: # Default to JSON json_str = body.decode('utf-8') - logger.info(f"Raw JSON input: {json_str}") data_json = json.loads(json_str) - logger.info(f"Parsed JSON data: {data_json}") - # Convert to DataFrame if it's not already - if not isinstance(data_json, pd.DataFrame): - data = pd.DataFrame(data_json) - else: - data = data_json + data = pd.DataFrame(data_json) if not isinstance(data_json, pd.DataFrame) else data_json # Make prediction - logger.info(f"Making prediction with data shape: {data.shape}") predictions = model.predict(data) - logger.info(f"Prediction 
successful, result shape: {len(predictions) if hasattr(predictions, '__len__') else 'scalar'}") - # Always return JSON unless explicitly requested as CSV - if accept_type == 'text/csv': + # Format response based on accept type + if 'text/csv' in accept_type: result = pd.DataFrame(predictions).to_csv(header=False, index=False) - logger.info(f"Returning CSV response: {result}") return Response(content=result, media_type='text/csv') - else: - # Default to JSON for everything else - result = json.dumps({'predictions': predictions.tolist() if hasattr(predictions, 'tolist') else float(predictions)}) - logger.info(f"Returning JSON response: {result}") + else: # Default to JSON + result = json.dumps({ + 'predictions': predictions.tolist() if hasattr(predictions, 'tolist') else float(predictions) + }) return Response(content=result, media_type='application/json') except Exception as e: diff --git a/model_docker_images/tests/test_inference.py b/model_docker_images/tests/test_inference.py index 223aa1dc7..52461c847 100644 --- a/model_docker_images/tests/test_inference.py +++ b/model_docker_images/tests/test_inference.py @@ -1,91 +1,318 @@ #!/usr/bin/env python -import requests +import os import json -import argparse import time +import argparse +import tempfile +import shutil +import subprocess +import requests +import pandas as pd +import numpy as np +from pathlib import Path -def test_inference_server(host="localhost", port=8080): - """ - Test the inference server running in the Docker container. 
- """ - base_url = f"http://{host}:{port}" +class MockModel: + """Mock SageMaker Model class that simulates the behavior of sagemaker.model.Model""" - # Test 1: Check the health endpoint - print("\n๐Ÿ” Testing /ping endpoint (health check)...") - try: - response = requests.get(f"{base_url}/ping", timeout=5) - if response.status_code == 200: - print("โœ… Health check succeeded") + def __init__(self, image_uri, model_data=None, role=None, **kwargs): + """ + Initialize a MockModel with parameters similar to a SageMaker Model. + + Args: + image_uri (str): The Docker image URI to use for inference + model_data (str): Path to model artifacts (S3 URI or local path) + role (str): AWS IAM role (not used in mock) + """ + self.image_uri = image_uri + self.model_data = model_data + self.role = role + self.kwargs = kwargs + self.temp_dir = None + self.container_id = None + self.endpoint_url = None + + def register(self, content_types=None, response_types=None, **kwargs): + """Mock model registration - just stores the parameters""" + self.content_types = content_types or ["application/json"] + self.response_types = response_types or ["application/json"] + for key, value in kwargs.items(): + setattr(self, key, value) + print(f"Mock registered model with content types: {self.content_types}") + return self + + def deploy(self, instance_type=None, initial_instance_count=1, endpoint_name=None): + """ + Deploy the model to a mock endpoint (local Docker container). 
+ + Args: + instance_type (str): SageMaker instance type (ignored) + initial_instance_count (int): Number of instances (ignored) + endpoint_name (str): Endpoint name for identification + + Returns: + MockEndpoint: The deployed endpoint + """ + print(f"Deploying model to endpoint: {endpoint_name or 'default-endpoint'}") + + # Create a temp directory for model data if not provided + if self.model_data is None: + self.temp_dir = tempfile.mkdtemp(prefix="sagemaker-inference-test-") + model_dir = self.temp_dir + + # Create a dummy model + print(f"Creating dummy model in {model_dir}") + import joblib + import xgboost as xgb + + # Train a simple model + model = xgb.XGBRegressor(objective='reg:squarederror') + X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + y = np.array([10, 20, 30]) + model.fit(X, y) + + # Save the model + joblib.dump(model, os.path.join(model_dir, 'model.joblib')) + + # Save metadata + with open(os.path.join(model_dir, 'metadata.json'), 'w') as f: + json.dump({ + 'feature_names': ['feature1', 'feature2', 'feature3'], + 'model_type': 'regression' + }, f) + + self.model_data = model_dir else: - print(f"โŒ Health check failed with status code: {response.status_code}") - return False - except requests.exceptions.RequestException as e: - print(f"โŒ Health check failed with error: {e}") - print("Is the Docker container running on the specified port?") + # Use provided model_data + model_dir = self.model_data + + # Start the container + cmd = [ + "docker", "run", "-d", "--rm", + "-p", "8080:8080", + "-v", f"{model_dir}:/opt/ml/model", + "-e", "MODEL_PATH=/opt/ml/model", + ] + + # Add platform flag for Mac M1/M2/M3 users + if os.uname().machine == 'arm64': + cmd.insert(2, "--platform") + cmd.insert(3, "linux/amd64") + + # Add the image URI + cmd.append(self.image_uri) + + print(f"Starting inference container: {' '.join(cmd)}") + self.container_id = subprocess.check_output(cmd).decode('utf-8').strip() + + print(f"Waiting for container to initialize...") + 
time.sleep(5) # Give it time to start + + self.endpoint_url = 'http://localhost:8080' + return MockEndpoint(self) + + +class MockEndpoint: + """Mock SageMaker Endpoint for local testing""" + + def __init__(self, model): + """Initialize with a reference to the model""" + self.model = model + self.url = model.endpoint_url + + def predict(self, data, initial_args=None): + """ + Makes a prediction using the deployed model. + + Args: + data: Input data in format matching content_types + initial_args: Additional arguments (ignored) + + Returns: + The prediction result + """ + # Default to first registered content type + content_type = self.model.content_types[0] if hasattr(self.model, 'content_types') else 'application/json' + + # Format the data according to content type + if content_type == 'text/csv': + if isinstance(data, pd.DataFrame): + payload = data.to_csv(header=False, index=False) + elif isinstance(data, (list, np.ndarray)): + payload = pd.DataFrame(data).to_csv(header=False, index=False) + else: + payload = str(data) + else: + # Default to JSON + if isinstance(data, pd.DataFrame): + payload = data.to_json(orient='records') + elif isinstance(data, (list, np.ndarray)): + payload = json.dumps({"instances": data.tolist() if hasattr(data, 'tolist') else data}) + else: + payload = json.dumps(data) + + # Send the request to the container + try: + response = requests.post( + f"{self.url}/invocations", + data=payload, + headers={"Content-Type": content_type} + ) + + # Check for errors + if response.status_code != 200: + raise Exception(f"Prediction failed with status code {response.status_code}: {response.text}") + + # Parse response based on response type + if hasattr(self.model, 'response_types') and 'text/csv' in self.model.response_types: + # Parse CSV response + return pd.read_csv(pd.StringIO(response.text), header=None) + else: + # Parse JSON response + return response.json() + + except Exception as e: + print(f"Error during prediction: {e}") + raise + + def 
delete_endpoint(self): + """Clean up resources by stopping the container""" + print(f"Deleting endpoint (stopping container {self.model.container_id})") + if self.model.container_id: + subprocess.run(["docker", "stop", self.model.container_id], check=True) + self.model.container_id = None + + # Clean up temp directory if needed + if self.model.temp_dir and os.path.exists(self.model.temp_dir): + print(f"Cleaning up temporary directory: {self.model.temp_dir}") + shutil.rmtree(self.model.temp_dir) + self.model.temp_dir = None + + +def test_csv_inference(endpoint, test_data=None): + """Test inference with CSV data""" + print("\nTesting CSV inference...") + + if test_data is None: + # Create sample test data + test_data = pd.DataFrame([ + [1.0, 2.0, 3.0], + [4.0, 5.0, 6.0] + ]) + + try: + response = endpoint.predict(test_data) + print(f"Prediction response: {response}") + print("โœ… CSV inference test successful") + return True + except Exception as e: + print(f"โŒ CSV inference test failed: {e}") return False - # Test 2: Test the invocations endpoint with simple data - print("\n๐Ÿ” Testing /invocations endpoint with sample data...") - sample_data = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]] + +def test_json_inference(endpoint, test_data=None): + """Test inference with JSON data""" + print("\nTesting JSON inference...") + + if test_data is None: + # Create sample test data + test_data = { + "instances": [ + [1.0, 2.0, 3.0], + [4.0, 5.0, 6.0] + ] + } try: - # Test with JSON data - response = requests.post( - f"{base_url}/invocations", - data=json.dumps(sample_data), - headers={"Content-Type": "application/json", "Accept": "application/json"}, - timeout=5 - ) + response = endpoint.predict(test_data) + print(f"Prediction response: {response}") + print("โœ… JSON inference test successful") + return True + except Exception as e: + print(f"โŒ JSON inference test failed: {e}") + return False + +def test_ping_endpoint(url): + """Test the /ping endpoint directly""" + print("\nTesting 
/ping endpoint...") + try: + response = requests.get(f"{url}/ping") + print(f"Response status: {response.status_code}") if response.status_code == 200: - print("โœ… Inference request succeeded") - try: - # Parse the JSON response - result = response.json() - print(f"๐Ÿ“Š Response: {result}") - return True - except json.JSONDecodeError as e: - print(f"โŒ Error parsing response as JSON: {e}") - print(f"Raw response: {response.text}") - # Try parsing as CSV - try: - lines = response.text.strip().split('\n') - values = [float(line) for line in lines] - print(f"๐Ÿ“Š CSV Response (converted): {values}") - return True - except Exception: - return False + print("โœ… Ping test successful") + return True else: - print(f"โŒ Inference request failed with status code: {response.status_code}") - print(f"Response text: {response.text}") + print(f"โŒ Ping test failed with status {response.status_code}") return False - except requests.exceptions.RequestException as e: - print(f"โŒ Inference request failed with error: {e}") + except Exception as e: + print(f"โŒ Ping test error: {e}") return False - print("\n๐ŸŽ‰ All tests passed! Your inference server is working correctly.") - return True +def main(): + """Run the test using MockModel and MockEndpoint""" + parser = argparse.ArgumentParser(description="Test SageMaker inference container") + parser.add_argument("--image", type=str, default="aws_model_inference:0.1", help="Inference image name:tag") + parser.add_argument("--model-dir", type=str, default=None, help="Path to model directory (optional)") + args = parser.parse_args() -def run_docker_command(): - """ - Print the docker run command to help the user start the container. 
- """ - print("\n๐Ÿ“‹ To run your Docker container, use the following command:") - print("docker run -p 8080:8080 aws_model_inference:latest") - print("\nThis maps port 8080 from the container to port 8080 on your host machine.") + print(f"Testing inference container {args.image}") + # Create the model and endpoint + model = None + endpoint = None + success = False -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Test the AWS model inference server") - parser.add_argument("--host", default="localhost", help="Host where the inference server is running") - parser.add_argument("--port", type=int, default=8080, help="Port where the inference server is running") - parser.add_argument("--docker-cmd", action="store_true", help="Print the docker run command") + try: + # Create and deploy the model + model = MockModel( + image_uri=args.image, + model_data=args.model_dir, + role="mock-role" + ) - args = parser.parse_args() + # Register the model + model.register( + content_types=["text/csv", "application/json"], + response_types=["text/csv", "application/json"], + inference_instances=["ml.t2.medium"], + transform_instances=["ml.m5.large"], + description="Test model" + ) + + # Deploy the model + endpoint = model.deploy( + instance_type="local", + initial_instance_count=1, + endpoint_name="test-endpoint" + ) - if args.docker_cmd: - run_docker_command() + # Test the /ping endpoint + ping_success = test_ping_endpoint(endpoint.url) - test_inference_server(args.host, args.port) + # Test predictions + csv_success = test_csv_inference(endpoint) + json_success = test_json_inference(endpoint) + + # Overall success + success = ping_success and csv_success and json_success + + if success: + print("\nโœ… All inference tests passed successfully!") + else: + print("\nโŒ Some inference tests failed!") + + except Exception as e: + print(f"\nโŒ Error during inference testing: {e}") + finally: + # Clean up resources + if endpoint: + endpoint.delete_endpoint() + + # 
Return appropriate exit code + return 0 if success else 1 + + +if __name__ == "__main__": + exit(main()) From 84378f5d94ee199859f351861cac426525b172ed Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sat, 1 Mar 2025 13:12:24 -0700 Subject: [PATCH 11/35] fixing StringIO imports --- model_docker_images/inference/main.py | 3 ++- model_docker_images/tests/test_inference.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/model_docker_images/inference/main.py b/model_docker_images/inference/main.py index 0430d93d6..c1b15ca11 100644 --- a/model_docker_images/inference/main.py +++ b/model_docker_images/inference/main.py @@ -3,6 +3,7 @@ import os import json import pandas as pd +from io import StringIO import joblib import logging @@ -85,7 +86,7 @@ async def invoke(request: Request): # Parse input data based on content type if 'text/csv' in content_type: s = body.decode('utf-8') - data = pd.read_csv(pd.StringIO(s), header=None) + data = pd.read_csv(StringIO(s), header=None) else: # Default to JSON json_str = body.decode('utf-8') data_json = json.loads(json_str) diff --git a/model_docker_images/tests/test_inference.py b/model_docker_images/tests/test_inference.py index 52461c847..de4d48e25 100644 --- a/model_docker_images/tests/test_inference.py +++ b/model_docker_images/tests/test_inference.py @@ -9,7 +9,7 @@ import requests import pandas as pd import numpy as np -from pathlib import Path +from io import StringIO class MockModel: @@ -166,7 +166,7 @@ def predict(self, data, initial_args=None): # Parse response based on response type if hasattr(self.model, 'response_types') and 'text/csv' in self.model.response_types: # Parse CSV response - return pd.read_csv(pd.StringIO(response.text), header=None) + return pd.read_csv(StringIO(response.text), header=None) else: # Parse JSON response return response.json() From 5c2d99a9d85f0805c75500743433b084e9358dae Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sat, 1 Mar 2025 13:15:20 -0700 Subject: [PATCH 12/35] 
improved json handling --- model_docker_images/inference/main.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/model_docker_images/inference/main.py b/model_docker_images/inference/main.py index c1b15ca11..1cb104f9c 100644 --- a/model_docker_images/inference/main.py +++ b/model_docker_images/inference/main.py @@ -90,7 +90,17 @@ async def invoke(request: Request): else: # Default to JSON json_str = body.decode('utf-8') data_json = json.loads(json_str) - data = pd.DataFrame(data_json) if not isinstance(data_json, pd.DataFrame) else data_json + + # Handle different JSON formats + if isinstance(data_json, dict) and "instances" in data_json: + # Format: {"instances": [[1,2,3], [4,5,6]]} + data = pd.DataFrame(data_json["instances"]) + elif isinstance(data_json, list) and all(isinstance(item, list) for item in data_json): + # Format: [[1,2,3], [4,5,6]] + data = pd.DataFrame(data_json) + else: + # Try to convert to DataFrame + data = pd.DataFrame(data_json) # Make prediction predictions = model.predict(data) From 24c34b2d6a807fc2a7c602f1832732cc3384ffee Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sat, 1 Mar 2025 13:16:15 -0700 Subject: [PATCH 13/35] change test data a bit --- model_docker_images/tests/test_inference.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/model_docker_images/tests/test_inference.py b/model_docker_images/tests/test_inference.py index de4d48e25..389e9c664 100644 --- a/model_docker_images/tests/test_inference.py +++ b/model_docker_images/tests/test_inference.py @@ -215,13 +215,11 @@ def test_json_inference(endpoint, test_data=None): print("\nTesting JSON inference...") if test_data is None: - # Create sample test data - test_data = { - "instances": [ - [1.0, 2.0, 3.0], - [4.0, 5.0, 6.0] - ] - } + # Create sample test data - use list of lists of floats + test_data = [ + [1.0, 2.0, 3.0], + [4.0, 5.0, 6.0] + ] try: response = endpoint.predict(test_data) From 
ad4a0636fbec475e0d9d709c38b5ab16d60f62a3 Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sat, 1 Mar 2025 13:40:10 -0700 Subject: [PATCH 14/35] changing repo naming --- model_docker_images/scripts/build_deploy.sh | 39 ++++++++++++--------- model_docker_images/tests/test_inference.py | 2 +- model_docker_images/tests/test_training.py | 2 +- 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/model_docker_images/scripts/build_deploy.sh b/model_docker_images/scripts/build_deploy.sh index 6ca52bdc2..d5829f7d5 100755 --- a/model_docker_images/scripts/build_deploy.sh +++ b/model_docker_images/scripts/build_deploy.sh @@ -6,11 +6,18 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)" # Get the parent directory (project root) PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" -# Configuration +# AWS Account ID +AWS_ACCOUNT_ID="507740646243" + +# Define repository names - used for both local and ECR images +TRAINING_REPO="aws-ml-images/py312-sklearn-xgb-training" +INFERENCE_REPO="aws-ml-images/py312-sklearn-xgb-inference" + +# Local directories TRAINING_DIR="$PROJECT_ROOT/training" INFERENCE_DIR="$PROJECT_ROOT/inference" -TRAINING_IMAGE="aws_model_training" -INFERENCE_IMAGE="aws_model_inference" + +# Image version IMAGE_VERSION=${1:-"0.1"} # Expect AWS_PROFILE to be set in the environment when deploying @@ -45,9 +52,9 @@ done # Function to build a Docker image build_image() { local dir=$1 - local image_name=$2 + local repo_name=$2 local tag=$3 - local full_name="${image_name}:${tag}" + local full_name="${repo_name}:${tag}" echo -e "${YELLOW}Building image: ${full_name}${NC}" @@ -67,20 +74,20 @@ build_image() { # Function to deploy an image to ECR deploy_image() { - local image_name=$1 + local repo_name=$1 local tag=$2 local use_latest=$3 - local full_name="${image_name}:${tag}" + local full_name="${repo_name}:${tag}" for REGION in "${REGION_LIST[@]}"; do echo "Processing region: ${REGION}" # Construct the ECR repository URL - 
ECR_REPO="507740646243.dkr.ecr.${REGION}.amazonaws.com/model_images/${image_name}" + ECR_REPO="${AWS_ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/${repo_name}" AWS_ECR_IMAGE="${ECR_REPO}:${tag}" echo "Logging in to AWS ECR in ${REGION}..." aws ecr get-login-password --region ${REGION} --profile ${AWS_PROFILE} | \ - docker login --username AWS --password-stdin ${ECR_REPO} + docker login --username AWS --password-stdin "${AWS_ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com" echo "Tagging image for AWS ECR as ${AWS_ECR_IMAGE}..." docker tag ${full_name} ${AWS_ECR_IMAGE} @@ -102,13 +109,13 @@ deploy_image() { echo "======================================" echo "๐Ÿ—๏ธ Building training container" echo "======================================" -build_image "$TRAINING_DIR" "$TRAINING_IMAGE" "$IMAGE_VERSION" +build_image "$TRAINING_DIR" "$TRAINING_REPO" "$IMAGE_VERSION" # Build inference image echo "======================================" echo "๐Ÿ—๏ธ Building inference container" echo "======================================" -build_image "$INFERENCE_DIR" "$INFERENCE_IMAGE" "$IMAGE_VERSION" +build_image "$INFERENCE_DIR" "$INFERENCE_REPO" "$IMAGE_VERSION" echo "======================================" echo -e "${GREEN}โœ… All builds completed successfully!${NC}" @@ -121,11 +128,11 @@ if [ "$DEPLOY" = true ]; then # Deploy training image echo "Deploying training image..." - deploy_image "$TRAINING_IMAGE" "$IMAGE_VERSION" "$LATEST" + deploy_image "$TRAINING_REPO" "$IMAGE_VERSION" "$LATEST" # Deploy inference image echo "Deploying inference image..." 
- deploy_image "$INFERENCE_IMAGE" "$IMAGE_VERSION" "$LATEST" + deploy_image "$INFERENCE_REPO" "$IMAGE_VERSION" "$LATEST" echo "======================================" echo -e "${GREEN}โœ… Deployment complete!${NC}" @@ -136,10 +143,10 @@ else # Print information about the built images echo "======================================" echo "๐Ÿ“‹ Image information:" - echo "Training image: ${TRAINING_IMAGE}:${IMAGE_VERSION}" - echo "Inference image: ${INFERENCE_IMAGE}:${IMAGE_VERSION}" + echo "Training image: ${TRAINING_REPO}:${IMAGE_VERSION}" + echo "Inference image: ${INFERENCE_REPO}:${IMAGE_VERSION}" echo "======================================" # Inform about testing option - echo "To test these containers, run: $PROJECT_ROOT/tests/scripts/run_tests.sh ${IMAGE_VERSION}" + echo "To test these containers, run: $PROJECT_ROOT/tests/run_tests.sh ${IMAGE_VERSION}" fi diff --git a/model_docker_images/tests/test_inference.py b/model_docker_images/tests/test_inference.py index 389e9c664..30262b4e2 100644 --- a/model_docker_images/tests/test_inference.py +++ b/model_docker_images/tests/test_inference.py @@ -251,7 +251,7 @@ def test_ping_endpoint(url): def main(): """Run the test using MockModel and MockEndpoint""" parser = argparse.ArgumentParser(description="Test SageMaker inference container") - parser.add_argument("--image", type=str, default="aws_model_inference:0.1", help="Inference image name:tag") + parser.add_argument("--image", type=str, default="aws-ml-images/py312-sklearn-xgb-inference:0.1", help="Inference image name:tag") parser.add_argument("--model-dir", type=str, default=None, help="Path to model directory (optional)") args = parser.parse_args() diff --git a/model_docker_images/tests/test_training.py b/model_docker_images/tests/test_training.py index c7f3b8593..74562bf03 100644 --- a/model_docker_images/tests/test_training.py +++ b/model_docker_images/tests/test_training.py @@ -119,7 +119,7 @@ def cleanup(self): def main(): """Run the test using a 
MockEstimator""" parser = argparse.ArgumentParser(description="Test SageMaker training container") - parser.add_argument("--image", type=str, default="aws_model_training:0.1", help="Training image name:tag") + parser.add_argument("--image", type=str, default="aws-ml-images/py312-sklearn-xgb-training:0.1", help="Training image name:tag") parser.add_argument("--entry-point", type=str, default="example_model_script.py", help="Training script name") parser.add_argument("--source-dir", type=str, default="tests/", help="Directory containing training scripts") parser.add_argument("--data", type=str, default="tests/data/abalone_sm.csv", help="Training data path") From 091fc3d37b8631d9568c773922879fa124460692 Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sat, 1 Mar 2025 13:51:31 -0700 Subject: [PATCH 15/35] changing InferenceImage to ModelImages --- src/workbench/core/artifacts/model_core.py | 23 +++++++++++++++---- .../features_to_model/features_to_model.py | 6 ++--- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/src/workbench/core/artifacts/model_core.py b/src/workbench/core/artifacts/model_core.py index fcbe791a7..710e26b4c 100644 --- a/src/workbench/core/artifacts/model_core.py +++ b/src/workbench/core/artifacts/model_core.py @@ -35,10 +35,24 @@ class ModelType(Enum): UNKNOWN = "unknown" -class InferenceImage: +class ModelImages: """Class for retrieving locked Scikit-Learn inference images""" image_uris = { + ("us-east-1", "training", "0.1"): ( + "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-training:0.1" + ), + ("us-east-1", "inference", "0.1"): ( + "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1" + ), + ("us-west-2", "training", "0.1"): ( + "174872318107.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-training:0.1" + ), + ("us-west-2", "inference", "0.1"): ( + "174872318107.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1" + ), + 
+ # These are the OLD locked SKLearn images ("us-east-1", "sklearn", "1.2.1"): ( "683313688378.dkr.ecr.us-east-1.amazonaws.com/" "sagemaker-scikit-learn@sha256:ed242e33af079f334972acd2a7ddf74d13310d3c9a0ef3a0e9b0429ccc104dcd" @@ -55,16 +69,17 @@ class InferenceImage: "246618743249.dkr.ecr.us-west-2.amazonaws.com/" "sagemaker-scikit-learn@sha256:ed242e33af079f334972acd2a7ddf74d13310d3c9a0ef3a0e9b0429ccc104dcd" ), + } @classmethod - def get_image_uri(cls, region, framework, version): - key = (region, framework, version) + def get_image_uri(cls, region, image_type="training", version="0.1"): + key = (region, image_type, version) if key in cls.image_uris: return cls.image_uris[key] else: raise ValueError( - f"No matching image found for region: {region}, framework: {framework}, version: {version}" + f"No matching image found for region: {region}, image_type: {image_type}, version: {version}" ) diff --git a/src/workbench/core/transforms/features_to_model/features_to_model.py b/src/workbench/core/transforms/features_to_model/features_to_model.py index 0fdc1c64d..fe95799e0 100644 --- a/src/workbench/core/transforms/features_to_model/features_to_model.py +++ b/src/workbench/core/transforms/features_to_model/features_to_model.py @@ -8,7 +8,7 @@ # Local Imports from workbench.core.transforms.transform import Transform, TransformInput, TransformOutput from workbench.core.artifacts.feature_set_core import FeatureSetCore -from workbench.core.artifacts.model_core import ModelCore, ModelType, InferenceImage +from workbench.core.artifacts.model_core import ModelCore, ModelType, ModelImages from workbench.core.artifacts.artifact import Artifact from workbench.model_scripts.script_generation import generate_model_script from workbench.utils.model_utils import supported_instance_types @@ -208,7 +208,7 @@ def transform_impl( source_dir = str(Path(script_path).parent) # Create a Sagemaker Model with our script - image = InferenceImage.get_image_uri(self.sm_session.boto_region_name, 
"sklearn", "1.2.1") + image = ModelImages.get_image_uri(self.sm_session.boto_region_name, "sklearn", "1.2.1") self.estimator = SKLearn( entry_point=entry_point, source_dir=source_dir, @@ -268,7 +268,7 @@ def create_and_register_model(self): ) # Register our model - image = InferenceImage.get_image_uri(self.sm_session.boto_region_name, "sklearn", "1.2.1") + image = ModelImages.get_image_uri(self.sm_session.boto_region_name, "sklearn", "1.2.1") self.log.important(f"Registering model {self.output_uuid} with image {image}...") model = self.estimator.create_model(role=self.workbench_role_arn) model.register( From 4b837cfdc7d4cb2694bbc12e6578a9a35c094e85 Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sat, 1 Mar 2025 14:10:46 -0700 Subject: [PATCH 16/35] using new model images --- src/workbench/core/artifacts/model_core.py | 4 ++-- .../features_to_model/features_to_model.py | 12 ++++++------ .../model_scripts/light_xgb_model/requirements.txt | 5 +++-- .../model_scripts/light_xgb_model/xgb_model.template | 4 ++-- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/workbench/core/artifacts/model_core.py b/src/workbench/core/artifacts/model_core.py index 710e26b4c..50779831a 100644 --- a/src/workbench/core/artifacts/model_core.py +++ b/src/workbench/core/artifacts/model_core.py @@ -46,10 +46,10 @@ class ModelImages: "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1" ), ("us-west-2", "training", "0.1"): ( - "174872318107.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-training:0.1" + "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-training:0.1" ), ("us-west-2", "inference", "0.1"): ( - "174872318107.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1" + "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1" ), # These are the OLD locked SKLearn images diff --git 
a/src/workbench/core/transforms/features_to_model/features_to_model.py b/src/workbench/core/transforms/features_to_model/features_to_model.py index fe95799e0..e5916994f 100644 --- a/src/workbench/core/transforms/features_to_model/features_to_model.py +++ b/src/workbench/core/transforms/features_to_model/features_to_model.py @@ -1,7 +1,7 @@ """FeaturesToModel: Train/Create a Model from a Feature Set""" from pathlib import Path -from sagemaker.sklearn.estimator import SKLearn +from sagemaker.estimator import Estimator import awswrangler as wr from datetime import datetime, timezone @@ -111,6 +111,7 @@ def transform_impl( all_columns = feature_set.columns filter_list = [ "id", + "auto_id", "__index_level_0__", "write_time", "api_invocation_time", @@ -208,14 +209,14 @@ def transform_impl( source_dir = str(Path(script_path).parent) # Create a Sagemaker Model with our script - image = ModelImages.get_image_uri(self.sm_session.boto_region_name, "sklearn", "1.2.1") - self.estimator = SKLearn( + image = ModelImages.get_image_uri(self.sm_session.boto_region_name, "training", "0.1") + self.estimator = Estimator( entry_point=entry_point, source_dir=source_dir, role=self.workbench_role_arn, + instance_count=1, instance_type="ml.m5.large", sagemaker_session=self.sm_session, - framework_version="1.2-1", image_uri=image, metric_definitions=metric_definitions, ) @@ -268,12 +269,11 @@ def create_and_register_model(self): ) # Register our model - image = ModelImages.get_image_uri(self.sm_session.boto_region_name, "sklearn", "1.2.1") + image = ModelImages.get_image_uri(self.sm_session.boto_region_name, "inference", "0.1") self.log.important(f"Registering model {self.output_uuid} with image {image}...") model = self.estimator.create_model(role=self.workbench_role_arn) model.register( model_package_group_name=self.output_uuid, - framework_version="1.2.1", image_uri=image, content_types=["text/csv"], response_types=["text/csv"], diff --git 
a/src/workbench/model_scripts/light_xgb_model/requirements.txt b/src/workbench/model_scripts/light_xgb_model/requirements.txt index 25a034855..7ff58e74d 100644 --- a/src/workbench/model_scripts/light_xgb_model/requirements.txt +++ b/src/workbench/model_scripts/light_xgb_model/requirements.txt @@ -1,2 +1,3 @@ -xgboost==2.0.3 -awswrangler==3.8.0 \ No newline at end of file +xgboost-cpu==2.1.4 +pandas==2.2.3 +awswrangler==3.11.0 \ No newline at end of file diff --git a/src/workbench/model_scripts/light_xgb_model/xgb_model.template b/src/workbench/model_scripts/light_xgb_model/xgb_model.template index a534b2164..9c53a4d90 100644 --- a/src/workbench/model_scripts/light_xgb_model/xgb_model.template +++ b/src/workbench/model_scripts/light_xgb_model/xgb_model.template @@ -15,7 +15,7 @@ import awswrangler as wr from sklearn.metrics import ( mean_absolute_error, r2_score, - mean_squared_error, + root_mean_squared_error, precision_recall_fscore_support, confusion_matrix, ) @@ -261,7 +261,7 @@ if __name__ == "__main__": else: # Calculate various model performance metrics (regression) - rmse = mean_squared_error(df_val[target], preds, squared=False) + rmse = root_mean_squared_error(df_val[target], preds) mae = mean_absolute_error(df_val[target], preds) r2 = r2_score(df_val[target], preds) print(f"RMSE: {rmse:.3f}") From 2a80c66a4cdaf6b74f7539a18f9d35ac14935b61 Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sat, 1 Mar 2025 14:13:47 -0700 Subject: [PATCH 17/35] unlocking scikit-learn version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ef8494177..85085806e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ dependencies = [ "cryptography >= 42.0.5", "ipython >= 8.17.2", "pyreadline3; sys_platform == 'win32'", - "scikit-learn >=1.4.2, <= 1.5.2", + "scikit-learn >=1.5.2", "joblib >= 1.3.2", "requests >= 2.26.0", "rdkit>=2024.3.2", From 1f6299cd6e8b485d6ed55e1536243c15c3047ca2 Mon Sep 17 
00:00:00 2001 From: Brian Wylie Date: Sat, 1 Mar 2025 15:00:48 -0700 Subject: [PATCH 18/35] switching over to 'serve' script --- model_docker_images/inference/Dockerfile | 20 +++++++++++++++----- model_docker_images/inference/serve | 6 ++++++ 2 files changed, 21 insertions(+), 5 deletions(-) create mode 100644 model_docker_images/inference/serve diff --git a/model_docker_images/inference/Dockerfile b/model_docker_images/inference/Dockerfile index a09da2460..6433484bd 100644 --- a/model_docker_images/inference/Dockerfile +++ b/model_docker_images/inference/Dockerfile @@ -9,9 +9,19 @@ COPY requirements.txt /tmp/ # Install dependencies RUN pip install --no-cache-dir -r /tmp/requirements.txt -# Copy your server code -COPY main.py /app/ -WORKDIR /app +# Add the serve script +COPY serve /usr/local/bin/ +RUN chmod +x /usr/local/bin/serve -# Run the API server -CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"] \ No newline at end of file +# Copy the main.py/entrypoint script +COPY main.py /opt/program/ +WORKDIR /opt/program + +# Make port 8080 available for the web server +EXPOSE 8080 + +# Define environment variable +ENV PYTHONUNBUFFERED=TRUE + +# SageMaker will look for this +CMD ["serve"] \ No newline at end of file diff --git a/model_docker_images/inference/serve b/model_docker_images/inference/serve new file mode 100644 index 000000000..93d3d58fd --- /dev/null +++ b/model_docker_images/inference/serve @@ -0,0 +1,6 @@ +#!/bin/bash + +# SageMaker expect a 'serve' script to be found in the container which starts the model server. 
+ +# Start the FastAPI server using Uvicorn +exec uvicorn main:app --host 0.0.0.0 --port 8080 \ No newline at end of file From 1cb2186f58dc17f204f26ea91959cde11f792f0e Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sat, 1 Mar 2025 15:01:30 -0700 Subject: [PATCH 19/35] cleaning up the requirements.txt files for models since our new training/inference images include these packages --- .../model_scripts/custom_models/chem_info/requirements.txt | 4 +--- .../model_scripts/custom_script_example/requirements.txt | 2 -- .../model_scripts/light_quant_regression/requirements.txt | 2 -- .../model_scripts/light_scikit_learn/requirements.txt | 4 +--- 4 files changed, 2 insertions(+), 10 deletions(-) diff --git a/src/workbench/model_scripts/custom_models/chem_info/requirements.txt b/src/workbench/model_scripts/custom_models/chem_info/requirements.txt index 33ff11c23..68cb66c0f 100644 --- a/src/workbench/model_scripts/custom_models/chem_info/requirements.txt +++ b/src/workbench/model_scripts/custom_models/chem_info/requirements.txt @@ -1,4 +1,2 @@ -scikit-learn==1.3.2 # Note: This is the highest version that works with SageMaker/scikit-learn framework -awswrangler>=3.8.0 -rdkit>=2024.3.2 +rdkit>=2024.9.5 mordredcommunity>=2.0.6 \ No newline at end of file diff --git a/src/workbench/model_scripts/custom_script_example/requirements.txt b/src/workbench/model_scripts/custom_script_example/requirements.txt index 2b1dd27fd..e69de29bb 100644 --- a/src/workbench/model_scripts/custom_script_example/requirements.txt +++ b/src/workbench/model_scripts/custom_script_example/requirements.txt @@ -1,2 +0,0 @@ -scikit-learn==1.3.2 # Note: This is the highest version that works with SageMaker/scikit-learn framework -awswrangler>=3.8.0 diff --git a/src/workbench/model_scripts/light_quant_regression/requirements.txt b/src/workbench/model_scripts/light_quant_regression/requirements.txt index 25a034855..e69de29bb 100644 --- a/src/workbench/model_scripts/light_quant_regression/requirements.txt +++
b/src/workbench/model_scripts/light_quant_regression/requirements.txt @@ -1,2 +0,0 @@ -xgboost==2.0.3 -awswrangler==3.8.0 \ No newline at end of file diff --git a/src/workbench/model_scripts/light_scikit_learn/requirements.txt b/src/workbench/model_scripts/light_scikit_learn/requirements.txt index 2a1bb2a2a..cf1b0394e 100644 --- a/src/workbench/model_scripts/light_scikit_learn/requirements.txt +++ b/src/workbench/model_scripts/light_scikit_learn/requirements.txt @@ -1,3 +1 @@ -scikit-learn==1.3.2 # Note: This is the highest version that works with SageMaker/scikit-learn framework -umap-learn -awswrangler>=3.8.0 \ No newline at end of file +umap-learn \ No newline at end of file From 54e5eb3cda89e5d609d3f8773877737411544298 Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sat, 1 Mar 2025 15:50:00 -0700 Subject: [PATCH 20/35] making the serve script executable --- model_docker_images/inference/serve | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 model_docker_images/inference/serve diff --git a/model_docker_images/inference/serve b/model_docker_images/inference/serve old mode 100644 new mode 100755 From c2f8a37f10083864dd9ec0787d89c77efe71095e Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sun, 2 Mar 2025 08:40:42 -0700 Subject: [PATCH 21/35] refactoring the training and inference containers --- model_docker_images/inference/main.py | 189 ++++++++++-------- model_docker_images/tests/test_inference.py | 41 +++- .../training/sagemaker_entrypoint.py | 58 +++--- 3 files changed, 171 insertions(+), 117 deletions(-) diff --git a/model_docker_images/inference/main.py b/model_docker_images/inference/main.py index 1cb104f9c..5e74ad277 100644 --- a/model_docker_images/inference/main.py +++ b/model_docker_images/inference/main.py @@ -1,124 +1,145 @@ from fastapi import FastAPI, Request, Response from contextlib import asynccontextmanager import os +import sys import json -import pandas as pd -from io import StringIO -import joblib +import 
importlib.util import logging # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -# Global variables for model and metadata +# Global variables model = None -model_metadata = None +inference_module = None + + +def get_inference_script(): + """Retrieve the entry point script name for SageMaker inference.""" + # Check SAGEMAKER_PROGRAM first + if "SAGEMAKER_PROGRAM" in os.environ: + return os.environ["SAGEMAKER_PROGRAM"] + + # For inference containers, check these common locations + model_server_config = "/opt/ml/model/model-config.json" + if os.path.exists(model_server_config): + try: + with open(model_server_config, "r") as f: + config = json.load(f) + if "inference_script" in config: + return config["inference_script"] + except Exception as e: + print(f"Error reading model-config.json: {e}") + + # Debug available environment variables + print("Available environment variables:") + for key in os.environ: + print(f" {key}: {os.environ[key]}") + + # Recursively list out all files in /opt/ml + print("Contents of /opt/ml:") + for root, dirs, files in os.walk("/opt/ml"): + for file in files: + print(f" {root}/{file}") + + +def get_model_script(): + """Retrieve the SAGEMAKER_PROGRAM from environment variable or hyperparameters.json.""" + if "SAGEMAKER_PROGRAM" in os.environ: + return os.environ["SAGEMAKER_PROGRAM"] + + # Look for hyperparameters.json + hyperparams_path = "/opt/ml/input/config/hyperparameters.json" + if os.path.exists(hyperparams_path): + try: + with open(hyperparams_path, "r") as f: + hyperparams = json.load(f) + if "sagemaker_program" in hyperparams: + return hyperparams["sagemaker_program"] + except Exception as e: + print(f"Error reading hyperparameters.json: {e}") + + # If no program is found, raise an error + raise ValueError("SAGEMAKER_PROGRAM not found in environment variables or hyperparameters.json") @asynccontextmanager async def lifespan(app: FastAPI): """Handle model loading on startup and cleanup on 
shutdown.""" - global model, model_metadata - model_path = os.environ.get('MODEL_PATH', '/opt/ml/model') - model_file = os.path.join(model_path, 'model.joblib') + global model, inference_module + + model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model") + code_dir = os.environ.get("SM_MODULE_DIR", "/opt/ml/code") + + # Add code_dir to sys.path so that any local utilities can be imported + if code_dir not in sys.path: + sys.path.insert(0, code_dir) + model_script = get_inference_script() try: - logger.info(f"Loading model from {model_path}") - - # Check if model file exists - if os.path.exists(model_file): - model = joblib.load(model_file) - logger.info(f"Model loaded successfully: {type(model)}") - else: - # Log the error and available files - logger.error(f"Model file not found at {model_file}") - if os.path.exists(model_path): - logger.error(f"Contents of {model_path}: {os.listdir(model_path)}") - else: - logger.error(f"Model directory {model_path} does not exist") - - # Fail fast - no fallback for production - raise FileNotFoundError(f"Required model file not found: {model_file}") - - # Load metadata if available - metadata_file = os.path.join(model_path, 'metadata.json') - if os.path.exists(metadata_file): - with open(metadata_file, 'r') as f: - model_metadata = json.load(f) - logger.info(f"Loaded model metadata") - else: - logger.info(f"No metadata found, using default") - model_metadata = {'feature_names': None} + logger.info(f"Loading model from {model_dir}") + logger.info(f"Loading inference code from {code_dir}") + + # Ensure directories exist + if not os.path.exists(model_dir): + raise FileNotFoundError(f"Model directory not found: {model_dir}") + if not os.path.exists(code_dir): + raise FileNotFoundError(f"Code directory not found: {code_dir}") + + # List directory contents for debugging + logger.info(f"Contents of {model_dir}: {os.listdir(model_dir)}") + logger.info(f"Contents of {code_dir}: {os.listdir(code_dir)}") + + # Load the inference module 
from source_dir + entry_point_path = os.path.join(code_dir, model_script) + if not os.path.exists(entry_point_path): + raise FileNotFoundError(f"Entry point script {model_script} not found in {code_dir}") + + logger.info(f"Importing inference module from {entry_point_path}") + spec = importlib.util.spec_from_file_location("inference_module", entry_point_path) + inference_module = importlib.util.module_from_spec(spec) + sys.modules["inference_module"] = inference_module + spec.loader.exec_module(inference_module) + + if not hasattr(inference_module, "model_fn"): + raise ImportError(f"Inference module {model_script} does not define model_fn") + + # Load the model using model_fn + logger.info("Calling model_fn to load the model") + model = inference_module.model_fn(model_dir) + logger.info(f"Model loaded successfully: {type(model)}") except Exception as e: - logger.error(f"Error loading model: {e}", exc_info=True) - # In production, we don't want to create fallback models - # Let the container fail to start + logger.error(f"Error initializing model: {e}", exc_info=True) raise - logger.info("Model initialization complete") yield + logger.info("Shutting down model server") app = FastAPI(lifespan=lifespan) -@app.get('/ping') +@app.get("/ping") def ping(): """Health check endpoint for SageMaker.""" - if model is not None: - return Response(status_code=200) - return Response(status_code=404) + return Response(status_code=200 if model else 404) -@app.post('/invocations') +@app.post("/invocations") async def invoke(request: Request): """Inference endpoint for SageMaker.""" - content_type = request.headers.get('Content-Type', '') - accept_type = request.headers.get('Accept', '') + content_type = request.headers.get("Content-Type", "") + accept_type = request.headers.get("Accept", "") try: - # Get request body body = await request.body() - - # Parse input data based on content type - if 'text/csv' in content_type: - s = body.decode('utf-8') - data = pd.read_csv(StringIO(s), 
header=None) - else: # Default to JSON - json_str = body.decode('utf-8') - data_json = json.loads(json_str) - - # Handle different JSON formats - if isinstance(data_json, dict) and "instances" in data_json: - # Format: {"instances": [[1,2,3], [4,5,6]]} - data = pd.DataFrame(data_json["instances"]) - elif isinstance(data_json, list) and all(isinstance(item, list) for item in data_json): - # Format: [[1,2,3], [4,5,6]] - data = pd.DataFrame(data_json) - else: - # Try to convert to DataFrame - data = pd.DataFrame(data_json) - - # Make prediction - predictions = model.predict(data) - - # Format response based on accept type - if 'text/csv' in accept_type: - result = pd.DataFrame(predictions).to_csv(header=False, index=False) - return Response(content=result, media_type='text/csv') - else: # Default to JSON - result = json.dumps({ - 'predictions': predictions.tolist() if hasattr(predictions, 'tolist') else float(predictions) - }) - return Response(content=result, media_type='application/json') - + data = inference_module.input_fn(body, content_type) + result = inference_module.predict_fn(data, model) + output_data, output_content_type = inference_module.output_fn(result, accept_type) + return Response(content=output_data, media_type=output_content_type) except Exception as e: logger.error(f"Error during inference: {e}", exc_info=True) - return Response( - content=json.dumps({"error": str(e)}), - status_code=500, - media_type="application/json" - ) + return Response(content=json.dumps({"error": str(e)}), status_code=500, media_type="application/json") diff --git a/model_docker_images/tests/test_inference.py b/model_docker_images/tests/test_inference.py index 30262b4e2..fdef54fd9 100644 --- a/model_docker_images/tests/test_inference.py +++ b/model_docker_images/tests/test_inference.py @@ -101,12 +101,22 @@ def deploy(self, instance_type=None, initial_instance_count=1, endpoint_name=Non # Add the image URI cmd.append(self.image_uri) - print(f"Starting inference container: 
{' '.join(cmd)}") self.container_id = subprocess.check_output(cmd).decode('utf-8').strip() - print(f"Waiting for container to initialize...") - time.sleep(5) # Give it time to start + # Add this block immediately after starting the container + print(f"Container ID: {self.container_id}") + try: + # Give it a moment to start or fail + time.sleep(1) + + # Get container logs + logs = subprocess.check_output( + ["docker", "logs", self.container_id], stderr=subprocess.STDOUT + ).decode('utf-8') + print(f"Container startup logs:\n{logs}") + except Exception as e: + print(f"Error getting container logs: {e}") self.endpoint_url = 'http://localhost:8080' return MockEndpoint(self) @@ -120,6 +130,25 @@ def __init__(self, model): self.model = model self.url = model.endpoint_url + # Check container status and logs + try: + # Get container state + inspect_output = subprocess.check_output( + ["docker", "inspect", "--format", "{{.State.Status}}", model.container_id] + ).decode('utf-8').strip() + + print(f"Container status: {inspect_output}") + + # If not running, get the logs + if inspect_output != "running": + logs = subprocess.check_output( + ["docker", "logs", model.container_id], stderr=subprocess.STDOUT + ).decode('utf-8') + print(f"Container logs:\n{logs}") + raise RuntimeError("Container failed to start properly") + except Exception as e: + print(f"Error checking container: {e}") + def predict(self, data, initial_args=None): """ Makes a prediction using the deployed model. 
@@ -179,8 +208,10 @@ def delete_endpoint(self): """Clean up resources by stopping the container""" print(f"Deleting endpoint (stopping container {self.model.container_id})") if self.model.container_id: - subprocess.run(["docker", "stop", self.model.container_id], check=True) - self.model.container_id = None + try: + subprocess.run(["docker", "stop", self.model.container_id], check=False) + except Exception as e: + print(f"Error stopping container: {e}") # Clean up temp directory if needed if self.model.temp_dir and os.path.exists(self.model.temp_dir): diff --git a/model_docker_images/training/sagemaker_entrypoint.py b/model_docker_images/training/sagemaker_entrypoint.py index 807a82ee3..21bd7f919 100644 --- a/model_docker_images/training/sagemaker_entrypoint.py +++ b/model_docker_images/training/sagemaker_entrypoint.py @@ -9,11 +9,8 @@ from urllib.parse import urlparse # Set up logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) logger = logging.getLogger('sagemaker-entry-point') +logger.setLevel(logging.INFO) def download_and_extract_s3(s3_uri, target_dir="/opt/ml/code"): @@ -31,8 +28,7 @@ def download_and_extract_s3(s3_uri, target_dir="/opt/ml/code"): os.makedirs(target_dir, exist_ok=True) with tarfile.open(local_tar, "r:gz") as tar: - tar.extractall(path=target_dir) - + tar.extractall(path=target_dir, numeric_owner=True) return target_dir except Exception as e: logger.error(f"Error downloading from S3: {e}") @@ -74,7 +70,18 @@ def setup_environment(): def main(): - logger.info("Starting SageMaker container entry point") + logger.info("Starting Workbench training container...") + + # Debug available environment variables + logger.info("Available environment variables:") + for key in os.environ: + logger.info(f" {key}: {os.environ[key]}") + + # Recursively list out all files in /opt/ml + logger.info("Contents of /opt/ml:") + for root, dirs, files in os.walk("/opt/ml"): + for file in files: + 
logger.info(f" {root}/{file}") # Load hyperparameters hyperparams_path = '/opt/ml/input/config/hyperparameters.json' @@ -84,46 +91,41 @@ def main(): with open(hyperparams_path, 'r') as f: hyperparams = json.load(f) + logger.info(f"Hyperparameters: {hyperparams}") - # Get program name from hyperparameters or environment + # Get program name from hyperparameters if 'sagemaker_program' in hyperparams: - program = hyperparams['sagemaker_program'].strip('"\'') - os.environ['SAGEMAKER_PROGRAM'] = program - elif 'SAGEMAKER_PROGRAM' in os.environ: - program = os.environ['SAGEMAKER_PROGRAM'] + training_script = hyperparams['sagemaker_program'].strip('"\'') else: - logger.error("sagemaker_program not found in hyperparameters or environment!") + logger.error("sagemaker_program not found in hyperparameters!") sys.exit(1) - logger.info(f"Using program: {program}") + logger.info(f"Using training_script: {training_script}") - # Get source directory - submit_dir = "/opt/ml/code" + # Get source directory from hyperparameters if 'sagemaker_submit_directory' in hyperparams: - submit_dir_value = hyperparams['sagemaker_submit_directory'].strip('"\'') + code_directory = hyperparams['sagemaker_submit_directory'].strip('"\'') # Handle S3 vs local path - if submit_dir_value.startswith('s3://'): - submit_dir = download_and_extract_s3(submit_dir_value) - else: - submit_dir = submit_dir_value - if not os.path.exists(submit_dir): - logger.error(f"Local directory not found: {submit_dir}") - sys.exit(1) + if code_directory.startswith('s3://'): + code_directory = download_and_extract_s3(code_directory) + elif not os.path.exists(code_directory): + logger.error(f"Local code directory not found: {code_directory}") + sys.exit(1) # Install requirements if present - install_requirements(os.path.join(submit_dir, "requirements.txt")) + install_requirements(os.path.join(code_directory, "requirements.txt")) # Set up environment variables setup_environment() - # Find entry point script - entry_point = 
os.path.join(submit_dir, program) + # Find training script (entry point) + entry_point = os.path.join(code_directory, training_script) if not os.path.exists(entry_point): logger.error(f"Entry point not found: {entry_point}") sys.exit(1) - logger.info(f"Executing: {program}") + logger.info(f"Executing: {entry_point}") # Execute the training script with SageMaker arguments cmd = [ From 5f230e38e0d3c6604068771d57d48f49ab10dad4 Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sun, 2 Mar 2025 10:22:04 -0700 Subject: [PATCH 22/35] simplifying the inference entry point --- model_docker_images/inference/main.py | 106 ++++++++------------------ 1 file changed, 33 insertions(+), 73 deletions(-) diff --git a/model_docker_images/inference/main.py b/model_docker_images/inference/main.py index 5e74ad277..e40ab5a36 100644 --- a/model_docker_images/inference/main.py +++ b/model_docker_images/inference/main.py @@ -15,53 +15,21 @@ inference_module = None -def get_inference_script(): - """Retrieve the entry point script name for SageMaker inference.""" - # Check SAGEMAKER_PROGRAM first - if "SAGEMAKER_PROGRAM" in os.environ: - return os.environ["SAGEMAKER_PROGRAM"] - - # For inference containers, check these common locations - model_server_config = "/opt/ml/model/model-config.json" - if os.path.exists(model_server_config): - try: - with open(model_server_config, "r") as f: - config = json.load(f) - if "inference_script" in config: - return config["inference_script"] - except Exception as e: - print(f"Error reading model-config.json: {e}") - - # Debug available environment variables - print("Available environment variables:") - for key in os.environ: - print(f" {key}: {os.environ[key]}") - - # Recursively list out all files in /opt/ml - print("Contents of /opt/ml:") - for root, dirs, files in os.walk("/opt/ml"): - for file in files: - print(f" {root}/{file}") - - -def get_model_script(): - """Retrieve the SAGEMAKER_PROGRAM from environment variable or hyperparameters.json.""" - if 
"SAGEMAKER_PROGRAM" in os.environ: - return os.environ["SAGEMAKER_PROGRAM"] - - # Look for hyperparameters.json - hyperparams_path = "/opt/ml/input/config/hyperparameters.json" - if os.path.exists(hyperparams_path): - try: - with open(hyperparams_path, "r") as f: - hyperparams = json.load(f) - if "sagemaker_program" in hyperparams: - return hyperparams["sagemaker_program"] - except Exception as e: - print(f"Error reading hyperparameters.json: {e}") - - # If no program is found, raise an error - raise ValueError("SAGEMAKER_PROGRAM not found in environment variables or hyperparameters.json") +def get_inference_script(model_dir: str) -> str: + """Retrieve the inference script name + + Args: + model_dir (str): The directory containing the model artifacts + + Returns: + str: The name of the inference script + """ + + # Get the path to the inference-metadata.json file + inference_meta_path = os.path.join(model_dir, "inference-metadata.json") + with open(inference_meta_path, "r") as f: + config = json.load(f) + return config["inference_script"] @asynccontextmanager @@ -69,41 +37,33 @@ async def lifespan(app: FastAPI): """Handle model loading on startup and cleanup on shutdown.""" global model, inference_module + # Note: SageMaker will put model.tar.gz in /opt/ml/model + # which includes the model artifacts and inference code model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model") - code_dir = os.environ.get("SM_MODULE_DIR", "/opt/ml/code") + inference_script = get_inference_script(model_dir) - # Add code_dir to sys.path so that any local utilities can be imported - if code_dir not in sys.path: - sys.path.insert(0, code_dir) - model_script = get_inference_script() + # List directory contents for debugging + logger.info(f"Contents of {model_dir}: {os.listdir(model_dir)}") try: - logger.info(f"Loading model from {model_dir}") - logger.info(f"Loading inference code from {code_dir}") - - # Ensure directories exist - if not os.path.exists(model_dir): - raise 
FileNotFoundError(f"Model directory not found: {model_dir}") - if not os.path.exists(code_dir): - raise FileNotFoundError(f"Code directory not found: {code_dir}") - - # List directory contents for debugging - logger.info(f"Contents of {model_dir}: {os.listdir(model_dir)}") - logger.info(f"Contents of {code_dir}: {os.listdir(code_dir)}") - - # Load the inference module from source_dir - entry_point_path = os.path.join(code_dir, model_script) - if not os.path.exists(entry_point_path): - raise FileNotFoundError(f"Entry point script {model_script} not found in {code_dir}") - - logger.info(f"Importing inference module from {entry_point_path}") - spec = importlib.util.spec_from_file_location("inference_module", entry_point_path) + # Load the inference script from source_dir + inference_script_path = os.path.join(model_dir, inference_script) + if not os.path.exists(inference_script_path): + raise FileNotFoundError(f"Inference script not found: {inference_script_path}") + + # Add the code directory to the Python path + os.environ["PYTHONPATH"] = f"{model_dir}:{os.environ.get('PYTHONPATH', '')}" + + # Import the inference module + logger.info(f"Importing inference module from {inference_script_path}") + spec = importlib.util.spec_from_file_location("inference_module", inference_script_path) inference_module = importlib.util.module_from_spec(spec) sys.modules["inference_module"] = inference_module spec.loader.exec_module(inference_module) + # Check if model_fn is defined if not hasattr(inference_module, "model_fn"): - raise ImportError(f"Inference module {model_script} does not define model_fn") + raise ImportError(f"Inference module {inference_script_path} does not define model_fn") # Load the model using model_fn logger.info("Calling model_fn to load the model") From cf008ecbafa694a207dff0da264e6d48af30dc4c Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sun, 2 Mar 2025 10:23:37 -0700 Subject: [PATCH 23/35] adding code and metadata to model dir (for pick up by inference 
container) --- .../training/sagemaker_entrypoint.py | 85 +++++++++---------- 1 file changed, 40 insertions(+), 45 deletions(-) diff --git a/model_docker_images/training/sagemaker_entrypoint.py b/model_docker_images/training/sagemaker_entrypoint.py index 21bd7f919..19371b01b 100644 --- a/model_docker_images/training/sagemaker_entrypoint.py +++ b/model_docker_images/training/sagemaker_entrypoint.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import os import sys +import shutil import json import tarfile import subprocess @@ -9,8 +10,8 @@ from urllib.parse import urlparse # Set up logging -logger = logging.getLogger('sagemaker-entry-point') -logger.setLevel(logging.INFO) +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) def download_and_extract_s3(s3_uri, target_dir="/opt/ml/code"): @@ -51,38 +52,26 @@ def install_requirements(requirements_path): logger.info(f"No requirements file found at {requirements_path}") -def setup_environment(): - """Set up SageMaker environment variables.""" - env_vars = { - "SM_MODEL_DIR": "/opt/ml/model", - "SM_OUTPUT_DATA_DIR": "/opt/ml/output/data", - "SM_CHANNEL_TRAIN": "/opt/ml/input/data/train", - "SM_OUTPUT_DIR": "/opt/ml/output", - "SM_INPUT_DIR": "/opt/ml/input", - "SM_INPUT_CONFIG_DIR": "/opt/ml/input/config" - } +def include_code_and_meta_for_inference(model_dir, code_dir, entry_point): + """Include code and some metadata for the inference container""" + logger.info("Including code and metadata for inference...") - for key, value in env_vars.items(): - os.environ[key] = str(value) - os.makedirs(value, exist_ok=True) + # Create inference metadata file + inference_metadata = {"inference_script": entry_point} - logger.info(f"SageMaker environment initialized.") + # Write metadata to model directory + metadata_path = os.path.join(model_dir, "inference-metadata.json") + with open(metadata_path, "w") as fp: + json.dump(inference_metadata, fp) + + # Copy code to model directory + for file in os.listdir(code_dir): 
+ shutil.copy2(os.path.join(code_dir, file), model_dir) def main(): logger.info("Starting Workbench training container...") - # Debug available environment variables - logger.info("Available environment variables:") - for key in os.environ: - logger.info(f" {key}: {os.environ[key]}") - - # Recursively list out all files in /opt/ml - logger.info("Contents of /opt/ml:") - for root, dirs, files in os.walk("/opt/ml"): - for file in files: - logger.info(f" {root}/{file}") - # Load hyperparameters hyperparams_path = '/opt/ml/input/config/hyperparameters.json' if not os.path.exists(hyperparams_path): @@ -116,29 +105,35 @@ def main(): # Install requirements if present install_requirements(os.path.join(code_directory, "requirements.txt")) - # Set up environment variables - setup_environment() - - # Find training script (entry point) - entry_point = os.path.join(code_directory, training_script) - if not os.path.exists(entry_point): - logger.error(f"Entry point not found: {entry_point}") + # Find training script + training_script_path = os.path.join(code_directory, training_script) + if not os.path.exists(training_script_path): + logger.error(f"Training script not found: {training_script_path}") sys.exit(1) - logger.info(f"Executing: {entry_point}") + logger.info(f"Executing: {training_script_path}") - # Execute the training script with SageMaker arguments - cmd = [ - sys.executable, entry_point, - "--model-dir", os.environ["SM_MODEL_DIR"], - "--output-data-dir", os.environ["SM_OUTPUT_DATA_DIR"], - "--train", os.environ["SM_CHANNEL_TRAIN"] - ] + # Add the code directory to the Python path + os.environ["PYTHONPATH"] = f"{code_directory}:{os.environ.get('PYTHONPATH', '')}" + # Call the training script and then include code and meta for inference try: - os.execv(sys.executable, cmd) - except Exception as e: - logger.error(f"Failed to execute entry point: {e}") + subprocess.check_call([ + sys.executable, training_script_path, + "--model-dir", os.environ.get("SM_MODEL_DIR", 
"/opt/ml/model"), + "--output-data-dir", os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data"), + "--train", os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"), + ]) + + # After training completes, include code and meta in the model.tar.gz + include_code_and_meta_for_inference( + model_dir=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"), + code_dir=code_directory, + entry_point=training_script + ) + + except subprocess.CalledProcessError as e: + logger.error(f"Failed to execute training script: {e}") sys.exit(1) From fefca7805fecf7421c5240fd989d89f0064eee74 Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sun, 2 Mar 2025 10:24:41 -0700 Subject: [PATCH 24/35] changing script args so they don't fail if ENV vars aren't set --- .../custom_models/chem_info/molecular_descriptors.py | 9 ++++++--- .../custom_models/chem_info/morgan_fingerprints.py | 9 ++++++--- .../custom_models/chem_info/tautomerize.py | 9 ++++++--- .../custom_script_example/custom_model_script.py | 9 ++++++--- .../light_quant_regression/quant_regression.template | 8 ++++---- .../light_scikit_learn/scikit_learn.template | 10 ++++++---- .../model_scripts/light_xgb_model/xgb_model.template | 9 +++++---- 7 files changed, 39 insertions(+), 24 deletions(-) diff --git a/src/workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py b/src/workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py index c71e81934..8a6c248b5 100644 --- a/src/workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +++ b/src/workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py @@ -22,10 +22,13 @@ # and save the model artifacts to the model directory. 
# if __name__ == "__main__": + # Script arguments for input/output directories parser = argparse.ArgumentParser() - parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) - parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) - parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) + parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model")) + parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train")) + parser.add_argument( + "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data") + ) args = parser.parse_args() # This model doesn't get trained, it just a feature creation 'model' diff --git a/src/workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py b/src/workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py index 4fede9442..a3889715a 100644 --- a/src/workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +++ b/src/workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py @@ -24,10 +24,13 @@ # and save the model artifacts to the model directory. 
# if __name__ == "__main__": + # Script arguments for input/output directories parser = argparse.ArgumentParser() - parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) - parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) - parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) + parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model")) + parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train")) + parser.add_argument( + "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data") + ) args = parser.parse_args() # This model doesn't get trained, it just a feature creation 'model' diff --git a/src/workbench/model_scripts/custom_models/chem_info/tautomerize.py b/src/workbench/model_scripts/custom_models/chem_info/tautomerize.py index 72c2afe34..16e479a61 100644 --- a/src/workbench/model_scripts/custom_models/chem_info/tautomerize.py +++ b/src/workbench/model_scripts/custom_models/chem_info/tautomerize.py @@ -23,10 +23,13 @@ # This section (__main__) is where SageMaker will execute the job and save the model artifacts. 
# if __name__ == "__main__": + # Script arguments for input/output directories parser = argparse.ArgumentParser() - parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) - parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) - parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) + parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model")) + parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train")) + parser.add_argument( + "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data") + ) args = parser.parse_args() # This model doesn't get trained; it's a feature processing 'model' diff --git a/src/workbench/model_scripts/custom_script_example/custom_model_script.py b/src/workbench/model_scripts/custom_script_example/custom_model_script.py index 3e2a8db0a..e6492ba3d 100644 --- a/src/workbench/model_scripts/custom_script_example/custom_model_script.py +++ b/src/workbench/model_scripts/custom_script_example/custom_model_script.py @@ -48,10 +48,13 @@ def expand_proba_column(df: pd.DataFrame, class_labels: list) -> pd.DataFrame: # and save the model artifacts to the model directory. 
# if __name__ == "__main__": + # Script arguments for input/output directories parser = argparse.ArgumentParser() - parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) - parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) - parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) + parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model")) + parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train")) + parser.add_argument( + "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data") + ) args = parser.parse_args() # Load the training data diff --git a/src/workbench/model_scripts/light_quant_regression/quant_regression.template b/src/workbench/model_scripts/light_quant_regression/quant_regression.template index 8ea2a6e6d..109ca190a 100644 --- a/src/workbench/model_scripts/light_quant_regression/quant_regression.template +++ b/src/workbench/model_scripts/light_quant_regression/quant_regression.template @@ -86,13 +86,13 @@ if __name__ == "__main__": quantiles = [0.05, 0.25, 0.50, 0.75, 0.95] q_models = {} - # Sagemaker specific arguments. Defaults are set in the environment variables. 
+ # Script arguments for input/output directories parser = argparse.ArgumentParser() + parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model")) + parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train")) parser.add_argument( - "--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"] + "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data") ) - parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) - parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) args = parser.parse_args() # Read the training data into DataFrames diff --git a/src/workbench/model_scripts/light_scikit_learn/scikit_learn.template b/src/workbench/model_scripts/light_scikit_learn/scikit_learn.template index e98f752ce..f0deaf1d4 100644 --- a/src/workbench/model_scripts/light_scikit_learn/scikit_learn.template +++ b/src/workbench/model_scripts/light_scikit_learn/scikit_learn.template @@ -89,11 +89,13 @@ if __name__ == "__main__": train_all_data = TEMPLATE_PARAMS["train_all_data"] validation_split = 0.2 - # SageMaker arguments for input/output directories + # Script arguments for input/output directories parser = argparse.ArgumentParser() - parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) - parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) - parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) + parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model")) + parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train")) + parser.add_argument( + "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data") + ) args = parser.parse_args() # Load training data from the 
specified directory diff --git a/src/workbench/model_scripts/light_xgb_model/xgb_model.template b/src/workbench/model_scripts/light_xgb_model/xgb_model.template index 9c53a4d90..e0a7fc9c0 100644 --- a/src/workbench/model_scripts/light_xgb_model/xgb_model.template +++ b/src/workbench/model_scripts/light_xgb_model/xgb_model.template @@ -131,15 +131,16 @@ if __name__ == "__main__": train_all_data = TEMPLATE_PARAMS["train_all_data"] validation_split = 0.2 - # Sagemaker specific arguments. Defaults are set in the environment variables. + # Script arguments for input/output directories parser = argparse.ArgumentParser() + parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model")) + parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train")) parser.add_argument( - "--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"] + "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data") ) - parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) - parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) args = parser.parse_args() + # Read the training data into DataFrames training_files = [ os.path.join(args.train, file) From 951511ba4dd58c684e5ddb92bd81d59c9c7a8ec7 Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sun, 2 Mar 2025 10:41:15 -0700 Subject: [PATCH 25/35] changing script args so they don't fail if ENV vars aren't set --- model_docker_images/tests/example_model_script.py | 10 +++++----- .../custom_script_example/custom_model_script.py | 2 +- .../light_quant_regression/quant_regression.template | 2 +- .../light_scikit_learn/scikit_learn.template | 2 +- .../model_scripts/light_xgb_model/xgb_model.template | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/model_docker_images/tests/example_model_script.py b/model_docker_images/tests/example_model_script.py 
index bb736ac7c..6a2569c16 100644 --- a/model_docker_images/tests/example_model_script.py +++ b/model_docker_images/tests/example_model_script.py @@ -131,13 +131,13 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p train_all_data = TEMPLATE_PARAMS["train_all_data"] validation_split = 0.2 - # Sagemaker specific arguments. Defaults are set in the environment variables. + # Script arguments for input/output directories parser = argparse.ArgumentParser() + parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model")) + parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train")) parser.add_argument( - "--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"] + "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data") ) - parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) - parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) args = parser.parse_args() # Read the training data into DataFrames @@ -342,7 +342,7 @@ def predict_fn(df, model) -> pd.DataFrame: """ # Grab our feature columns (from training) - model_dir = os.environ["SM_MODEL_DIR"] + model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model") with open(os.path.join(model_dir, "feature_columns.json")) as fp: model_features = json.load(fp) print(f"Model Features: {model_features}") diff --git a/src/workbench/model_scripts/custom_script_example/custom_model_script.py b/src/workbench/model_scripts/custom_script_example/custom_model_script.py index e6492ba3d..c36d4ff15 100644 --- a/src/workbench/model_scripts/custom_script_example/custom_model_script.py +++ b/src/workbench/model_scripts/custom_script_example/custom_model_script.py @@ -147,7 +147,7 @@ def output_fn(output_df, accept_type): # Prediction function def predict_fn(df, model): - model_dir = os.environ["SM_MODEL_DIR"] + 
model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model") with open(os.path.join(model_dir, "feature_columns.json")) as fp: model_features = json.load(fp) diff --git a/src/workbench/model_scripts/light_quant_regression/quant_regression.template b/src/workbench/model_scripts/light_quant_regression/quant_regression.template index 109ca190a..f638c5f75 100644 --- a/src/workbench/model_scripts/light_quant_regression/quant_regression.template +++ b/src/workbench/model_scripts/light_quant_regression/quant_regression.template @@ -280,7 +280,7 @@ def predict_fn(df, models) -> pd.DataFrame: """ # Grab our feature columns (from training) - model_dir = os.environ["SM_MODEL_DIR"] + model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model") with open(os.path.join(model_dir, "feature_columns.json")) as fp: model_features = json.load(fp) print(f"Model Features: {model_features}") diff --git a/src/workbench/model_scripts/light_scikit_learn/scikit_learn.template b/src/workbench/model_scripts/light_scikit_learn/scikit_learn.template index f0deaf1d4..f79565947 100644 --- a/src/workbench/model_scripts/light_scikit_learn/scikit_learn.template +++ b/src/workbench/model_scripts/light_scikit_learn/scikit_learn.template @@ -244,7 +244,7 @@ def output_fn(output_df, accept_type): def predict_fn(df, model): """Make predictions or apply transformations using the model and return the DataFrame with results.""" - model_dir = os.environ["SM_MODEL_DIR"] + model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model") # Load feature columns from the saved file with open(os.path.join(model_dir, "feature_columns.json")) as fp: diff --git a/src/workbench/model_scripts/light_xgb_model/xgb_model.template b/src/workbench/model_scripts/light_xgb_model/xgb_model.template index e0a7fc9c0..f02fca231 100644 --- a/src/workbench/model_scripts/light_xgb_model/xgb_model.template +++ b/src/workbench/model_scripts/light_xgb_model/xgb_model.template @@ -340,7 +340,7 @@ def predict_fn(df, model) -> pd.DataFrame: """ # 
Grab our feature columns (from training) - model_dir = os.environ["SM_MODEL_DIR"] + model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model") with open(os.path.join(model_dir, "feature_columns.json")) as fp: model_features = json.load(fp) print(f"Model Features: {model_features}") From e56fe74ef5a157c2d7b2a6469749c79a7c26752e Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sun, 2 Mar 2025 11:05:49 -0700 Subject: [PATCH 26/35] changing logic for copying code files/directories --- model_docker_images/training/sagemaker_entrypoint.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/model_docker_images/training/sagemaker_entrypoint.py b/model_docker_images/training/sagemaker_entrypoint.py index 19371b01b..2fa251bbc 100644 --- a/model_docker_images/training/sagemaker_entrypoint.py +++ b/model_docker_images/training/sagemaker_entrypoint.py @@ -64,9 +64,10 @@ def include_code_and_meta_for_inference(model_dir, code_dir, entry_point): with open(metadata_path, "w") as fp: json.dump(inference_metadata, fp) - # Copy code to model directory - for file in os.listdir(code_dir): - shutil.copy2(os.path.join(code_dir, file), model_dir) + # Copy code to model directory, copy ALL files and directories recursively (except __pycache__) + for item in os.listdir(code_dir): + if item != "__pycache__": + shutil.copytree(os.path.join(code_dir, item), os.path.join(model_dir, item)) def main(): From f4242221a4f745e8f8f68de79c2593ebcba2ad5a Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sun, 2 Mar 2025 11:27:20 -0700 Subject: [PATCH 27/35] PYTHONPATH doesn't work with importlib, so use sys.path --- model_docker_images/inference/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/model_docker_images/inference/main.py b/model_docker_images/inference/main.py index e40ab5a36..849d79fd1 100644 --- a/model_docker_images/inference/main.py +++ b/model_docker_images/inference/main.py @@ -51,8 +51,8 @@ async def lifespan(app: FastAPI): if not 
os.path.exists(inference_script_path): raise FileNotFoundError(f"Inference script not found: {inference_script_path}") - # Add the code directory to the Python path - os.environ["PYTHONPATH"] = f"{model_dir}:{os.environ.get('PYTHONPATH', '')}" + # Ensure the model directory is in the Python path + sys.path.insert(0, model_dir) # Import the inference module logger.info(f"Importing inference module from {inference_script_path}") From 56a59ae9371afbe23b64d835b2d0dad0c03bdebd Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sun, 2 Mar 2025 11:28:51 -0700 Subject: [PATCH 28/35] fixing the file/dir copy from code to model dir --- model_docker_images/training/sagemaker_entrypoint.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/model_docker_images/training/sagemaker_entrypoint.py b/model_docker_images/training/sagemaker_entrypoint.py index 2fa251bbc..bca57544f 100644 --- a/model_docker_images/training/sagemaker_entrypoint.py +++ b/model_docker_images/training/sagemaker_entrypoint.py @@ -65,9 +65,13 @@ def include_code_and_meta_for_inference(model_dir, code_dir, entry_point): json.dump(inference_metadata, fp) # Copy code to model directory, copy ALL files and directories recursively (except __pycache__) + # Also list all files/directories that are being copied for item in os.listdir(code_dir): - if item != "__pycache__": - shutil.copytree(os.path.join(code_dir, item), os.path.join(model_dir, item)) + if item == "__pycache__": + continue + src, dst = os.path.join(code_dir, item), os.path.join(model_dir, item) + shutil.copytree(src, dst, dirs_exist_ok=True) if os.path.isdir(src) else shutil.copy2(src, dst) + logger.info(f"Copied: {src} -> {dst}") def main(): From 8b7af3351aa68e3cb02782b0cee89e08517b8b7f Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sun, 2 Mar 2025 11:38:53 -0700 Subject: [PATCH 29/35] flake8/linter cleanup --- .../tests/example_model_script.py | 31 +++--- model_docker_images/tests/test_inference.py | 95 ++++++++----------- 
model_docker_images/tests/test_training.py | 53 ++++++----- .../training/sagemaker_entrypoint.py | 40 ++++---- src/workbench/core/artifacts/model_core.py | 2 - 5 files changed, 110 insertions(+), 111 deletions(-) diff --git a/model_docker_images/tests/example_model_script.py b/model_docker_images/tests/example_model_script.py index 6a2569c16..11a1d0767 100644 --- a/model_docker_images/tests/example_model_script.py +++ b/model_docker_images/tests/example_model_script.py @@ -2,9 +2,18 @@ TEMPLATE_PARAMS = { "model_type": "regressor", "target_column": "class_number_of_rings", - "feature_list": ['length', 'diameter', 'height', 'whole_weight', 'shucked_weight', 'viscera_weight', 'shell_weight', 'auto_id'], + "feature_list": [ + "length", + "diameter", + "height", + "whole_weight", + "shucked_weight", + "viscera_weight", + "shell_weight", + "auto_id", + ], "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/training/abalone-regression", - "train_all_data": False + "train_all_data": False, } # Imports for XGB Model @@ -141,11 +150,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p args = parser.parse_args() # Read the training data into DataFrames - training_files = [ - os.path.join(args.train, file) - for file in os.listdir(args.train) - if file.endswith(".csv") - ] + training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")] print(f"Training Files: {training_files}") # Combine files and read them all into a single pandas dataframe @@ -172,9 +177,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p else: # Just do a random training Split print("WARNING: No training column found, splitting data with random state=42") - df_train, df_val = train_test_split( - all_df, test_size=validation_split, random_state=42 - ) + df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42) print(f"FIT/TRAIN: {df_train.shape}") 
print(f"VALIDATION: {df_val.shape}") @@ -233,9 +236,7 @@ def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> p label_names = label_encoder.classes_ # Calculate various model performance metrics - scores = precision_recall_fscore_support( - df_val[target], preds, average=None, labels=label_names - ) + scores = precision_recall_fscore_support(df_val[target], preds, average=None, labels=label_names) # Put the scores into a dataframe score_df = pd.DataFrame( @@ -289,7 +290,9 @@ def model_fn(model_dir): model_path = os.path.join(model_dir, "xgb_model.json") with open(model_path, "r") as f: model_json = json.load(f) - saved_model_type = json.loads(model_json.get('learner').get('attributes').get('scikit_learn')).get('_estimator_type') + saved_model_type = json.loads(model_json.get("learner").get("attributes").get("scikit_learn")).get( + "_estimator_type" + ) if saved_model_type == "classifier": model = xgb.XGBClassifier() elif saved_model_type == "regressor": diff --git a/model_docker_images/tests/test_inference.py b/model_docker_images/tests/test_inference.py index fdef54fd9..8520f1b6e 100644 --- a/model_docker_images/tests/test_inference.py +++ b/model_docker_images/tests/test_inference.py @@ -66,20 +66,17 @@ def deploy(self, instance_type=None, initial_instance_count=1, endpoint_name=Non import xgboost as xgb # Train a simple model - model = xgb.XGBRegressor(objective='reg:squarederror') + model = xgb.XGBRegressor(objective="reg:squarederror") X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) y = np.array([10, 20, 30]) model.fit(X, y) # Save the model - joblib.dump(model, os.path.join(model_dir, 'model.joblib')) + joblib.dump(model, os.path.join(model_dir, "model.joblib")) # Save metadata - with open(os.path.join(model_dir, 'metadata.json'), 'w') as f: - json.dump({ - 'feature_names': ['feature1', 'feature2', 'feature3'], - 'model_type': 'regression' - }, f) + with open(os.path.join(model_dir, "metadata.json"), "w") as f: + 
json.dump({"feature_names": ["feature1", "feature2", "feature3"], "model_type": "regression"}, f) self.model_data = model_dir else: @@ -88,21 +85,27 @@ def deploy(self, instance_type=None, initial_instance_count=1, endpoint_name=Non # Start the container cmd = [ - "docker", "run", "-d", "--rm", - "-p", "8080:8080", - "-v", f"{model_dir}:/opt/ml/model", - "-e", "MODEL_PATH=/opt/ml/model", + "docker", + "run", + "-d", + "--rm", + "-p", + "8080:8080", + "-v", + f"{model_dir}:/opt/ml/model", + "-e", + "MODEL_PATH=/opt/ml/model", ] # Add platform flag for Mac M1/M2/M3 users - if os.uname().machine == 'arm64': + if os.uname().machine == "arm64": cmd.insert(2, "--platform") cmd.insert(3, "linux/amd64") # Add the image URI cmd.append(self.image_uri) print(f"Starting inference container: {' '.join(cmd)}") - self.container_id = subprocess.check_output(cmd).decode('utf-8').strip() + self.container_id = subprocess.check_output(cmd).decode("utf-8").strip() # Add this block immediately after starting the container print(f"Container ID: {self.container_id}") @@ -111,14 +114,14 @@ def deploy(self, instance_type=None, initial_instance_count=1, endpoint_name=Non time.sleep(1) # Get container logs - logs = subprocess.check_output( - ["docker", "logs", self.container_id], stderr=subprocess.STDOUT - ).decode('utf-8') + logs = subprocess.check_output(["docker", "logs", self.container_id], stderr=subprocess.STDOUT).decode( + "utf-8" + ) print(f"Container startup logs:\n{logs}") except Exception as e: print(f"Error getting container logs: {e}") - self.endpoint_url = 'http://localhost:8080' + self.endpoint_url = "http://localhost:8080" return MockEndpoint(self) @@ -133,17 +136,19 @@ def __init__(self, model): # Check container status and logs try: # Get container state - inspect_output = subprocess.check_output( - ["docker", "inspect", "--format", "{{.State.Status}}", model.container_id] - ).decode('utf-8').strip() + inspect_output = ( + subprocess.check_output(["docker", "inspect", 
"--format", "{{.State.Status}}", model.container_id]) + .decode("utf-8") + .strip() + ) print(f"Container status: {inspect_output}") # If not running, get the logs if inspect_output != "running": - logs = subprocess.check_output( - ["docker", "logs", model.container_id], stderr=subprocess.STDOUT - ).decode('utf-8') + logs = subprocess.check_output(["docker", "logs", model.container_id], stderr=subprocess.STDOUT).decode( + "utf-8" + ) print(f"Container logs:\n{logs}") raise RuntimeError("Container failed to start properly") except Exception as e: @@ -161,10 +166,10 @@ def predict(self, data, initial_args=None): The prediction result """ # Default to first registered content type - content_type = self.model.content_types[0] if hasattr(self.model, 'content_types') else 'application/json' + content_type = self.model.content_types[0] if hasattr(self.model, "content_types") else "application/json" # Format the data according to content type - if content_type == 'text/csv': + if content_type == "text/csv": if isinstance(data, pd.DataFrame): payload = data.to_csv(header=False, index=False) elif isinstance(data, (list, np.ndarray)): @@ -174,26 +179,22 @@ def predict(self, data, initial_args=None): else: # Default to JSON if isinstance(data, pd.DataFrame): - payload = data.to_json(orient='records') + payload = data.to_json(orient="records") elif isinstance(data, (list, np.ndarray)): - payload = json.dumps({"instances": data.tolist() if hasattr(data, 'tolist') else data}) + payload = json.dumps({"instances": data.tolist() if hasattr(data, "tolist") else data}) else: payload = json.dumps(data) # Send the request to the container try: - response = requests.post( - f"{self.url}/invocations", - data=payload, - headers={"Content-Type": content_type} - ) + response = requests.post(f"{self.url}/invocations", data=payload, headers={"Content-Type": content_type}) # Check for errors if response.status_code != 200: raise Exception(f"Prediction failed with status code 
{response.status_code}: {response.text}") # Parse response based on response type - if hasattr(self.model, 'response_types') and 'text/csv' in self.model.response_types: + if hasattr(self.model, "response_types") and "text/csv" in self.model.response_types: # Parse CSV response return pd.read_csv(StringIO(response.text), header=None) else: @@ -226,10 +227,7 @@ def test_csv_inference(endpoint, test_data=None): if test_data is None: # Create sample test data - test_data = pd.DataFrame([ - [1.0, 2.0, 3.0], - [4.0, 5.0, 6.0] - ]) + test_data = pd.DataFrame([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) try: response = endpoint.predict(test_data) @@ -247,10 +245,7 @@ def test_json_inference(endpoint, test_data=None): if test_data is None: # Create sample test data - use list of lists of floats - test_data = [ - [1.0, 2.0, 3.0], - [4.0, 5.0, 6.0] - ] + test_data = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]] try: response = endpoint.predict(test_data) @@ -282,7 +277,9 @@ def test_ping_endpoint(url): def main(): """Run the test using MockModel and MockEndpoint""" parser = argparse.ArgumentParser(description="Test SageMaker inference container") - parser.add_argument("--image", type=str, default="aws-ml-images/py312-sklearn-xgb-inference:0.1", help="Inference image name:tag") + parser.add_argument( + "--image", type=str, default="aws-ml-images/py312-sklearn-xgb-inference:0.1", help="Inference image name:tag" + ) parser.add_argument("--model-dir", type=str, default=None, help="Path to model directory (optional)") args = parser.parse_args() @@ -295,11 +292,7 @@ def main(): try: # Create and deploy the model - model = MockModel( - image_uri=args.image, - model_data=args.model_dir, - role="mock-role" - ) + model = MockModel(image_uri=args.image, model_data=args.model_dir, role="mock-role") # Register the model model.register( @@ -307,15 +300,11 @@ def main(): response_types=["text/csv", "application/json"], inference_instances=["ml.t2.medium"], transform_instances=["ml.m5.large"], - 
description="Test model" + description="Test model", ) # Deploy the model - endpoint = model.deploy( - instance_type="local", - initial_instance_count=1, - endpoint_name="test-endpoint" - ) + endpoint = model.deploy(instance_type="local", initial_instance_count=1, endpoint_name="test-endpoint") # Test the /ping endpoint ping_success = test_ping_endpoint(endpoint.url) diff --git a/model_docker_images/tests/test_training.py b/model_docker_images/tests/test_training.py index 74562bf03..f6c64c3ff 100644 --- a/model_docker_images/tests/test_training.py +++ b/model_docker_images/tests/test_training.py @@ -30,7 +30,7 @@ def fit(self, inputs, job_name=None, logs=True): print(f"Created test environment at: {self.temp_dir}") # Create directories - for path in ['input/data/train', 'input/config', 'model', 'output/data', 'code']: + for path in ["input/data/train", "input/config", "model", "output/data", "code"]: os.makedirs(f"{self.temp_dir}/{path}", exist_ok=True) # Copy data files @@ -57,7 +57,7 @@ def fit(self, inputs, job_name=None, logs=True): all_hyperparams = { **self.hyperparameters, "sagemaker_program": self.entry_point, - "sagemaker_submit_directory": "/opt/ml/code" + "sagemaker_submit_directory": "/opt/ml/code", } with open(f"{self.temp_dir}/input/config/hyperparameters.json", "w") as f: @@ -65,20 +65,30 @@ def fit(self, inputs, job_name=None, logs=True): # Run the container cmd = [ - "docker", "run", "--rm", - "-v", f"{self.temp_dir}/input:/opt/ml/input", - "-v", f"{self.temp_dir}/model:/opt/ml/model", - "-v", f"{self.temp_dir}/output:/opt/ml/output", - "-v", f"{self.temp_dir}/code:/opt/ml/code", - "-e", f"SAGEMAKER_PROGRAM={self.entry_point}", - "-e", "SM_MODEL_DIR=/opt/ml/model", - "-e", "SM_OUTPUT_DATA_DIR=/opt/ml/output/data", - "-e", "SM_CHANNEL_TRAIN=/opt/ml/input/data/train", - self.image_uri + "docker", + "run", + "--rm", + "-v", + f"{self.temp_dir}/input:/opt/ml/input", + "-v", + f"{self.temp_dir}/model:/opt/ml/model", + "-v", + 
f"{self.temp_dir}/output:/opt/ml/output", + "-v", + f"{self.temp_dir}/code:/opt/ml/code", + "-e", + f"SAGEMAKER_PROGRAM={self.entry_point}", + "-e", + "SM_MODEL_DIR=/opt/ml/model", + "-e", + "SM_OUTPUT_DATA_DIR=/opt/ml/output/data", + "-e", + "SM_CHANNEL_TRAIN=/opt/ml/input/data/train", + self.image_uri, ] # Add platform flag for Mac M1/M2/M3 users - if os.uname().machine == 'arm64': + if os.uname().machine == "arm64": cmd.insert(2, "--platform") cmd.insert(3, "linux/amd64") @@ -119,7 +129,9 @@ def cleanup(self): def main(): """Run the test using a MockEstimator""" parser = argparse.ArgumentParser(description="Test SageMaker training container") - parser.add_argument("--image", type=str, default="aws-ml-images/py312-sklearn-xgb-training:0.1", help="Training image name:tag") + parser.add_argument( + "--image", type=str, default="aws-ml-images/py312-sklearn-xgb-training:0.1", help="Training image name:tag" + ) parser.add_argument("--entry-point", type=str, default="example_model_script.py", help="Training script name") parser.add_argument("--source-dir", type=str, default="tests/", help="Directory containing training scripts") parser.add_argument("--data", type=str, default="tests/data/abalone_sm.csv", help="Training data path") @@ -135,17 +147,10 @@ def main(): print(f"Testing with image {args.image}, script {args.entry_point}") # Create and run the estimator - estimator = MockEstimator( - image_uri=args.image, - entry_point=args.entry_point, - source_dir=source_dir - ) + estimator = MockEstimator(image_uri=args.image, entry_point=args.entry_point, source_dir=source_dir) try: - estimator.fit( - inputs={"train": data_path}, - job_name="mock-training-job" - ) + estimator.fit(inputs={"train": data_path}, job_name="mock-training-job") print("✅ Training completed successfully") except Exception as e: print(f"❌ Training failed: {e}") @@ -155,4 +160,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git 
a/model_docker_images/training/sagemaker_entrypoint.py b/model_docker_images/training/sagemaker_entrypoint.py index bca57544f..ee70e355b 100644 --- a/model_docker_images/training/sagemaker_entrypoint.py +++ b/model_docker_images/training/sagemaker_entrypoint.py @@ -41,9 +41,7 @@ def install_requirements(requirements_path): if os.path.exists(requirements_path): logger.info(f"Installing dependencies from {requirements_path}...") try: - subprocess.check_call([ - sys.executable, "-m", "pip", "install", "-r", requirements_path - ]) + subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", requirements_path]) logger.info("Requirements installed successfully.") except subprocess.CalledProcessError as e: logger.error(f"Error installing requirements: {e}") @@ -78,18 +76,18 @@ def main(): logger.info("Starting Workbench training container...") # Load hyperparameters - hyperparams_path = '/opt/ml/input/config/hyperparameters.json' + hyperparams_path = "/opt/ml/input/config/hyperparameters.json" if not os.path.exists(hyperparams_path): logger.error("hyperparameters.json not found!") sys.exit(1) - with open(hyperparams_path, 'r') as f: + with open(hyperparams_path, "r") as f: hyperparams = json.load(f) logger.info(f"Hyperparameters: {hyperparams}") # Get program name from hyperparameters - if 'sagemaker_program' in hyperparams: - training_script = hyperparams['sagemaker_program'].strip('"\'') + if "sagemaker_program" in hyperparams: + training_script = hyperparams["sagemaker_program"].strip("\"'") else: logger.error("sagemaker_program not found in hyperparameters!") sys.exit(1) @@ -97,11 +95,11 @@ def main(): logger.info(f"Using training_script: {training_script}") # Get source directory from hyperparameters - if 'sagemaker_submit_directory' in hyperparams: - code_directory = hyperparams['sagemaker_submit_directory'].strip('"\'') + if "sagemaker_submit_directory" in hyperparams: + code_directory = hyperparams["sagemaker_submit_directory"].strip("\"'") # Handle S3 
vs local path - if code_directory.startswith('s3://'): + if code_directory.startswith("s3://"): code_directory = download_and_extract_s3(code_directory) elif not os.path.exists(code_directory): logger.error(f"Local code directory not found: {code_directory}") @@ -123,18 +121,24 @@ def main(): # Call the training script and then include code and meta for inference try: - subprocess.check_call([ - sys.executable, training_script_path, - "--model-dir", os.environ.get("SM_MODEL_DIR", "/opt/ml/model"), - "--output-data-dir", os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data"), - "--train", os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"), - ]) + subprocess.check_call( + [ + sys.executable, + training_script_path, + "--model-dir", + os.environ.get("SM_MODEL_DIR", "/opt/ml/model"), + "--output-data-dir", + os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data"), + "--train", + os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"), + ] + ) # After training completes, include code and meta in the model.tar.gz include_code_and_meta_for_inference( model_dir=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"), code_dir=code_directory, - entry_point=training_script + entry_point=training_script, ) except subprocess.CalledProcessError as e: @@ -142,5 +146,5 @@ def main(): sys.exit(1) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/workbench/core/artifacts/model_core.py b/src/workbench/core/artifacts/model_core.py index 50779831a..c29d864ba 100644 --- a/src/workbench/core/artifacts/model_core.py +++ b/src/workbench/core/artifacts/model_core.py @@ -51,7 +51,6 @@ class ModelImages: ("us-west-2", "inference", "0.1"): ( "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1" ), - # These are the OLD locked SKLearn images ("us-east-1", "sklearn", "1.2.1"): ( "683313688378.dkr.ecr.us-east-1.amazonaws.com/" @@ -69,7 +68,6 @@ class ModelImages: 
"246618743249.dkr.ecr.us-west-2.amazonaws.com/" "sagemaker-scikit-learn@sha256:ed242e33af079f334972acd2a7ddf74d13310d3c9a0ef3a0e9b0429ccc104dcd" ), - } @classmethod From 8868c88165926efc76dd2ef764cac48369f5a914 Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sun, 2 Mar 2025 11:57:15 -0700 Subject: [PATCH 30/35] adding install requirements.txt for inference entry point --- model_docker_images/inference/main.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/model_docker_images/inference/main.py b/model_docker_images/inference/main.py index 849d79fd1..fb30829d0 100644 --- a/model_docker_images/inference/main.py +++ b/model_docker_images/inference/main.py @@ -5,6 +5,7 @@ import json import importlib.util import logging +import subprocess # Set up logging logging.basicConfig(level=logging.INFO) @@ -32,6 +33,20 @@ def get_inference_script(model_dir: str) -> str: return config["inference_script"] +def install_requirements(requirements_path): + """Install Python dependencies from requirements file.""" + if os.path.exists(requirements_path): + logger.info(f"Installing dependencies from {requirements_path}...") + try: + subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", requirements_path]) + logger.info("Requirements installed successfully.") + except subprocess.CalledProcessError as e: + logger.error(f"Error installing requirements: {e}") + sys.exit(1) + else: + logger.info(f"No requirements file found at {requirements_path}") + + @asynccontextmanager async def lifespan(app: FastAPI): """Handle model loading on startup and cleanup on shutdown.""" @@ -51,6 +66,9 @@ async def lifespan(app: FastAPI): if not os.path.exists(inference_script_path): raise FileNotFoundError(f"Inference script not found: {inference_script_path}") + # Install requirements if present + install_requirements(os.path.join(model_dir, "requirements.txt")) + # Ensure the model directory is in the Python path sys.path.insert(0, model_dir) From 
f9b0db8e4a38d3ba631ca8c9309a616f2b321cee Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sun, 2 Mar 2025 12:34:44 -0700 Subject: [PATCH 31/35] putting in a better pip install (with cache) and better ping response --- model_docker_images/inference/main.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/model_docker_images/inference/main.py b/model_docker_images/inference/main.py index fb30829d0..6949d002c 100644 --- a/model_docker_images/inference/main.py +++ b/model_docker_images/inference/main.py @@ -6,6 +6,7 @@ import importlib.util import logging import subprocess +import site # Set up logging logging.basicConfig(level=logging.INFO) @@ -34,11 +35,29 @@ def get_inference_script(model_dir: str) -> str: def install_requirements(requirements_path): - """Install Python dependencies from requirements file.""" + """Install Python dependencies from requirements file. + Uses a persistent cache to speed up container cold starts. + Note: Inference containers don't have root access, so we + use the --user flag and add the user package path manually. 
+ """ if os.path.exists(requirements_path): logger.info(f"Installing dependencies from {requirements_path}...") + + # Define a persistent cache location + pip_cache_dir = "/opt/ml/model/.cache/pip" + os.environ["PIP_CACHE_DIR"] = pip_cache_dir + try: - subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", requirements_path]) + subprocess.check_call([ + sys.executable, "-m", "pip", "install", + "--cache-dir", pip_cache_dir, # Enable caching + "--disable-pip-version-check", + "--no-warn-script-location", + "--user", + "-r", requirements_path + ]) + # Ensure Python can find user-installed packages + sys.path.append(site.getusersitepackages()) logger.info("Requirements installed successfully.") except subprocess.CalledProcessError as e: logger.error(f"Error installing requirements: {e}") @@ -103,7 +122,8 @@ async def lifespan(app: FastAPI): @app.get("/ping") def ping(): """Health check endpoint for SageMaker.""" - return Response(status_code=200 if model else 404) + # Check if the inference module is loaded + return Response(status_code=200 if inference_module else 500) @app.post("/invocations") From 9246be4f85990c05874930581669b8eff129a979 Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sun, 2 Mar 2025 12:34:59 -0700 Subject: [PATCH 32/35] flake8/linter cleanup --- model_docker_images/inference/main.py | 29 +++++++++++++++++---------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/model_docker_images/inference/main.py b/model_docker_images/inference/main.py index 6949d002c..7cf6fe585 100644 --- a/model_docker_images/inference/main.py +++ b/model_docker_images/inference/main.py @@ -36,9 +36,9 @@ def get_inference_script(model_dir: str) -> str: def install_requirements(requirements_path): """Install Python dependencies from requirements file. - Uses a persistent cache to speed up container cold starts. - Note: Inference containers don't have root access, so we - use the --user flag and add the user package path manually. 
+ Uses a persistent cache to speed up container cold starts. + Note: Inference containers don't have root access, so we + use the --user flag and add the user package path manually. """ if os.path.exists(requirements_path): logger.info(f"Installing dependencies from {requirements_path}...") @@ -48,14 +48,21 @@ def install_requirements(requirements_path): os.environ["PIP_CACHE_DIR"] = pip_cache_dir try: - subprocess.check_call([ - sys.executable, "-m", "pip", "install", - "--cache-dir", pip_cache_dir, # Enable caching - "--disable-pip-version-check", - "--no-warn-script-location", - "--user", - "-r", requirements_path - ]) + subprocess.check_call( + [ + sys.executable, + "-m", + "pip", + "install", + "--cache-dir", + pip_cache_dir, # Enable caching + "--disable-pip-version-check", + "--no-warn-script-location", + "--user", + "-r", + requirements_path, + ] + ) # Ensure Python can find user-installed packages sys.path.append(site.getusersitepackages()) logger.info("Requirements installed successfully.") From 9da5875326e0674e5590f56e2dd31402b5bba94d Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sun, 2 Mar 2025 13:09:06 -0700 Subject: [PATCH 33/35] new version of rdkit --- applications/compound_explorer/requirements.txt | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/applications/compound_explorer/requirements.txt b/applications/compound_explorer/requirements.txt index 6f047826d..1fc2cecb3 100644 --- a/applications/compound_explorer/requirements.txt +++ b/applications/compound_explorer/requirements.txt @@ -18,7 +18,7 @@ dash-bootstrap-templates >= 1.3.0 dash_ag_grid tabulate >= 0.9.0 shap>=0.43.0 -rdkit>=2024.3.2 +rdkit>=2024.9.5 mordredcommunity>=2.0.6 networkx>=3.2 matplotlib>=3.9.2 diff --git a/pyproject.toml b/pyproject.toml index 85085806e..20a2aca0c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ dependencies = [ "scikit-learn >=1.5.2", "joblib >= 1.3.2", "requests >= 2.26.0", - "rdkit>=2024.3.2", + 
"rdkit>=2024.9.5", "mordredcommunity>=2.0.6", ] From 4911c052135131629553cd55178b8db1dd34cf7b Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sun, 2 Mar 2025 13:10:12 -0700 Subject: [PATCH 34/35] unlocking scikit-learn version --- applications/compound_explorer/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/compound_explorer/requirements.txt b/applications/compound_explorer/requirements.txt index 1fc2cecb3..0171f8e8a 100644 --- a/applications/compound_explorer/requirements.txt +++ b/applications/compound_explorer/requirements.txt @@ -8,7 +8,7 @@ sagemaker >= 2.143 cryptography>=42.0.5 ipython>=8.17.2 xgboost>=2.0.3 -scikit-learn >=1.4.2, <= 1.5.2 +scikit-learn >=1.5.2 joblib>=1.3.2 requests>=2.32.0 plotly >= 5.18.0 From 7b44ec883aa6e0ad1b40e2c95db3d56fc3f3bce6 Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sun, 2 Mar 2025 13:43:16 -0700 Subject: [PATCH 35/35] fix test --- tests/specific/capital_tests.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/specific/capital_tests.py b/tests/specific/capital_tests.py index fcd64a87f..4ac939f74 100644 --- a/tests/specific/capital_tests.py +++ b/tests/specific/capital_tests.py @@ -6,8 +6,7 @@ @pytest.mark.long def test(): # Create a new Data Source from an S3 Path (or a local file) - source_path = "s3://workbench-public-data/common/aBaLone.CSV" - # source_path = "/full/path/to/local/file.csv" + source_path = "s3://workbench-public-data/common/abalone.csv" my_data = DataSource(source_path) pprint(my_data.summary()) pprint(my_data.details())