diff --git a/applications/compound_explorer/requirements.txt b/applications/compound_explorer/requirements.txt index 6f047826d..0171f8e8a 100644 --- a/applications/compound_explorer/requirements.txt +++ b/applications/compound_explorer/requirements.txt @@ -8,7 +8,7 @@ sagemaker >= 2.143 cryptography>=42.0.5 ipython>=8.17.2 xgboost>=2.0.3 -scikit-learn >=1.4.2, <= 1.5.2 +scikit-learn >=1.5.2 joblib>=1.3.2 requests>=2.32.0 plotly >= 5.18.0 @@ -18,7 +18,7 @@ dash-bootstrap-templates >= 1.3.0 dash_ag_grid tabulate >= 0.9.0 shap>=0.43.0 -rdkit>=2024.3.2 +rdkit>=2024.9.5 mordredcommunity>=2.0.6 networkx>=3.2 matplotlib>=3.9.2 diff --git a/model_docker_images/Readme.md b/model_docker_images/Readme.md new file mode 100644 index 000000000..e69de29bb diff --git a/model_docker_images/inference/Dockerfile b/model_docker_images/inference/Dockerfile new file mode 100644 index 000000000..6433484bd --- /dev/null +++ b/model_docker_images/inference/Dockerfile @@ -0,0 +1,27 @@ +FROM python:3.12-slim + +# Install Vim +RUN apt-get update && apt-get install -y vim + +# Copy requirements file +COPY requirements.txt /tmp/ + +# Install dependencies +RUN pip install --no-cache-dir -r /tmp/requirements.txt + +# Add the serve script +COPY serve /usr/local/bin/ +RUN chmod +x /usr/local/bin/serve + +# Copy the main.py/entrypoint script +COPY main.py /opt/program/ +WORKDIR /opt/program + +# Make port 8080 available for the web server +EXPOSE 8080 + +# Define environment variable +ENV PYTHONUNBUFFERED=TRUE + +# SageMaker will look for this +CMD ["serve"] \ No newline at end of file diff --git a/model_docker_images/inference/main.py b/model_docker_images/inference/main.py new file mode 100644 index 000000000..7cf6fe585 --- /dev/null +++ b/model_docker_images/inference/main.py @@ -0,0 +1,150 @@ +from fastapi import FastAPI, Request, Response +from contextlib import asynccontextmanager +import os +import sys +import json +import importlib.util +import logging +import subprocess +import site + +# 
Set up logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# Global variables +model = None +inference_module = None + + +def get_inference_script(model_dir: str) -> str: + """Retrieve the inference script name + + Args: + model_dir (str): The directory containing the model artifacts + + Returns: + str: The name of the inference script + """ + + # Get the path to the inference-metadata.json file + inference_meta_path = os.path.join(model_dir, "inference-metadata.json") + with open(inference_meta_path, "r") as f: + config = json.load(f) + return config["inference_script"] + + +def install_requirements(requirements_path): + """Install Python dependencies from requirements file. + Uses a persistent cache to speed up container cold starts. + Note: Inference containers don't have root access, so we + use the --user flag and add the user package path manually. + """ + if os.path.exists(requirements_path): + logger.info(f"Installing dependencies from {requirements_path}...") + + # Define a persistent cache location + pip_cache_dir = "/opt/ml/model/.cache/pip" + os.environ["PIP_CACHE_DIR"] = pip_cache_dir + + try: + subprocess.check_call( + [ + sys.executable, + "-m", + "pip", + "install", + "--cache-dir", + pip_cache_dir, # Enable caching + "--disable-pip-version-check", + "--no-warn-script-location", + "--user", + "-r", + requirements_path, + ] + ) + # Ensure Python can find user-installed packages + sys.path.append(site.getusersitepackages()) + logger.info("Requirements installed successfully.") + except subprocess.CalledProcessError as e: + logger.error(f"Error installing requirements: {e}") + sys.exit(1) + else: + logger.info(f"No requirements file found at {requirements_path}") + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Handle model loading on startup and cleanup on shutdown.""" + global model, inference_module + + # Note: SageMaker will put model.tar.gz in /opt/ml/model + # which includes the model 
artifacts and inference code + model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model") + inference_script = get_inference_script(model_dir) + + # List directory contents for debugging + logger.info(f"Contents of {model_dir}: {os.listdir(model_dir)}") + + try: + # Load the inference script from source_dir + inference_script_path = os.path.join(model_dir, inference_script) + if not os.path.exists(inference_script_path): + raise FileNotFoundError(f"Inference script not found: {inference_script_path}") + + # Install requirements if present + install_requirements(os.path.join(model_dir, "requirements.txt")) + + # Ensure the model directory is in the Python path + sys.path.insert(0, model_dir) + + # Import the inference module + logger.info(f"Importing inference module from {inference_script_path}") + spec = importlib.util.spec_from_file_location("inference_module", inference_script_path) + inference_module = importlib.util.module_from_spec(spec) + sys.modules["inference_module"] = inference_module + spec.loader.exec_module(inference_module) + + # Check if model_fn is defined + if not hasattr(inference_module, "model_fn"): + raise ImportError(f"Inference module {inference_script_path} does not define model_fn") + + # Load the model using model_fn + logger.info("Calling model_fn to load the model") + model = inference_module.model_fn(model_dir) + logger.info(f"Model loaded successfully: {type(model)}") + + except Exception as e: + logger.error(f"Error initializing model: {e}", exc_info=True) + raise + + yield + + logger.info("Shutting down model server") + + +app = FastAPI(lifespan=lifespan) + + +@app.get("/ping") +def ping(): + """Health check endpoint for SageMaker.""" + # Check if the inference module is loaded + return Response(status_code=200 if inference_module else 500) + + +@app.post("/invocations") +async def invoke(request: Request): + """Inference endpoint for SageMaker.""" + content_type = request.headers.get("Content-Type", "") + accept_type = 
request.headers.get("Accept", "") + + try: + body = await request.body() + data = inference_module.input_fn(body, content_type) + result = inference_module.predict_fn(data, model) + output_data, output_content_type = inference_module.output_fn(result, accept_type) + return Response(content=output_data, media_type=output_content_type) + except Exception as e: + logger.error(f"Error during inference: {e}", exc_info=True) + return Response(content=json.dumps({"error": str(e)}), status_code=500, media_type="application/json") diff --git a/model_docker_images/inference/requirements.txt b/model_docker_images/inference/requirements.txt new file mode 100644 index 000000000..ea8a26be8 --- /dev/null +++ b/model_docker_images/inference/requirements.txt @@ -0,0 +1,7 @@ +fastapi==0.115.10 +uvicorn==0.34.0 +scikit-learn==1.6.1 +xgboost-cpu==2.1.4 +pandas==2.2.3 +awswrangler==3.11.0 +joblib==1.4.2 \ No newline at end of file diff --git a/model_docker_images/inference/serve b/model_docker_images/inference/serve new file mode 100755 index 000000000..93d3d58fd --- /dev/null +++ b/model_docker_images/inference/serve @@ -0,0 +1,6 @@ +#!/bin/bash + +# SageMaker expect a 'serve' script to be found in the container which starts the model server. 
+ +# Start the FastAPI server using Uvicorn +exec uvicorn main:app --host 0.0.0.0 --port 8080 \ No newline at end of file diff --git a/model_docker_images/scripts/build_deploy.sh b/model_docker_images/scripts/build_deploy.sh new file mode 100755 index 000000000..d5829f7d5 --- /dev/null +++ b/model_docker_images/scripts/build_deploy.sh @@ -0,0 +1,152 @@ +#!/bin/bash +set -e + +# Get the directory of this script +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)" +# Get the parent directory (project root) +PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" + +# AWS Account ID +AWS_ACCOUNT_ID="507740646243" + +# Define repository names - used for both local and ECR images +TRAINING_REPO="aws-ml-images/py312-sklearn-xgb-training" +INFERENCE_REPO="aws-ml-images/py312-sklearn-xgb-inference" + +# Local directories +TRAINING_DIR="$PROJECT_ROOT/training" +INFERENCE_DIR="$PROJECT_ROOT/inference" + +# Image version +IMAGE_VERSION=${1:-"0.1"} + +# Expect AWS_PROFILE to be set in the environment when deploying +if [ "$2" == "--deploy" ]; then + : "${AWS_PROFILE:?AWS_PROFILE environment variable is not set.}" +fi + +# Define the regions to deploy to. +REGION_LIST=("us-east-1" "us-west-2") + +# Colors for output +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Parse arguments +DEPLOY=false +LATEST=false +for arg in "$@"; do + case $arg in + --deploy) + DEPLOY=true + ;; + --latest) + LATEST=true + ;; + *) + ;; + esac +done + +# Function to build a Docker image +build_image() { + local dir=$1 + local repo_name=$2 + local tag=$3 + local full_name="${repo_name}:${tag}" + + echo -e "${YELLOW}Building image: ${full_name}${NC}" + + # Check if Dockerfile exists + if [ ! -f "$dir/Dockerfile" ]; then + echo "โŒ Error: Dockerfile not found in $dir" + return 1 + fi + + # Build the image for AMD64 architecture + echo "Building local Docker image ${full_name} for linux/amd64..." 
+ docker build --platform linux/amd64 -t $full_name $dir + + echo -e "${GREEN}โœ… Successfully built: ${full_name}${NC}" + return 0 +} + +# Function to deploy an image to ECR +deploy_image() { + local repo_name=$1 + local tag=$2 + local use_latest=$3 + local full_name="${repo_name}:${tag}" + + for REGION in "${REGION_LIST[@]}"; do + echo "Processing region: ${REGION}" + # Construct the ECR repository URL + ECR_REPO="${AWS_ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/${repo_name}" + AWS_ECR_IMAGE="${ECR_REPO}:${tag}" + + echo "Logging in to AWS ECR in ${REGION}..." + aws ecr get-login-password --region ${REGION} --profile ${AWS_PROFILE} | \ + docker login --username AWS --password-stdin "${AWS_ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com" + + echo "Tagging image for AWS ECR as ${AWS_ECR_IMAGE}..." + docker tag ${full_name} ${AWS_ECR_IMAGE} + + echo "Pushing Docker image to AWS ECR: ${AWS_ECR_IMAGE}..." + docker push ${AWS_ECR_IMAGE} + + if [ "$use_latest" = true ]; then + AWS_ECR_LATEST="${ECR_REPO}:latest" + echo "Tagging AWS ECR image as latest: ${AWS_ECR_LATEST}..." + docker tag ${full_name} ${AWS_ECR_LATEST} + echo "Pushing Docker image to AWS ECR: ${AWS_ECR_LATEST}..." 
+ docker push ${AWS_ECR_LATEST} + fi + done +} + +# Build training image +echo "======================================" +echo "๐Ÿ—๏ธ Building training container" +echo "======================================" +build_image "$TRAINING_DIR" "$TRAINING_REPO" "$IMAGE_VERSION" + +# Build inference image +echo "======================================" +echo "๐Ÿ—๏ธ Building inference container" +echo "======================================" +build_image "$INFERENCE_DIR" "$INFERENCE_REPO" "$IMAGE_VERSION" + +echo "======================================" +echo -e "${GREEN}โœ… All builds completed successfully!${NC}" +echo "======================================" + +if [ "$DEPLOY" = true ]; then + echo "======================================" + echo "๐Ÿš€ Deploying containers to ECR" + echo "======================================" + + # Deploy training image + echo "Deploying training image..." + deploy_image "$TRAINING_REPO" "$IMAGE_VERSION" "$LATEST" + + # Deploy inference image + echo "Deploying inference image..." + deploy_image "$INFERENCE_REPO" "$IMAGE_VERSION" "$LATEST" + + echo "======================================" + echo -e "${GREEN}โœ… Deployment complete!${NC}" + echo "======================================" +else + echo "Local build complete. Use --deploy to push the images to AWS ECR in regions: ${REGION_LIST[*]}." 
+ + # Print information about the built images + echo "======================================" + echo "๐Ÿ“‹ Image information:" + echo "Training image: ${TRAINING_REPO}:${IMAGE_VERSION}" + echo "Inference image: ${INFERENCE_REPO}:${IMAGE_VERSION}" + echo "======================================" + + # Inform about testing option + echo "To test these containers, run: $PROJECT_ROOT/tests/run_tests.sh ${IMAGE_VERSION}" +fi diff --git a/model_docker_images/tests/data/abalone_sm.csv b/model_docker_images/tests/data/abalone_sm.csv new file mode 100644 index 000000000..0198e6bc8 --- /dev/null +++ b/model_docker_images/tests/data/abalone_sm.csv @@ -0,0 +1,100 @@ +sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,class_number_of_rings,auto_id +M,0.53,0.43,0.135,0.879,0.28,0.2165,0.25,10,3400 +M,0.645,0.49,0.16,1.251,0.5355,0.3345,0.3165,9,2614 +F,0.69,0.545,0.205,1.933,0.7855,0.429,0.498,13,2618 +I,0.55,0.4,0.135,0.717,0.3315,0.1495,0.221,9,3663 +I,0.415,0.33,0.09,0.3595,0.17,0.081,0.09,6,920 +M,0.465,0.36,0.105,0.488,0.188,0.0845,0.19,10,3137 +M,0.59,0.475,0.155,0.857,0.356,0.174,0.28,13,2225 +M,0.52,0.425,0.155,0.7735,0.297,0.123,0.255,17,3271 +M,0.695,0.525,0.175,1.742,0.696,0.389,0.505,12,2621 +F,0.585,0.42,0.155,1.034,0.437,0.2225,0.32,11,3667 +I,0.525,0.385,0.13,0.607,0.2355,0.125,0.195,8,2754 +F,0.675,0.51,0.185,1.473,0.6295,0.3025,0.4245,11,1971 +I,0.435,0.335,0.105,0.3535,0.156,0.05,0.1135,7,3016 +I,0.435,0.345,0.12,0.3215,0.13,0.056,0.1185,7,1844 +I,0.525,0.4,0.125,0.5655,0.2435,0.119,0.175,8,3810 +F,0.52,0.395,0.18,0.64,0.158,0.11,0.245,22,675 +I,0.415,0.315,0.105,0.33,0.1405,0.0705,0.095,6,2508 +I,0.415,0.325,0.115,0.3285,0.1405,0.051,0.106,12,2378 +I,0.575,0.44,0.15,0.983,0.486,0.215,0.239,8,3666 +I,0.55,0.435,0.14,0.7535,0.3285,0.1555,0.2325,10,1314 +M,0.675,0.515,0.15,1.312,0.556,0.2845,0.4115,11,1970 +I,0.43,0.325,0.09,0.425,0.217,0.087,0.095,7,926 +F,0.67,0.54,0.165,1.5015,0.518,0.358,0.505,14,420 
+M,0.745,0.565,0.215,1.931,0.896,0.4585,0.5,11,1205 +M,0.57,0.45,0.14,0.9275,0.477,0.1605,0.2515,8,3819 +F,0.605,0.48,0.175,1.1685,0.4815,0.2305,0.356,9,3822 +M,0.48,0.375,0.115,0.6765,0.3205,0.1065,0.17,6,949 +F,0.58,0.45,0.17,0.9705,0.4615,0.232,0.248,9,2908 +I,0.42,0.31,0.095,0.279,0.1255,0.051,0.088,6,1078 +M,0.705,0.56,0.22,1.981,0.8175,0.3085,0.76,14,168 +F,0.59,0.465,0.16,1.1005,0.506,0.2525,0.295,13,2259 +I,0.33,0.25,0.095,0.2085,0.102,0.0395,0.052,7,1220 +F,0.595,0.465,0.155,1.026,0.4645,0.112,0.305,12,1351 +I,0.36,0.275,0.11,0.2335,0.095,0.0525,0.085,10,440 +I,0.46,0.35,0.115,0.4155,0.18,0.098,0.1175,7,1092 +F,0.675,0.52,0.175,1.494,0.7365,0.3055,0.37,9,4100 +F,0.575,0.46,0.165,1.065,0.4985,0.2145,0.2815,8,3454 +F,0.395,0.3,0.105,0.3375,0.1435,0.0755,0.098,12,3323 +M,0.665,0.525,0.165,1.338,0.5515,0.3575,0.35,18,32 +I,0.52,0.395,0.125,0.5805,0.2445,0.146,0.165,9,1864 +I,0.585,0.475,0.16,1.0505,0.48,0.234,0.285,10,1342 +M,0.5,0.375,0.15,0.636,0.2535,0.145,0.19,10,690 +I,0.51,0.395,0.155,0.5395,0.2465,0.1085,0.167,8,2650 +I,0.315,0.23,0.08,0.1375,0.0545,0.031,0.0445,5,1217 +F,0.47,0.355,0.13,0.5465,0.2005,0.126,0.185,14,564 +M,0.58,0.47,0.165,1.041,0.54,0.166,0.279,9,3570 +F,0.55,0.425,0.135,0.8515,0.362,0.196,0.27,14,41 +F,0.47,0.36,0.13,0.472,0.182,0.114,0.15,10,304 +I,0.505,0.39,0.15,0.685,0.362,0.131,0.156,8,962 +F,0.55,0.44,0.135,0.8435,0.434,0.1995,0.185,8,2659 +I,0.45,0.345,0.135,0.443,0.1975,0.0875,0.1175,14,571 +I,0.44,0.355,0.165,0.435,0.159,0.105,0.14,16,2402 +M,0.4,0.32,0.095,0.303,0.1335,0.06,0.1,7,51 +I,0.295,0.225,0.09,0.1105,0.0405,0.0245,0.032,7,709 +I,0.445,0.355,0.095,0.3615,0.1415,0.0785,0.12,8,3540 +I,0.47,0.345,0.14,0.4615,0.229,0.1105,0.116,9,1452 +M,0.635,0.525,0.205,1.484,0.55,0.3115,0.43,20,278 +I,0.415,0.315,0.1,0.3645,0.1765,0.0795,0.095,8,2632 +I,0.435,0.335,0.11,0.383,0.1555,0.0675,0.135,12,2374 +F,0.525,0.415,0.15,0.7155,0.2355,0.171,0.27,13,3949 +I,0.55,0.445,0.145,0.783,0.3045,0.157,0.265,11,3036 
+F,0.57,0.46,0.17,1.1,0.4125,0.2205,0.38,14,2252 +M,0.515,0.4,0.14,0.6335,0.288,0.145,0.168,9,2020 +F,0.525,0.405,0.115,0.72,0.3105,0.1915,0.2,14,3192 +F,0.565,0.4,0.13,0.6975,0.3075,0.1665,0.18,8,983 +M,0.675,0.515,0.145,1.265,0.6025,0.299,0.325,10,3596 +F,0.37,0.29,0.115,0.25,0.111,0.057,0.075,9,591 +F,0.475,0.365,0.13,0.4805,0.1905,0.114,0.1475,12,2422 +F,0.55,0.415,0.18,1.1655,0.502,0.301,0.311,9,3731 +M,0.6,0.475,0.19,1.0875,0.403,0.2655,0.325,14,336 +F,0.44,0.34,0.14,0.482,0.186,0.1085,0.16,9,205 +I,0.315,0.23,0.0,0.134,0.0575,0.0285,0.3505,6,3996 +I,0.325,0.24,0.07,0.152,0.0565,0.0305,0.054,8,2041 +I,0.47,0.345,0.115,0.4885,0.2005,0.108,0.166,11,603 +F,0.505,0.375,0.18,0.568,0.2325,0.1495,0.17,12,343 +M,0.635,0.49,0.16,1.101,0.534,0.1865,0.3455,10,1389 +M,0.535,0.41,0.135,0.862,0.2855,0.1525,0.32,14,738 +F,0.595,0.435,0.15,0.9,0.4175,0.17,0.265,8,1651 +M,0.515,0.4,0.16,0.8175,0.2515,0.156,0.3,23,2436 +M,0.455,0.35,0.11,0.458,0.2,0.111,0.1305,8,3089 +I,0.42,0.315,0.115,0.355,0.1895,0.065,0.087,6,2047 +M,0.465,0.34,0.105,0.486,0.231,0.1035,0.1225,9,2571 +M,0.72,0.565,0.2,2.1055,1.017,0.363,0.494,12,1527 +F,0.54,0.415,0.15,0.8115,0.3875,0.1875,0.2035,9,2833 +F,0.655,0.455,0.17,1.275,0.583,0.303,0.333,8,3621 +M,0.675,0.525,0.185,1.587,0.6935,0.336,0.395,13,356 +F,0.555,0.43,0.135,0.812,0.4055,0.163,0.2215,9,3494 +M,0.41,0.3,0.1,0.301,0.124,0.069,0.09,9,3362 +I,0.4,0.31,0.1,0.2875,0.1145,0.0635,0.095,10,2320 +I,0.32,0.215,0.095,0.305,0.14,0.067,0.0885,6,2975 +I,0.27,0.205,0.05,0.084,0.03,0.0185,0.029,6,3629 +F,0.625,0.5,0.15,0.953,0.3445,0.2235,0.305,15,495 +M,0.59,0.47,0.15,0.9955,0.481,0.232,0.24,8,1152 +M,0.59,0.465,0.14,1.046,0.4695,0.263,0.263,7,2592 +F,0.54,0.42,0.14,0.805,0.369,0.1725,0.21,11,846 +I,0.28,0.2,0.075,0.1225,0.0545,0.0115,0.035,5,2153 +M,0.575,0.47,0.185,0.985,0.3745,0.2175,0.355,10,1636 +M,0.72,0.6,0.235,2.2385,0.984,0.411,0.621,12,3993 +M,0.655,0.53,0.195,1.388,0.567,0.2735,0.41,13,467 diff --git 
a/model_docker_images/tests/example_model_script.py b/model_docker_images/tests/example_model_script.py new file mode 100644 index 000000000..11a1d0767 --- /dev/null +++ b/model_docker_images/tests/example_model_script.py @@ -0,0 +1,382 @@ +# Template Placeholders +TEMPLATE_PARAMS = { + "model_type": "regressor", + "target_column": "class_number_of_rings", + "feature_list": [ + "length", + "diameter", + "height", + "whole_weight", + "shucked_weight", + "viscera_weight", + "shell_weight", + "auto_id", + ], + "model_metrics_s3_path": "s3://sandbox-sageworks-artifacts/models/training/abalone-regression", + "train_all_data": False, +} + +# Imports for XGB Model +import xgboost as xgb +import awswrangler as wr + +# Model Performance Scores +from sklearn.metrics import ( + mean_absolute_error, + r2_score, + root_mean_squared_error, + precision_recall_fscore_support, + confusion_matrix, +) + +# Classification Encoder +from sklearn.preprocessing import LabelEncoder + +# Scikit Learn Imports +from sklearn.model_selection import train_test_split + +from io import StringIO +import json +import argparse +import joblib +import os +import pandas as pd +from typing import List + + +# Function to check if dataframe is empty +def check_dataframe(df: pd.DataFrame, df_name: str) -> None: + """ + Check if the provided dataframe is empty and raise an exception if it is. + + Args: + df (pd.DataFrame): DataFrame to check + df_name (str): Name of the DataFrame + """ + if df.empty: + msg = f"*** The training data {df_name} has 0 rows! ***STOPPING***" + print(msg) + raise ValueError(msg) + + +def expand_proba_column(df: pd.DataFrame, class_labels: List[str]) -> pd.DataFrame: + """ + Expands a column in a DataFrame containing a list of probabilities into separate columns. 
+ + Args: + df (pd.DataFrame): DataFrame containing a "pred_proba" column + class_labels (List[str]): List of class labels + + Returns: + pd.DataFrame: DataFrame with the "pred_proba" expanded into separate columns + """ + + # Sanity check + proba_column = "pred_proba" + if proba_column not in df.columns: + raise ValueError('DataFrame does not contain a "pred_proba" column') + + # Construct new column names with '_proba' suffix + new_col_names = [f"{label}_proba" for label in class_labels] + + # Expand the proba_column into separate columns for each probability + proba_df = pd.DataFrame(df[proba_column].tolist(), columns=new_col_names) + + # Drop the original proba_column and reset the index in prep for the concat + df = df.drop(columns=[proba_column]) + df = df.reset_index(drop=True) + + # Concatenate the new columns with the original DataFrame + df = pd.concat([df, proba_df], axis=1) + print(df) + return df + + +def match_features_case_insensitive(df: pd.DataFrame, model_features: list) -> pd.DataFrame: + """ + Matches and renames the DataFrame's column names to match the model's feature names (case-insensitive). + Prioritizes exact case matches first, then falls back to case-insensitive matching if no exact match exists. + + Args: + df (pd.DataFrame): The DataFrame with the original columns. + model_features (list): The desired list of feature names (mixed case allowed). + + Returns: + pd.DataFrame: The DataFrame with renamed columns to match the model's feature names. 
+ """ + # Create a mapping for exact and case-insensitive matching + exact_match_set = set(df.columns) + column_map = {} + + # Build the case-insensitive map (if we have any duplicate columns, the first one wins) + for col in df.columns: + lower_col = col.lower() + if lower_col not in column_map: + column_map[lower_col] = col + + # Create a dictionary for renaming + rename_dict = {} + for feature in model_features: + # Check for an exact match first + if feature in exact_match_set: + rename_dict[feature] = feature + + # If not an exact match, fall back to case-insensitive matching + elif feature.lower() in column_map: + rename_dict[column_map[feature.lower()]] = feature + + # Rename the columns in the DataFrame to match the model's feature names + return df.rename(columns=rename_dict) + + +if __name__ == "__main__": + """The main function is for training the XGBoost model""" + + # Harness Template Parameters + target = TEMPLATE_PARAMS["target_column"] + feature_list = TEMPLATE_PARAMS["feature_list"] + model_type = TEMPLATE_PARAMS["model_type"] + model_metrics_s3_path = TEMPLATE_PARAMS["model_metrics_s3_path"] + train_all_data = TEMPLATE_PARAMS["train_all_data"] + validation_split = 0.2 + + # Script arguments for input/output directories + parser = argparse.ArgumentParser() + parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model")) + parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train")) + parser.add_argument( + "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data") + ) + args = parser.parse_args() + + # Read the training data into DataFrames + training_files = [os.path.join(args.train, file) for file in os.listdir(args.train) if file.endswith(".csv")] + print(f"Training Files: {training_files}") + + # Combine files and read them all into a single pandas dataframe + all_df = pd.concat([pd.read_csv(file, engine="python") 
for file in training_files]) + + # Check if the dataframe is empty + check_dataframe(all_df, "training_df") + + # Features/Target output + print(f"Target: {target}") + print(f"Features: {str(feature_list)}") + + # Do we want to train on all the data? + if train_all_data: + print("Training on ALL of the data") + df_train = all_df.copy() + df_val = all_df.copy() + + # Does the dataframe have a training column? + elif "training" in all_df.columns: + print("Found training column, splitting data based on training column") + df_train = all_df[all_df["training"]].copy() + df_val = all_df[~all_df["training"]].copy() + else: + # Just do a random training Split + print("WARNING: No training column found, splitting data with random state=42") + df_train, df_val = train_test_split(all_df, test_size=validation_split, random_state=42) + print(f"FIT/TRAIN: {df_train.shape}") + print(f"VALIDATION: {df_val.shape}") + + # Now spin up our XGB Model + if model_type == "classifier": + xgb_model = xgb.XGBClassifier() + + # Encode the target column + label_encoder = LabelEncoder() + df_train[target] = label_encoder.fit_transform(df_train[target]) + df_val[target] = label_encoder.transform(df_val[target]) + + else: + xgb_model = xgb.XGBRegressor() + label_encoder = None # We don't need this for regression + + # Grab our Features, Target and Train the Model + y = df_train[target] + X = df_train[feature_list] + xgb_model.fit(X, y) + + # Make Predictions on the Validation Set + print(f"Making Predictions on Validation Set...") + preds = xgb_model.predict(df_val[feature_list]) + if model_type == "classifier": + # Also get the probabilities for each class + print("Processing Probabilities...") + probs = xgb_model.predict_proba(df_val[feature_list]) + df_val["pred_proba"] = [p.tolist() for p in probs] + + # Expand the pred_proba column into separate columns for each class + print(df_val.columns) + df_val = expand_proba_column(df_val, label_encoder.classes_) + print(df_val.columns) + + # Decode 
the target and prediction labels + df_val[target] = label_encoder.inverse_transform(df_val[target]) + preds = label_encoder.inverse_transform(preds) + + # Save predictions to S3 (just the target, prediction, and '_proba' columns) + # Note: Skipping this for our test script + """ + df_val["prediction"] = preds + output_columns = [target, "prediction"] + output_columns += [col for col in df_val.columns if col.endswith("_proba")] + wr.s3.to_csv( + df_val[output_columns], + path=f"{model_metrics_s3_path}/validation_predictions.csv", + index=False, + ) + """ + + # Report Performance Metrics + if model_type == "classifier": + # Get the label names and their integer mapping + label_names = label_encoder.classes_ + + # Calculate various model performance metrics + scores = precision_recall_fscore_support(df_val[target], preds, average=None, labels=label_names) + + # Put the scores into a dataframe + score_df = pd.DataFrame( + { + target: label_names, + "precision": scores[0], + "recall": scores[1], + "fscore": scores[2], + "support": scores[3], + } + ) + + # We need to get creative with the Classification Metrics + metrics = ["precision", "recall", "fscore", "support"] + for t in label_names: + for m in metrics: + value = score_df.loc[score_df[target] == t, m].iloc[0] + print(f"Metrics:{t}:{m} {value}") + + # Compute and output the confusion matrix + conf_mtx = confusion_matrix(df_val[target], preds, labels=label_names) + for i, row_name in enumerate(label_names): + for j, col_name in enumerate(label_names): + value = conf_mtx[i, j] + print(f"ConfusionMatrix:{row_name}:{col_name} {value}") + + else: + # Calculate various model performance metrics (regression) + rmse = root_mean_squared_error(df_val[target], preds) + mae = mean_absolute_error(df_val[target], preds) + r2 = r2_score(df_val[target], preds) + print(f"RMSE: {rmse:.3f}") + print(f"MAE: {mae:.3f}") + print(f"R2: {r2:.3f}") + print(f"NumRows: {len(df_val)}") + + # Now save the model to the standard place/name + 
xgb_model.save_model(os.path.join(args.model_dir, "xgb_model.json")) + if label_encoder: + joblib.dump(label_encoder, os.path.join(args.model_dir, "label_encoder.joblib")) + + # Also save the features (this will validate input during predictions) + with open(os.path.join(args.model_dir, "feature_columns.json"), "w") as fp: + json.dump(feature_list, fp) + + +def model_fn(model_dir): + """Deserialized and return fitted model""" + + # Load our XGBoost model from the model directory + model_path = os.path.join(model_dir, "xgb_model.json") + with open(model_path, "r") as f: + model_json = json.load(f) + saved_model_type = json.loads(model_json.get("learner").get("attributes").get("scikit_learn")).get( + "_estimator_type" + ) + if saved_model_type == "classifier": + model = xgb.XGBClassifier() + elif saved_model_type == "regressor": + model = xgb.XGBRegressor() + else: + msg = f"Model type ({saved_model_type}) not recognized. Expected 'classifier' or 'regressor'" + raise ValueError(msg) + + model.load_model(model_path) + return model + + +def input_fn(input_data, content_type): + """Parse input data and return a DataFrame.""" + if not input_data: + raise ValueError("Empty input data is not supported!") + + # Decode bytes to string if necessary + if isinstance(input_data, bytes): + input_data = input_data.decode("utf-8") + + if "text/csv" in content_type: + return pd.read_csv(StringIO(input_data)) + elif "application/json" in content_type: + return pd.DataFrame(json.loads(input_data)) # Assumes JSON array of records + else: + raise ValueError(f"{content_type} not supported!") + + +def output_fn(output_df, accept_type): + """Supports both CSV and JSON output formats.""" + if "text/csv" in accept_type: + csv_output = output_df.fillna("N/A").to_csv(index=False) # CSV with N/A for missing values + return csv_output, "text/csv" + elif "application/json" in accept_type: + return output_df.to_json(orient="records"), "application/json" # JSON array of records (NaNs -> null) + 
else: + raise RuntimeError(f"{accept_type} accept type is not supported by this script.") + + +def predict_fn(df, model) -> pd.DataFrame: + """Make Predictions with our XGB Model + + Args: + df (pd.DataFrame): The input DataFrame + model: The model use for predictions + + Returns: + pd.DataFrame: The DataFrame with the predictions added + """ + + # Grab our feature columns (from training) + model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model") + with open(os.path.join(model_dir, "feature_columns.json")) as fp: + model_features = json.load(fp) + print(f"Model Features: {model_features}") + + # Load our Label Encoder if we have one + label_encoder = None + if os.path.exists(os.path.join(model_dir, "label_encoder.joblib")): + label_encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib")) + + # We're going match features in a case-insensitive manner, accounting for all the permutations + # - Model has a feature list that's any case ("Id", "taCos", "cOunT", "likes_tacos") + # - Incoming data has columns that are mixed case ("ID", "Tacos", "Count", "Likes_Tacos") + matched_df = match_features_case_insensitive(df, model_features) + + # Predict the features against our XGB Model + predictions = model.predict(matched_df[model_features]) + + # If we have a label encoder, decode the predictions + if label_encoder: + predictions = label_encoder.inverse_transform(predictions) + + # Set the predictions on the DataFrame + df["prediction"] = predictions + + # Does our model have a 'predict_proba' method? 
If so we will call it and add the results to the DataFrame + if getattr(model, "predict_proba", None): + probs = model.predict_proba(matched_df[model_features]) + df["pred_proba"] = [p.tolist() for p in probs] + + # Expand the pred_proba column into separate columns for each class + df = expand_proba_column(df, label_encoder.classes_) + + # All done, return the DataFrame with new columns for the predictions + return df diff --git a/model_docker_images/tests/run_tests.sh b/model_docker_images/tests/run_tests.sh new file mode 100644 index 000000000..65cbd1514 --- /dev/null +++ b/model_docker_images/tests/run_tests.sh @@ -0,0 +1,48 @@ +#!/bin/bash +set -e + +# Get the directory of this script +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)" +# Get the project root directory +PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" + +# Configuration +TRAINING_IMAGE="aws_model_training" +INFERENCE_IMAGE="aws_model_inference" +IMAGE_VERSION=${1:-"0.1"} + +# Colors for output +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Test training container +echo "======================================" +echo "๐Ÿงช Testing training container" +echo "======================================" +python "$SCRIPT_DIR/test_training.py" --image "${TRAINING_IMAGE}:${IMAGE_VERSION}" + +# Test inference container +echo "======================================" +echo "๐Ÿงช Testing inference container" +echo "======================================" + +# Start the inference container in the background +echo "Starting inference container..." +CONTAINER_ID=$(docker run -d -p 8080:8080 "${INFERENCE_IMAGE}:${IMAGE_VERSION}") + +# Wait for the container to initialize +echo "Waiting for server to initialize (5 seconds)..." +sleep 5 + +# Run the test +python "$SCRIPT_DIR/test_inference.py" + +# Stop and remove the container +echo "Stopping inference container..." 
class MockModel:
    """Mock SageMaker Model class that simulates the behavior of sagemaker.model.Model

    Instead of creating AWS resources, ``deploy`` starts the inference image as a
    local Docker container and returns a ``MockEndpoint`` bound to this model.
    """

    def __init__(self, image_uri, model_data=None, role=None, **kwargs):
        """
        Initialize a MockModel with parameters similar to a SageMaker Model.

        Args:
            image_uri (str): The Docker image URI to use for inference
            model_data (str): Local path to the model artifacts; ``deploy`` bind-mounts it
                into the container. When None, ``deploy`` creates a dummy model.
            role (str): AWS IAM role (not used in mock)
            **kwargs: Extra keyword arguments, stored unchanged on ``self.kwargs``
        """
        self.image_uri = image_uri
        self.model_data = model_data
        self.role = role
        self.kwargs = kwargs
        self.temp_dir = None  # Set by deploy() when it has to create a dummy model directory
        self.container_id = None  # Docker container ID, set by deploy()
        self.endpoint_url = None  # Base URL of the local endpoint, set by deploy()

    def register(self, content_types=None, response_types=None, **kwargs):
        """Mock model registration - just stores the parameters

        Args:
            content_types (list): Accepted request content types (default: ["application/json"])
            response_types (list): Produced response types (default: ["application/json"])
            **kwargs: Any other registration arguments; each one is set as an attribute

        Returns:
            MockModel: This model (allows call chaining)
        """
        self.content_types = content_types or ["application/json"]
        self.response_types = response_types or ["application/json"]
        for key, value in kwargs.items():
            setattr(self, key, value)
        print(f"Mock registered model with content types: {self.content_types}")
        return self

    def deploy(self, instance_type=None, initial_instance_count=1, endpoint_name=None):
        """
        Deploy the model to a mock endpoint (local Docker container).

        Args:
            instance_type (str): SageMaker instance type (ignored)
            initial_instance_count (int): Number of instances (ignored)
            endpoint_name (str): Endpoint name for identification

        Returns:
            MockEndpoint: The deployed endpoint

        Raises:
            subprocess.CalledProcessError: If ``docker run`` exits with a non-zero status
        """
        # Local import: only needed for the host-architecture check below
        import platform

        print(f"Deploying model to endpoint: {endpoint_name or 'default-endpoint'}")

        # Create a temp directory for model data if not provided
        if self.model_data is None:
            self.temp_dir = tempfile.mkdtemp(prefix="sagemaker-inference-test-")
            model_dir = self.temp_dir

            # Create a dummy model
            print(f"Creating dummy model in {model_dir}")
            import joblib
            import xgboost as xgb

            # Train a simple model
            model = xgb.XGBRegressor(objective="reg:squarederror")
            X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
            y = np.array([10, 20, 30])
            model.fit(X, y)

            # Save the model
            joblib.dump(model, os.path.join(model_dir, "model.joblib"))

            # Save metadata
            with open(os.path.join(model_dir, "metadata.json"), "w") as f:
                json.dump({"feature_names": ["feature1", "feature2", "feature3"], "model_type": "regression"}, f)

            self.model_data = model_dir
        else:
            # Use provided model_data
            model_dir = self.model_data

        # Start the container (detached, auto-removed on stop, model dir mounted at /opt/ml/model)
        cmd = [
            "docker",
            "run",
            "-d",
            "--rm",
            "-p",
            "8080:8080",
            "-v",
            f"{model_dir}:/opt/ml/model",
            "-e",
            "MODEL_PATH=/opt/ml/model",
        ]

        # Add platform flag for Mac M1/M2/M3 users
        # (platform.machine() instead of os.uname(), which does not exist on Windows)
        if platform.machine() == "arm64":
            cmd.insert(2, "--platform")
            cmd.insert(3, "linux/amd64")

        # Add the image URI
        cmd.append(self.image_uri)
        print(f"Starting inference container: {' '.join(cmd)}")
        self.container_id = subprocess.check_output(cmd).decode("utf-8").strip()

        # Best-effort diagnostics: show whatever the container logged during startup.
        # A failure to fetch logs is reported but must not abort the deployment.
        print(f"Container ID: {self.container_id}")
        try:
            # Give it a moment to start or fail
            time.sleep(1)

            # Get container logs
            logs = subprocess.check_output(["docker", "logs", self.container_id], stderr=subprocess.STDOUT).decode(
                "utf-8"
            )
            print(f"Container startup logs:\n{logs}")
        except Exception as e:
            print(f"Error getting container logs: {e}")

        self.endpoint_url = "http://localhost:8080"
        return MockEndpoint(self)
+ + Args: + data: Input data in format matching content_types + initial_args: Additional arguments (ignored) + + Returns: + The prediction result + """ + # Default to first registered content type + content_type = self.model.content_types[0] if hasattr(self.model, "content_types") else "application/json" + + # Format the data according to content type + if content_type == "text/csv": + if isinstance(data, pd.DataFrame): + payload = data.to_csv(header=False, index=False) + elif isinstance(data, (list, np.ndarray)): + payload = pd.DataFrame(data).to_csv(header=False, index=False) + else: + payload = str(data) + else: + # Default to JSON + if isinstance(data, pd.DataFrame): + payload = data.to_json(orient="records") + elif isinstance(data, (list, np.ndarray)): + payload = json.dumps({"instances": data.tolist() if hasattr(data, "tolist") else data}) + else: + payload = json.dumps(data) + + # Send the request to the container + try: + response = requests.post(f"{self.url}/invocations", data=payload, headers={"Content-Type": content_type}) + + # Check for errors + if response.status_code != 200: + raise Exception(f"Prediction failed with status code {response.status_code}: {response.text}") + + # Parse response based on response type + if hasattr(self.model, "response_types") and "text/csv" in self.model.response_types: + # Parse CSV response + return pd.read_csv(StringIO(response.text), header=None) + else: + # Parse JSON response + return response.json() + + except Exception as e: + print(f"Error during prediction: {e}") + raise + + def delete_endpoint(self): + """Clean up resources by stopping the container""" + print(f"Deleting endpoint (stopping container {self.model.container_id})") + if self.model.container_id: + try: + subprocess.run(["docker", "stop", self.model.container_id], check=False) + except Exception as e: + print(f"Error stopping container: {e}") + + # Clean up temp directory if needed + if self.model.temp_dir and os.path.exists(self.model.temp_dir): + 
print(f"Cleaning up temporary directory: {self.model.temp_dir}") + shutil.rmtree(self.model.temp_dir) + self.model.temp_dir = None + + +def test_csv_inference(endpoint, test_data=None): + """Test inference with CSV data""" + print("\nTesting CSV inference...") + + if test_data is None: + # Create sample test data + test_data = pd.DataFrame([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + + try: + response = endpoint.predict(test_data) + print(f"Prediction response: {response}") + print("โœ… CSV inference test successful") + return True + except Exception as e: + print(f"โŒ CSV inference test failed: {e}") + return False + + +def test_json_inference(endpoint, test_data=None): + """Test inference with JSON data""" + print("\nTesting JSON inference...") + + if test_data is None: + # Create sample test data - use list of lists of floats + test_data = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]] + + try: + response = endpoint.predict(test_data) + print(f"Prediction response: {response}") + print("โœ… JSON inference test successful") + return True + except Exception as e: + print(f"โŒ JSON inference test failed: {e}") + return False + + +def test_ping_endpoint(url): + """Test the /ping endpoint directly""" + print("\nTesting /ping endpoint...") + try: + response = requests.get(f"{url}/ping") + print(f"Response status: {response.status_code}") + if response.status_code == 200: + print("โœ… Ping test successful") + return True + else: + print(f"โŒ Ping test failed with status {response.status_code}") + return False + except Exception as e: + print(f"โŒ Ping test error: {e}") + return False + + +def main(): + """Run the test using MockModel and MockEndpoint""" + parser = argparse.ArgumentParser(description="Test SageMaker inference container") + parser.add_argument( + "--image", type=str, default="aws-ml-images/py312-sklearn-xgb-inference:0.1", help="Inference image name:tag" + ) + parser.add_argument("--model-dir", type=str, default=None, help="Path to model directory (optional)") + args = 
class MockEstimator:
    """Mock SageMaker Estimator for local container testing"""

    def __init__(self, image_uri, entry_point=None, source_dir=None, hyperparameters=None, **kwargs):
        """Store the training configuration.

        Args:
            image_uri (str): Docker image (name:tag) of the training container
            entry_point (str): File name of the training script inside ``source_dir``
            source_dir (str): Directory whose ``.py`` files are copied into the container's code dir
            hyperparameters (dict): Extra hyperparameters written to hyperparameters.json
            **kwargs: Accepted for call compatibility; not used
        """
        self.image_uri = image_uri
        self.entry_point = entry_point
        self.source_dir = source_dir
        self.hyperparameters = hyperparameters or {}
        self.temp_dir = None  # Scratch directory laid out like /opt/ml, created by fit()
        self.model_data = None

    def fit(self, inputs, job_name=None, logs=True):
        """Train the model using the input data

        Builds a temporary directory with ``input``, ``model``, ``output`` and ``code``
        sub-directories, then runs the training image with those mounted under /opt/ml.

        Args:
            inputs (dict): Channel name -> path. A file is copied as-is; for a
                directory only its ``.csv`` files are copied.
            job_name (str): Name used in the log output only
            logs (bool): When False the container's stdout/stderr are captured
                instead of being streamed to the console

        Returns:
            MockEstimator: This estimator

        Raises:
            subprocess.CalledProcessError: If the container exits with a non-zero status
        """
        # Local import: only needed for the host-architecture check below
        import platform

        print(f"Starting mock training job: {job_name or 'unnamed-job'}")

        try:
            # Set up SageMaker directory structure
            self.temp_dir = tempfile.mkdtemp(prefix="sagemaker-test-")
            print(f"Created test environment at: {self.temp_dir}")

            # Create directories
            for path in ["input/data/train", "input/config", "model", "output/data", "code"]:
                os.makedirs(f"{self.temp_dir}/{path}", exist_ok=True)

            # Copy data files
            for channel_name, channel_data in inputs.items():
                channel_dir = f"{self.temp_dir}/input/data/{channel_name}"
                os.makedirs(channel_dir, exist_ok=True)

                if os.path.isfile(channel_data):
                    shutil.copy2(channel_data, channel_dir)
                    print(f"Copied data: {os.path.basename(channel_data)} to {channel_name} channel")
                elif os.path.isdir(channel_data):
                    for file in os.listdir(channel_data):
                        if file.endswith(".csv"):
                            shutil.copy2(os.path.join(channel_data, file), channel_dir)

            # Copy source files to code directory
            if self.source_dir and os.path.exists(self.source_dir):
                for file in os.listdir(self.source_dir):
                    if file.endswith(".py"):
                        shutil.copy2(os.path.join(self.source_dir, file), f"{self.temp_dir}/code")
                print("Copied source files to code directory")

            # Create hyperparameters.json (the sagemaker_* keys override same-named user keys)
            all_hyperparams = {
                **self.hyperparameters,
                "sagemaker_program": self.entry_point,
                "sagemaker_submit_directory": "/opt/ml/code",
            }

            with open(f"{self.temp_dir}/input/config/hyperparameters.json", "w") as f:
                json.dump(all_hyperparams, f)

            # Run the container
            cmd = [
                "docker",
                "run",
                "--rm",
                "-v",
                f"{self.temp_dir}/input:/opt/ml/input",
                "-v",
                f"{self.temp_dir}/model:/opt/ml/model",
                "-v",
                f"{self.temp_dir}/output:/opt/ml/output",
                "-v",
                f"{self.temp_dir}/code:/opt/ml/code",
                "-e",
                f"SAGEMAKER_PROGRAM={self.entry_point}",
                "-e",
                "SM_MODEL_DIR=/opt/ml/model",
                "-e",
                "SM_OUTPUT_DATA_DIR=/opt/ml/output/data",
                "-e",
                "SM_CHANNEL_TRAIN=/opt/ml/input/data/train",
                self.image_uri,
            ]

            # Add platform flag for Mac M1/M2/M3 users
            # (platform.machine() instead of os.uname(), which does not exist on Windows)
            if platform.machine() == "arm64":
                cmd.insert(2, "--platform")
                cmd.insert(3, "linux/amd64")

            print("Running training container...")

            start_time = time.time()
            # check=True: a failing container raises CalledProcessError (handled below)
            subprocess.run(cmd, check=True, capture_output=not logs)
            training_time = time.time() - start_time
            print(f"Training completed in {training_time:.2f} seconds")

            # Check output
            model_files = os.listdir(f"{self.temp_dir}/model")
            if model_files:
                print(f"โœ… Model created successfully with files: {', '.join(model_files)}")
            else:
                print("โš ๏ธ No model files were created during training")

            return self

        except subprocess.CalledProcessError as e:
            print(f"โŒ Training failed with exit code {e.returncode}")
            # stdout/stderr are only populated when output was captured (logs=False)
            if e.stdout:
                print(f"STDOUT: {e.stdout.decode('utf-8')}")
            if e.stderr:
                print(f"STDERR: {e.stderr.decode('utf-8')}")
            raise
        except Exception as e:
            print(f"โŒ Error during training: {e}")
            raise

    def cleanup(self):
        """Remove temporary directories"""
        if self.temp_dir and os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)
            self.temp_dir = None
relative paths + script_dir = Path(__file__).parent.absolute() + project_root = script_dir.parent + + source_dir = os.path.join(project_root, args.source_dir) if not os.path.isabs(args.source_dir) else args.source_dir + data_path = os.path.join(project_root, args.data) if not os.path.isabs(args.data) else args.data + + print(f"Testing with image {args.image}, script {args.entry_point}") + + # Create and run the estimator + estimator = MockEstimator(image_uri=args.image, entry_point=args.entry_point, source_dir=source_dir) + + try: + estimator.fit(inputs={"train": data_path}, job_name="mock-training-job") + print("โœ… Training completed successfully") + except Exception as e: + print(f"โŒ Training failed: {e}") + raise + finally: + estimator.cleanup() + + +if __name__ == "__main__": + main() diff --git a/model_docker_images/training/Dockerfile b/model_docker_images/training/Dockerfile new file mode 100644 index 000000000..02f1c96cd --- /dev/null +++ b/model_docker_images/training/Dockerfile @@ -0,0 +1,20 @@ +FROM python:3.12-slim + +# Install Vim +RUN apt-get update && apt-get install -y vim + +# Copy requirements file +COPY requirements.txt /tmp/ + +# Install dependencies +RUN pip install --no-cache-dir -r /tmp/requirements.txt + +# Copy the SageMaker entrypoint script +COPY sagemaker_entrypoint.py /opt/program/ +WORKDIR /opt/program + +# Make the entrypoint executable +RUN chmod +x /opt/program/sagemaker_entrypoint.py + +# Set the entrypoint +ENTRYPOINT ["/opt/program/sagemaker_entrypoint.py"] \ No newline at end of file diff --git a/model_docker_images/training/requirements.txt b/model_docker_images/training/requirements.txt new file mode 100644 index 000000000..b3b7b18dd --- /dev/null +++ b/model_docker_images/training/requirements.txt @@ -0,0 +1,5 @@ +scikit-learn==1.6.1 +xgboost-cpu==2.1.4 +pandas==2.2.3 +awswrangler==3.11.0 +joblib==1.4.2 \ No newline at end of file diff --git a/model_docker_images/training/sagemaker_entrypoint.py 
def download_and_extract_s3(s3_uri, target_dir="/opt/ml/code"):
    """Download and extract code package from S3.

    The archive is downloaded to ``/tmp/code_package.tar.gz`` and unpacked as a
    gzip-compressed tarball into ``target_dir`` (created if missing).

    Args:
        s3_uri (str): URI of the package; the network location is used as the
            bucket and the path (without its leading slash) as the key
        target_dir (str): Directory to extract into

    Returns:
        str: ``target_dir``

    Note:
        Any failure (download or extraction) is logged and terminates the
        process with exit code 1.
    """
    logger.info(f"Downloading source package from {s3_uri}...")
    parsed = urlparse(s3_uri)
    bucket = parsed.netloc
    key = parsed.path.lstrip("/")
    local_tar = "/tmp/code_package.tar.gz"

    try:
        s3 = boto3.client("s3")
        s3.download_file(bucket, key, local_tar)
        logger.info(f"Download successful: {os.path.getsize(local_tar)} bytes")

        os.makedirs(target_dir, exist_ok=True)
        with tarfile.open(local_tar, "r:gz") as tar:
            # filter="data" rejects members that would land outside target_dir
            # (absolute paths, "..", escaping links) and special files; extracting
            # a downloaded archive without a filter is unsafe and deprecated.
            tar.extractall(path=target_dir, numeric_owner=True, filter="data")
        return target_dir
    except Exception as e:
        logger.error(f"Error downloading from S3: {e}")
        sys.exit(1)
def main():
    """Entry point of the training container.

    Steps:
        1. Load ``/opt/ml/input/config/hyperparameters.json``.
        2. Read the training script name (``sagemaker_program``) and the code
           location (``sagemaker_submit_directory``) from it; an ``s3://`` location
           is downloaded and extracted first.
        3. Install the code directory's ``requirements.txt`` when present.
        4. Run the training script in a subprocess with ``--model-dir``,
           ``--output-data-dir`` and ``--train`` arguments.
        5. Copy the code plus inference metadata into the model directory.

    Any missing input or a failing training script is logged and ends the
    process with exit code 1.
    """
    logger.info("Starting Workbench training container...")

    # Load hyperparameters
    hyperparams_path = "/opt/ml/input/config/hyperparameters.json"
    if not os.path.exists(hyperparams_path):
        logger.error("hyperparameters.json not found!")
        sys.exit(1)

    with open(hyperparams_path, "r") as f:
        hyperparams = json.load(f)
    logger.info(f"Hyperparameters: {hyperparams}")

    # Get program name from hyperparameters (values may arrive wrapped in quotes)
    if "sagemaker_program" in hyperparams:
        training_script = hyperparams["sagemaker_program"].strip("\"'")
    else:
        logger.error("sagemaker_program not found in hyperparameters!")
        sys.exit(1)

    logger.info(f"Using training_script: {training_script}")

    # Get source directory from hyperparameters. Fail explicitly when it is missing,
    # mirroring the sagemaker_program check, instead of continuing without a code directory.
    if "sagemaker_submit_directory" in hyperparams:
        code_directory = hyperparams["sagemaker_submit_directory"].strip("\"'")
    else:
        logger.error("sagemaker_submit_directory not found in hyperparameters!")
        sys.exit(1)

    # Handle S3 vs local path
    if code_directory.startswith("s3://"):
        code_directory = download_and_extract_s3(code_directory)
    elif not os.path.exists(code_directory):
        logger.error(f"Local code directory not found: {code_directory}")
        sys.exit(1)

    # Install requirements if present
    install_requirements(os.path.join(code_directory, "requirements.txt"))

    # Find training script
    training_script_path = os.path.join(code_directory, training_script)
    if not os.path.exists(training_script_path):
        logger.error(f"Training script not found: {training_script_path}")
        sys.exit(1)

    logger.info(f"Executing: {training_script_path}")

    # Add the code directory to the Python path (inherited by the training subprocess);
    # only append the existing value when there is one, to avoid a trailing empty entry
    existing_pythonpath = os.environ.get("PYTHONPATH", "")
    if existing_pythonpath:
        os.environ["PYTHONPATH"] = f"{code_directory}{os.pathsep}{existing_pythonpath}"
    else:
        os.environ["PYTHONPATH"] = code_directory

    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")

    # Call the training script
    try:
        subprocess.check_call(
            [
                sys.executable,
                training_script_path,
                "--model-dir",
                model_dir,
                "--output-data-dir",
                os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data"),
                "--train",
                os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train"),
            ]
        )
    except subprocess.CalledProcessError as e:
        logger.error(f"Failed to execute training script: {e}")
        sys.exit(1)

    # After training completes, include code and meta in the model.tar.gz
    include_code_and_meta_for_inference(
        model_dir=model_dir,
        code_dir=code_directory,
        entry_point=training_script,
    )
+ "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-training:0.1" + ), + ("us-east-1", "inference", "0.1"): ( + "507740646243.dkr.ecr.us-east-1.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1" + ), + ("us-west-2", "training", "0.1"): ( + "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-training:0.1" + ), + ("us-west-2", "inference", "0.1"): ( + "507740646243.dkr.ecr.us-west-2.amazonaws.com/aws-ml-images/py312-sklearn-xgb-inference:0.1" + ), + # These are the OLD locked SKLearn images ("us-east-1", "sklearn", "1.2.1"): ( "683313688378.dkr.ecr.us-east-1.amazonaws.com/" "sagemaker-scikit-learn@sha256:ed242e33af079f334972acd2a7ddf74d13310d3c9a0ef3a0e9b0429ccc104dcd" @@ -58,13 +71,13 @@ class InferenceImage: } @classmethod - def get_image_uri(cls, region, framework, version): - key = (region, framework, version) + def get_image_uri(cls, region, image_type="training", version="0.1"): + key = (region, image_type, version) if key in cls.image_uris: return cls.image_uris[key] else: raise ValueError( - f"No matching image found for region: {region}, framework: {framework}, version: {version}" + f"No matching image found for region: {region}, image_type: {image_type}, version: {version}" ) diff --git a/src/workbench/core/transforms/features_to_model/features_to_model.py b/src/workbench/core/transforms/features_to_model/features_to_model.py index 0fdc1c64d..e5916994f 100644 --- a/src/workbench/core/transforms/features_to_model/features_to_model.py +++ b/src/workbench/core/transforms/features_to_model/features_to_model.py @@ -1,14 +1,14 @@ """FeaturesToModel: Train/Create a Model from a Feature Set""" from pathlib import Path -from sagemaker.sklearn.estimator import SKLearn +from sagemaker.estimator import Estimator import awswrangler as wr from datetime import datetime, timezone # Local Imports from workbench.core.transforms.transform import Transform, TransformInput, TransformOutput from 
workbench.core.artifacts.feature_set_core import FeatureSetCore -from workbench.core.artifacts.model_core import ModelCore, ModelType, InferenceImage +from workbench.core.artifacts.model_core import ModelCore, ModelType, ModelImages from workbench.core.artifacts.artifact import Artifact from workbench.model_scripts.script_generation import generate_model_script from workbench.utils.model_utils import supported_instance_types @@ -111,6 +111,7 @@ def transform_impl( all_columns = feature_set.columns filter_list = [ "id", + "auto_id", "__index_level_0__", "write_time", "api_invocation_time", @@ -208,14 +209,14 @@ def transform_impl( source_dir = str(Path(script_path).parent) # Create a Sagemaker Model with our script - image = InferenceImage.get_image_uri(self.sm_session.boto_region_name, "sklearn", "1.2.1") - self.estimator = SKLearn( + image = ModelImages.get_image_uri(self.sm_session.boto_region_name, "training", "0.1") + self.estimator = Estimator( entry_point=entry_point, source_dir=source_dir, role=self.workbench_role_arn, + instance_count=1, instance_type="ml.m5.large", sagemaker_session=self.sm_session, - framework_version="1.2-1", image_uri=image, metric_definitions=metric_definitions, ) @@ -268,12 +269,11 @@ def create_and_register_model(self): ) # Register our model - image = InferenceImage.get_image_uri(self.sm_session.boto_region_name, "sklearn", "1.2.1") + image = ModelImages.get_image_uri(self.sm_session.boto_region_name, "inference", "0.1") self.log.important(f"Registering model {self.output_uuid} with image {image}...") model = self.estimator.create_model(role=self.workbench_role_arn) model.register( model_package_group_name=self.output_uuid, - framework_version="1.2.1", image_uri=image, content_types=["text/csv"], response_types=["text/csv"], diff --git a/src/workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py b/src/workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py index c71e81934..8a6c248b5 100644 --- 
a/src/workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +++ b/src/workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py @@ -22,10 +22,13 @@ # and save the model artifacts to the model directory. # if __name__ == "__main__": + # Script arguments for input/output directories parser = argparse.ArgumentParser() - parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) - parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) - parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) + parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model")) + parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train")) + parser.add_argument( + "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data") + ) args = parser.parse_args() # This model doesn't get trained, it just a feature creation 'model' diff --git a/src/workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py b/src/workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py index 4fede9442..a3889715a 100644 --- a/src/workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +++ b/src/workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py @@ -24,10 +24,13 @@ # and save the model artifacts to the model directory. 
# if __name__ == "__main__": + # Script arguments for input/output directories parser = argparse.ArgumentParser() - parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) - parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) - parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) + parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model")) + parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train")) + parser.add_argument( + "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data") + ) args = parser.parse_args() # This model doesn't get trained, it just a feature creation 'model' diff --git a/src/workbench/model_scripts/custom_models/chem_info/requirements.txt b/src/workbench/model_scripts/custom_models/chem_info/requirements.txt index 33ff11c23..68cb66c0f 100644 --- a/src/workbench/model_scripts/custom_models/chem_info/requirements.txt +++ b/src/workbench/model_scripts/custom_models/chem_info/requirements.txt @@ -1,4 +1,2 @@ -scikit-learn==1.3.2 # Note: This is the highest version that works with SageMaker/scikit-learn framework -awswrangler>=3.8.0 -rdkit>=2024.3.2 +rdkit>=2024.9.5 mordredcommunity>=2.0.6 \ No newline at end of file diff --git a/src/workbench/model_scripts/custom_models/chem_info/tautomerize.py b/src/workbench/model_scripts/custom_models/chem_info/tautomerize.py index 72c2afe34..16e479a61 100644 --- a/src/workbench/model_scripts/custom_models/chem_info/tautomerize.py +++ b/src/workbench/model_scripts/custom_models/chem_info/tautomerize.py @@ -23,10 +23,13 @@ # This section (__main__) is where SageMaker will execute the job and save the model artifacts. 
# if __name__ == "__main__": + # Script arguments for input/output directories parser = argparse.ArgumentParser() - parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) - parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) - parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) + parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model")) + parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train")) + parser.add_argument( + "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data") + ) args = parser.parse_args() # This model doesn't get trained; it's a feature processing 'model' diff --git a/src/workbench/model_scripts/custom_script_example/custom_model_script.py b/src/workbench/model_scripts/custom_script_example/custom_model_script.py index 3e2a8db0a..c36d4ff15 100644 --- a/src/workbench/model_scripts/custom_script_example/custom_model_script.py +++ b/src/workbench/model_scripts/custom_script_example/custom_model_script.py @@ -48,10 +48,13 @@ def expand_proba_column(df: pd.DataFrame, class_labels: list) -> pd.DataFrame: # and save the model artifacts to the model directory. 
# if __name__ == "__main__": + # Script arguments for input/output directories parser = argparse.ArgumentParser() - parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) - parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) - parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) + parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model")) + parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train")) + parser.add_argument( + "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data") + ) args = parser.parse_args() # Load the training data @@ -144,7 +147,7 @@ def output_fn(output_df, accept_type): # Prediction function def predict_fn(df, model): - model_dir = os.environ["SM_MODEL_DIR"] + model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model") with open(os.path.join(model_dir, "feature_columns.json")) as fp: model_features = json.load(fp) diff --git a/src/workbench/model_scripts/custom_script_example/requirements.txt b/src/workbench/model_scripts/custom_script_example/requirements.txt index 2b1dd27fd..e69de29bb 100644 --- a/src/workbench/model_scripts/custom_script_example/requirements.txt +++ b/src/workbench/model_scripts/custom_script_example/requirements.txt @@ -1,2 +0,0 @@ -scikit-learn==1.3.2 # Note: This is the highest version that works with SageMaker/scikit-learn framework -awswrangler>=3.8.0 diff --git a/src/workbench/model_scripts/light_quant_regression/quant_regression.template b/src/workbench/model_scripts/light_quant_regression/quant_regression.template index 8ea2a6e6d..f638c5f75 100644 --- a/src/workbench/model_scripts/light_quant_regression/quant_regression.template +++ b/src/workbench/model_scripts/light_quant_regression/quant_regression.template @@ -86,13 +86,13 @@ if __name__ == "__main__": quantiles = [0.05, 0.25, 0.50, 
0.75, 0.95] q_models = {} - # Sagemaker specific arguments. Defaults are set in the environment variables. + # Script arguments for input/output directories parser = argparse.ArgumentParser() + parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model")) + parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train")) parser.add_argument( - "--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"] + "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data") ) - parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) - parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) args = parser.parse_args() # Read the training data into DataFrames @@ -280,7 +280,7 @@ def predict_fn(df, models) -> pd.DataFrame: """ # Grab our feature columns (from training) - model_dir = os.environ["SM_MODEL_DIR"] + model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model") with open(os.path.join(model_dir, "feature_columns.json")) as fp: model_features = json.load(fp) print(f"Model Features: {model_features}") diff --git a/src/workbench/model_scripts/light_quant_regression/requirements.txt b/src/workbench/model_scripts/light_quant_regression/requirements.txt index 25a034855..e69de29bb 100644 --- a/src/workbench/model_scripts/light_quant_regression/requirements.txt +++ b/src/workbench/model_scripts/light_quant_regression/requirements.txt @@ -1,2 +0,0 @@ -xgboost==2.0.3 -awswrangler==3.8.0 \ No newline at end of file diff --git a/src/workbench/model_scripts/light_scikit_learn/requirements.txt b/src/workbench/model_scripts/light_scikit_learn/requirements.txt index 2a1bb2a2a..cf1b0394e 100644 --- a/src/workbench/model_scripts/light_scikit_learn/requirements.txt +++ b/src/workbench/model_scripts/light_scikit_learn/requirements.txt @@ -1,3 +1 @@ -scikit-learn==1.3.2 # Note: This is the highest version 
that works with SageMaker/scikit-learn framework -umap-learn -awswrangler>=3.8.0 \ No newline at end of file +umap-learn \ No newline at end of file diff --git a/src/workbench/model_scripts/light_scikit_learn/scikit_learn.template b/src/workbench/model_scripts/light_scikit_learn/scikit_learn.template index e98f752ce..f79565947 100644 --- a/src/workbench/model_scripts/light_scikit_learn/scikit_learn.template +++ b/src/workbench/model_scripts/light_scikit_learn/scikit_learn.template @@ -89,11 +89,13 @@ if __name__ == "__main__": train_all_data = TEMPLATE_PARAMS["train_all_data"] validation_split = 0.2 - # SageMaker arguments for input/output directories + # Script arguments for input/output directories parser = argparse.ArgumentParser() - parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) - parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) - parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) + parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model")) + parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train")) + parser.add_argument( + "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data") + ) args = parser.parse_args() # Load training data from the specified directory @@ -242,7 +244,7 @@ def output_fn(output_df, accept_type): def predict_fn(df, model): """Make predictions or apply transformations using the model and return the DataFrame with results.""" - model_dir = os.environ["SM_MODEL_DIR"] + model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model") # Load feature columns from the saved file with open(os.path.join(model_dir, "feature_columns.json")) as fp: diff --git a/src/workbench/model_scripts/light_xgb_model/requirements.txt b/src/workbench/model_scripts/light_xgb_model/requirements.txt index 25a034855..7ff58e74d 
100644 --- a/src/workbench/model_scripts/light_xgb_model/requirements.txt +++ b/src/workbench/model_scripts/light_xgb_model/requirements.txt @@ -1,2 +1,3 @@ -xgboost==2.0.3 -awswrangler==3.8.0 \ No newline at end of file +xgboost-cpu==2.1.4 +pandas==2.2.3 +awswrangler==3.11.0 \ No newline at end of file diff --git a/src/workbench/model_scripts/light_xgb_model/xgb_model.template b/src/workbench/model_scripts/light_xgb_model/xgb_model.template index a534b2164..f02fca231 100644 --- a/src/workbench/model_scripts/light_xgb_model/xgb_model.template +++ b/src/workbench/model_scripts/light_xgb_model/xgb_model.template @@ -15,7 +15,7 @@ import awswrangler as wr from sklearn.metrics import ( mean_absolute_error, r2_score, - mean_squared_error, + root_mean_squared_error, precision_recall_fscore_support, confusion_matrix, ) @@ -131,15 +131,16 @@ if __name__ == "__main__": train_all_data = TEMPLATE_PARAMS["train_all_data"] validation_split = 0.2 - # Sagemaker specific arguments. Defaults are set in the environment variables. 
+ # Script arguments for input/output directories parser = argparse.ArgumentParser() + parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model")) + parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train")) parser.add_argument( - "--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"] + "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "/opt/ml/output/data") ) - parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) - parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"]) args = parser.parse_args() + # Read the training data into DataFrames training_files = [ os.path.join(args.train, file) @@ -261,7 +262,7 @@ if __name__ == "__main__": else: # Calculate various model performance metrics (regression) - rmse = mean_squared_error(df_val[target], preds, squared=False) + rmse = root_mean_squared_error(df_val[target], preds) mae = mean_absolute_error(df_val[target], preds) r2 = r2_score(df_val[target], preds) print(f"RMSE: {rmse:.3f}") @@ -339,7 +340,7 @@ def predict_fn(df, model) -> pd.DataFrame: """ # Grab our feature columns (from training) - model_dir = os.environ["SM_MODEL_DIR"] + model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model") with open(os.path.join(model_dir, "feature_columns.json")) as fp: model_features = json.load(fp) print(f"Model Features: {model_features}") diff --git a/tests/specific/capital_tests.py b/tests/specific/capital_tests.py index fcd64a87f..4ac939f74 100644 --- a/tests/specific/capital_tests.py +++ b/tests/specific/capital_tests.py @@ -6,8 +6,7 @@ @pytest.mark.long def test(): # Create a new Data Source from an S3 Path (or a local file) - source_path = "s3://workbench-public-data/common/aBaLone.CSV" - # source_path = "/full/path/to/local/file.csv" + source_path = "s3://workbench-public-data/common/abalone.csv" my_data = DataSource(source_path) 
pprint(my_data.summary()) pprint(my_data.details())