diff --git a/.githooks/pre-commit b/.githooks/pre-commit
new file mode 100644
index 0000000000..159b85a818
--- /dev/null
+++ b/.githooks/pre-commit
@@ -0,0 +1,41 @@
+#!/bin/sh
+# Pre-commit hook: validate S3 bucket references in staged files.
+#
+# Each staged file with a relevant extension is passed to
+# s3_bucket_validator.py; the commit is rejected if any of them reports an
+# invalid or inaccessible bucket.
+
+set -e
+
+# Staged files (added, copied, modified, renamed) with a relevant extension.
+# "|| true" keeps "set -e" from aborting when grep matches nothing.
+staged_files=$(git diff --cached --name-only --diff-filter=ACMR | grep -E '\.(py|ipynb|md|rst|yaml|yml|json)$' || true)
+
+if [ -z "$staged_files" ]; then
+    echo "No relevant files to check for S3 bucket references."
+    exit 0
+fi
+
+echo "Checking S3 bucket references in staged files..."
+has_invalid_buckets=0
+
+# Read one path per line so file names containing spaces stay intact.
+while IFS= read -r file; do
+    echo "Validating S3 references in $file"
+    # Test the command directly: under "set -e" a bare failing command would
+    # abort the hook before the remaining files are checked and before the
+    # summary error below is printed.
+    if ! python s3_bucket_validator.py "$file"; then
+        has_invalid_buckets=1
+    fi
+done <<EOF
+$staged_files
+EOF
+
+if [ $has_invalid_buckets -ne 0 ]; then
+    echo "ERROR: Invalid S3 bucket references found. Please fix them before committing."
+    exit 1
+fi
+
+echo "S3 bucket validation passed."
+exit 0
diff --git a/s3_bucket_validator.py b/s3_bucket_validator.py
new file mode 100644
index 0000000000..4d86294096
--- /dev/null
+++ b/s3_bucket_validator.py
@@ -0,0 +1,92 @@
+"""Validate that S3 buckets referenced in a file exist and are accessible.
+
+Usage: python s3_bucket_validator.py FILE_PATH
+"""
+from __future__ import absolute_import
+
+import re
+import sys
+
+import boto3
+from botocore.exceptions import BotoCoreError, ClientError
+
+# Captures the bucket name that follows an "s3://" scheme prefix.
+S3_URI_PATTERN = re.compile(r"s3:\/\/([a-zA-Z0-9._-]+)")
+
+
+def is_bucket_accessible(bucket_name, s3_client=None):
+    """Return True if a HeadBucket call succeeds for ``bucket_name``.
+
+    Args:
+        bucket_name: Name of the bucket to probe.
+        s3_client: Optional boto3 S3 client to reuse; a new client is
+            created when omitted.
+
+    Returns:
+        True when the call succeeds, False otherwise. The reason for a
+        failure is printed to stdout.
+    """
+    s3 = s3_client if s3_client is not None else boto3.client("s3")
+    try:
+        s3.head_bucket(Bucket=bucket_name)
+    except ClientError as e:
+        # The error code is a string and is not guaranteed to be numeric, so
+        # compare as text instead of calling int(), which could raise.
+        error_code = str(e.response.get("Error", {}).get("Code", ""))
+        if error_code == "403":
+            print(f"Bucket {bucket_name} exists, but you don't have permission to access it.")
+        elif error_code == "404":
+            print(f"Bucket {bucket_name} does not exist.")
+        else:
+            print(f"Error checking bucket {bucket_name}: {e}")
+        return False
+    except BotoCoreError as e:
+        # Client-side failures (invalid bucket name, missing credentials,
+        # unreachable endpoint) are raised before any S3 response exists.
+        print(f"Error checking bucket {bucket_name}: {e}")
+        return False
+    return True
+
+
+def validate_s3_references(file_path):
+    """Return the buckets referenced in ``file_path`` that fail the access check.
+
+    Args:
+        file_path: Path of the text file to scan for "s3://" references.
+
+    Returns:
+        List of distinct bucket names, in order of first appearance, for
+        which is_bucket_accessible() returned False.
+    """
+    # The pattern only matches ASCII characters, so replacing undecodable
+    # bytes elsewhere in the file cannot corrupt a bucket name.
+    with open(file_path, "r", encoding="utf-8", errors="replace") as file:
+        content = file.read()
+
+    # Probe each distinct bucket once, keeping first-seen order.
+    buckets = list(dict.fromkeys(S3_URI_PATTERN.findall(content)))
+    if not buckets:
+        # Nothing to check, so no S3 client (or credentials) is needed.
+        return []
+
+    s3 = boto3.client("s3")
+    return [bucket for bucket in buckets if not is_bucket_accessible(bucket, s3)]
+
+
+def main(argv):
+    """Validate the file named by ``argv[1]`` and return the process exit code."""
+    if len(argv) < 2:
+        print("Usage: python s3_bucket_validator.py FILE_PATH")
+        return 1
+
+    invalid_buckets = validate_s3_references(argv[1])
+    if invalid_buckets:
+        print(f"Invalid or inaccessible S3 buckets found: {', '.join(invalid_buckets)}")
+        return 1
+
+    print("All referenced S3 buckets are valid and accessible.")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))