Skip to content

[Feature] Support Chatqna one click deployment #1989

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
315 changes: 315 additions & 0 deletions ChatQnA/deployment/check_env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,315 @@
#!/bin/bash
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Script's own directory
SCRIPT_DIR_ENV=$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
# Source utility functions
source "${SCRIPT_DIR_ENV}/utils.sh" # Assuming utils.sh is in the same directory

# =================== CONFIG ===================
REQUIRED_CMDS=("git" "docker" "curl" "vim" "nproc") # Added nproc for CPU check
THRESHOLD_GB=5
CHATQNA_MIN_CPU=96
CHATQNA_MIN_HPU=8
# LOG_FILE is now managed by utils.sh, can be overridden here if needed:
# LOG_FILE="check_env.log" # Specific log file for this script

# ================ ARG PARSING =================
DEVICE="" # Standardized to DEVICE
DEPLOYMENT_MODE="online" # Default to online, can be overridden

while [[ $# -gt 0 ]]; do
case "$1" in
--log-file)
LOG_FILE="$2" # Allow overriding utils.sh default
shift 2
;;
--device) # Changed from --xeon/--gaudi to --device xeon/gaudi
DEVICE="$2"
if [[ "$DEVICE" != "xeon" && "$DEVICE" != "gaudi" ]]; then
log ERROR "Invalid --device: $DEVICE. Must be 'xeon' or 'gaudi'."
exit 1
fi
shift 2
;;
--mode) # For online/offline package installation
DEPLOYMENT_MODE="$2"
if [[ "$DEPLOYMENT_MODE" != "online" && "$DEPLOYMENT_MODE" != "offline" ]]; then
log ERROR "--mode must be either 'online' or 'offline'."
exit 1
fi
shift 2
;;
*)
log WARN "Unknown argument: $1"
shift
;;
esac
done

if [[ -z "$DEVICE" ]]; then
log ERROR "Usage: $0 --device [xeon|gaudi] [--mode online|offline] [--log-file <filename>]"
exit 1
fi

log INFO "Starting environment check for DEVICE: $DEVICE, MODE: $DEPLOYMENT_MODE"

# ================ DRIVER CHECKS =============
check_habana_modules() {
log INFO "Checking Habana kernel modules..."
local missing=0
local required_modules=("habanalabs" "habanalabs_cn" "habanalabs_ib" "habanalabs_en")
for module in "${required_modules[@]}"; do
if ! lsmod | grep -q "^${module}\s"; then
log ERROR "Module missing: ${module}"
missing=$((missing + 1))
else
log OK "Module loaded: ${module}"
fi
done
if [ $missing -gt 0 ]; then
log ERROR "$missing required Habana modules missing. Please ensure Habana drivers are correctly installed."
return 1 # Changed to return 1 for main to catch
else
log OK "All required Habana modules loaded."
return 0
fi
}

check_habana_sw() {
log INFO "Listing installed Habana packages..."
if ! apt list --installed 2>/dev/null | grep habana | tee -a "$LOG_FILE"; then
log WARN "No Habana packages found via 'apt list'. This might be normal if installed differently, or an issue."
fi
}

hl_qual_test() {
log INFO "Starting Gaudi Function Tests (hl_qual)..."
# Check if hl_qual exists
if ! command_exists /opt/habanalabs/qual/gaudi2/bin/hl_qual; then
log ERROR "hl_qual command not found at /opt/habanalabs/qual/gaudi2/bin/hl_qual. Skipping test."
return 1
fi
export HABANA_LOGS="/tmp/hl_qual.log"
# Temporarily cd for test, ensure we cd back
local current_dir=$(pwd)
cd /opt/habanalabs/qual/gaudi2/bin || { log ERROR "Failed to cd to hl_qual directory."; return 1; }

# Run test, capture output and exit code
# Reduced verbosity for general log, full output to $LOG_FILE
# Consider making '-l extreme -t 2' configurable or less aggressive for a quick check
output=$(./hl_qual -gaudi2 -c all -rmod parallel -f2 -l extreme -t 2 2>&1)
exit_code=$?
cd "$current_dir" # Go back to original directory

echo "$output" >> "$LOG_FILE" # Log full output

if [ $exit_code -ne 0 ]; then
log ERROR "hl_qual test failed (exit code: $exit_code). Check $LOG_FILE for details."
return 1
fi

# More robust parsing of PASSED
if echo "$output" | grep -q "hl qual report" && echo "$output" | grep -q "PASSED"; then
log OK "hl_qual test PASSED."
else
log ERROR "hl_qual test FAILED (did not detect 'PASSED' after 'hl qual report'). Check $LOG_FILE for details."
return 1
fi
return 0
}

check_hl_cards() {
log INFO "Checking Habana HPU cards via hl-smi..."
if ! command_exists hl-smi; then
log ERROR "hl-smi command not found. Cannot check HPU cards."
return 1
fi
local total_cards available_cards
# Ensure hl-smi output is parsable, handle errors
if ! hl_smi_output=$(hl-smi); then
log ERROR "hl-smi command failed to execute properly."
return 1
fi

total_cards=$(echo "$hl_smi_output" | grep -E '^\| +[0-9]+ +HL-' | wc -l)
# Assuming 'N/A' in a specific column indicates available for use (this might need verification based on actual hl-smi output for "available")
# A more robust check might be needed depending on what "available" truly means in context of hl-smi
available_cards=$(echo "$hl_smi_output" | grep -E '^\| +[0-9]+ +N/A' | wc -l) # This interpretation of "available" might need review

log INFO "Total HL-SMI cards detected: $total_cards"
log INFO "Cards reported as 'N/A' (interpreted as available): $available_cards"

if [ "$available_cards" -lt "$CHATQNA_MIN_HPU" ]; then
log ERROR "Available HPU cards ($available_cards) is less than required minimum ($CHATQNA_MIN_HPU)."
return 1
else
log OK "Sufficient HPU cards available ($available_cards >= $CHATQNA_MIN_HPU)."
fi
return 0
}

get_cpu_info() {
log INFO "Checking CPU information..."
local cpu_model total_cores
cpu_model=$(grep -m1 'model name' /proc/cpuinfo | cut -d ':' -f2 | sed 's/^[[:space:]]*//')

# Using nproc --all for logical cores, or lscpu for physical if preferred.
if command_exists nproc; then
total_cores=$(nproc --all)
elif command_exists lscpu; then # Fallback if nproc is not available
total_cores=$(lscpu | grep '^CPU(s):' | awk '{print $2}')
else
log WARN "Cannot determine CPU core count (nproc and lscpu not found). Skipping CPU core check."
return 0 # Or return 1 if this is critical
fi

log INFO "CPU Model: $cpu_model"
log INFO "Total logical CPU cores: $total_cores"

if [ "$total_cores" -lt "$CHATQNA_MIN_CPU" ]; then
log ERROR "Detected CPU cores ($total_cores) is less than required minimum ($CHATQNA_MIN_CPU)."
return 1
else
log OK "CPU core count ($total_cores) is sufficient."
fi

log INFO "Attempting to set CPU governor to performance mode..."
if echo "performance" | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor > /dev/null 2>&1; then
log OK "CPU governor set to performance mode for all cores."
else
log WARN "Failed to set CPU governor to performance. This might require sudo privileges or a different path."
fi
return 0
}

# ================ SOFTWARE CHECKS ===============

install_docker() {
subsection_header "Installing Docker"

# install docker
if ! sudo apt install -y docker.io &>> "$LOG_FILE"; then
log ERROR "Failed to install Docker."
return 1
fi

sudo chmod 666 /var/run/docker.sock

# install docker compose
DOCKER_CONFIG=${DOCKER_CONFIG:-$HOME/.docker}
mkdir -p $DOCKER_CONFIG/cli-plugins
curl -SL https://github.com/docker/compose/releases/download/v2.36.0/docker-compose-linux-x86_64 -o $DOCKER_CONFIG/cli-plugins/docker-compose
chmod +x $DOCKER_CONFIG/cli-plugins/docker-compose

if docker compose version &>> "$LOG_FILE"; then
log OK "Docker and Docker Compose successfully installed."
else
log ERROR "Docker Compose installation failed."
return 1
fi

# set docker proxy
sudo mkdir -p /etc/systemd/system/docker.service.d
HTTP_PROXY=${http_proxy:-}
HTTPS_PROXY=${https_proxy:-}
NO_PROXY=${no_proxy:-}

sudo tee /etc/systemd/system/docker.service.d/http-proxy.conf > /dev/null <<EOF
[Service]
Environment="HTTP_PROXY=${http_proxy}" "HTTPS_PROXY=${https_proxy}" "NO_PROXY=${no_proxy}"
EOF

# restart docker
sudo systemctl daemon-reexec
sudo systemctl daemon-reload
sudo systemctl restart docker

log OK "Docker restarted successfully."
log INFO "Docker proxy:"
log INFO "HTTP_PROXY=$HTTP_PROXY"
log INFO "HTTPS_PROXY=$HTTPS_PROXY"
log INFO "NO_PROXY=$NO_PROXY"
}

check_required_commands() {
local all_ready=true
for cmd in "${REQUIRED_CMDS[@]}"; do
if ! command -v "$cmd" &> /dev/null; then
log ERROR "$cmd is not installed, try to install it..."

if [[ "$cmd" == "docker" ]]; then
log INFO "Installing Docker and Docker Compose..."
if ! install_docker; then
all_ready=false
continue
fi
else
log INFO "Attempting to install $cmd..."
apt-get update -qq
apt-get install -y "$cmd" &>> "$LOG_FILE"

if command -v "$cmd" &> /dev/null; then
log OK "$cmd successfully installed."
else
log ERROR "Failed to install $cmd automatically."
all_ready=false
fi
fi
else
log OK "$cmd is available."
fi
done
$all_ready || return 1
}

# ================ DISK SPACE CHECKS ===============
check_disk_space() {
log INFO "Checking available disk space on / ..."
local available_gb
# df -BG / might not be portable, -P is more POSIX compliant, awk for calculation
available_bytes=$(df -P / | awk 'NR==2 {print $4}')
available_gb=$((available_bytes / 1024 / 1024)) # KiloBytes to GigaBytes

log INFO "Available disk space: ${available_gb}GB"
if (( available_gb < THRESHOLD_GB )); then
log ERROR "Not enough disk space. Required: ${THRESHOLD_GB}GB, Available: ${available_gb}GB."
return 1
else
log OK "Disk space is sufficient (${available_gb}GB)."
fi
return 0
}

# ================ MAIN ========================
main() {
local checks_failed=0

section_header "HARDWARE & DRIVER ENVIRONMENT CHECK"
if [[ "$DEVICE" == "xeon" ]]; then
get_cpu_info || checks_failed=$((checks_failed + 1))
elif [[ "$DEVICE" == "gaudi" ]]; then
# For Gaudi, CPU info is also relevant
get_cpu_info || checks_failed=$((checks_failed + 1))
check_habana_modules || checks_failed=$((checks_failed + 1))
check_habana_sw # This is informational, not typically a fail condition
check_hl_cards || checks_failed=$((checks_failed + 1))
hl_qual_test || checks_failed=$((checks_failed + 1))
fi

section_header "SOFTWARE DEPENDENCIES CHECK"
check_required_commands || checks_failed=$((checks_failed + 1))

section_header "DISK SPACE CHECK"
check_disk_space || checks_failed=$((checks_failed + 1))

if [ $checks_failed -gt 0 ]; then
log ERROR "Environment check failed with $checks_failed error(s). Please review the logs."
exit 1
else
log OK "All environment checks passed. System should be ready for deployment."
fi
}

main
Loading
Loading