diff --git a/docker/dummy.ubuntu.amd.Dockerfile b/docker/dummy.ubuntu.amd.Dockerfile index ee2bf72..546511e 100644 --- a/docker/dummy.ubuntu.amd.Dockerfile +++ b/docker/dummy.ubuntu.amd.Dockerfile @@ -1,3 +1,3 @@ # CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} -ARG BASE_DOCKER=rocm/pytorch +ARG BASE_DOCKER=docker.io/rocm/pytorch FROM $BASE_DOCKER diff --git a/docker/jax_maxtext.ubuntu.amd.Dockerfile b/docker/jax_maxtext.ubuntu.amd.Dockerfile index f7d14d4..646a824 100644 --- a/docker/jax_maxtext.ubuntu.amd.Dockerfile +++ b/docker/jax_maxtext.ubuntu.amd.Dockerfile @@ -21,7 +21,7 @@ # SOFTWARE. # ################################################################################# -ARG BASE_DOCKER=rocm/jax-training:maxtext-v25.7-jax060 +ARG BASE_DOCKER=docker.io/rocm/jax-training:maxtext-v25.7-jax060 FROM $BASE_DOCKER USER root diff --git a/docker/megatron_train.ubuntu.amd.Dockerfile b/docker/megatron_train.ubuntu.amd.Dockerfile index 5a5852b..ef2a4a5 100644 --- a/docker/megatron_train.ubuntu.amd.Dockerfile +++ b/docker/megatron_train.ubuntu.amd.Dockerfile @@ -24,7 +24,7 @@ # SOFTWARE. # ################################################################################# -ARG BASE_DOCKER=rocm/megatron-lm:v25.8_py310 +ARG BASE_DOCKER=docker.io/rocm/megatron-lm:v25.7_py310 FROM $BASE_DOCKER USER root diff --git a/docker/primus_megatron_train.ubuntu.amd.Dockerfile b/docker/primus_megatron_train.ubuntu.amd.Dockerfile index 5a5852b..ef2a4a5 100644 --- a/docker/primus_megatron_train.ubuntu.amd.Dockerfile +++ b/docker/primus_megatron_train.ubuntu.amd.Dockerfile @@ -24,7 +24,7 @@ # SOFTWARE.
# ################################################################################# -ARG BASE_DOCKER=rocm/megatron-lm:v25.8_py310 +ARG BASE_DOCKER=docker.io/rocm/megatron-lm:v25.7_py310 FROM $BASE_DOCKER USER root diff --git a/docker/primus_pytorch_train.ubuntu.amd.Dockerfile b/docker/primus_pytorch_train.ubuntu.amd.Dockerfile index c416510..8b67d48 100644 --- a/docker/primus_pytorch_train.ubuntu.amd.Dockerfile +++ b/docker/primus_pytorch_train.ubuntu.amd.Dockerfile @@ -24,7 +24,7 @@ # SOFTWARE. # ################################################################################# -ARG BASE_DOCKER=rocm/pytorch-training:v25.8 +ARG BASE_DOCKER=docker.io/rocm/pytorch-training:v25.8 FROM $BASE_DOCKER USER root diff --git a/docker/pyt_chai1_inference.ubuntu.amd.Dockerfile b/docker/pyt_chai1_inference.ubuntu.amd.Dockerfile index d4a6782..53efbdf 100644 --- a/docker/pyt_chai1_inference.ubuntu.amd.Dockerfile +++ b/docker/pyt_chai1_inference.ubuntu.amd.Dockerfile @@ -24,7 +24,7 @@ # SOFTWARE. # ################################################################################# -ARG BASE_DOCKER=rocm/pytorch:latest +ARG BASE_DOCKER=docker.io/rocm/pytorch:latest FROM $BASE_DOCKER USER root ENV WORKSPACE_DIR=/workspace diff --git a/docker/pyt_clip_inference.ubuntu.amd.Dockerfile b/docker/pyt_clip_inference.ubuntu.amd.Dockerfile index 0b53231..a82f26d 100644 --- a/docker/pyt_clip_inference.ubuntu.amd.Dockerfile +++ b/docker/pyt_clip_inference.ubuntu.amd.Dockerfile @@ -24,7 +24,7 @@ # SOFTWARE. # ################################################################################# -ARG BASE_DOCKER=rocm/pytorch:latest +ARG BASE_DOCKER=docker.io/rocm/pytorch:latest FROM $BASE_DOCKER USER root diff --git a/docker/pyt_huggingface.ubuntu.amd.Dockerfile b/docker/pyt_huggingface.ubuntu.amd.Dockerfile index 3ab9eaa..19f1b74 100644 --- a/docker/pyt_huggingface.ubuntu.amd.Dockerfile +++ b/docker/pyt_huggingface.ubuntu.amd.Dockerfile @@ -24,7 +24,7 @@ # SOFTWARE. 
# ################################################################################# -ARG BASE_DOCKER=rocm/pytorch:latest +ARG BASE_DOCKER=docker.io/rocm/pytorch:latest FROM $BASE_DOCKER USER root @@ -63,7 +63,9 @@ RUN apt-get install -y netcat-traditional RUN apt-get install -y locales RUN locale-gen en_US.UTF-8 +# the model complains about numpy version, it requires <2.0.0 +RUN pip uninstall -y numpy +RUN pip install numpy==1.26.4 + # record configuration for posterity RUN pip3 list - - diff --git a/docker/pyt_hy_video.ubuntu.amd.Dockerfile b/docker/pyt_hy_video.ubuntu.amd.Dockerfile index 062fc9f..703a3f5 100644 --- a/docker/pyt_hy_video.ubuntu.amd.Dockerfile +++ b/docker/pyt_hy_video.ubuntu.amd.Dockerfile @@ -1,5 +1,5 @@ # CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} -ARG BASE_DOCKER=rocm/pytorch:latest +ARG BASE_DOCKER=docker.io/rocm/pytorch:latest FROM $BASE_DOCKER ARG work_dir=/hunyuanvideo diff --git a/docker/pyt_janus_pro_inference.ubuntu.amd.Dockerfile b/docker/pyt_janus_pro_inference.ubuntu.amd.Dockerfile index 0a9b6c8..639ad94 100644 --- a/docker/pyt_janus_pro_inference.ubuntu.amd.Dockerfile +++ b/docker/pyt_janus_pro_inference.ubuntu.amd.Dockerfile @@ -24,7 +24,7 @@ # SOFTWARE. # ################################################################################# -ARG BASE_DOCKER=rocm/pytorch:latest +ARG BASE_DOCKER=docker.io/rocm/pytorch:latest FROM $BASE_DOCKER USER root diff --git a/docker/pyt_mochi_inference.ubuntu.amd.Dockerfile b/docker/pyt_mochi_inference.ubuntu.amd.Dockerfile index 4cbbd1e..912e9ee 100644 --- a/docker/pyt_mochi_inference.ubuntu.amd.Dockerfile +++ b/docker/pyt_mochi_inference.ubuntu.amd.Dockerfile @@ -24,7 +24,7 @@ # SOFTWARE. 
# ################################################################################# -ARG BASE_DOCKER=rocm/pytorch:latest +ARG BASE_DOCKER=docker.io/rocm/pytorch:latest FROM $BASE_DOCKER USER root diff --git a/docker/pyt_mpt30b_training.ubuntu.amd.Dockerfile b/docker/pyt_mpt30b_training.ubuntu.amd.Dockerfile index a4d1f88..26dacbc 100644 --- a/docker/pyt_mpt30b_training.ubuntu.amd.Dockerfile +++ b/docker/pyt_mpt30b_training.ubuntu.amd.Dockerfile @@ -1,6 +1,6 @@ # CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} # PyTorch 2.7.0a0+git3a58512 -ARG BASE_DOCKER=rocm/pytorch-training:v25.5 +ARG BASE_DOCKER=docker.io/rocm/pytorch-training:v25.5 FROM $BASE_DOCKER WORKDIR /workspace diff --git a/docker/pyt_ncf_training.ubuntu.amd.Dockerfile b/docker/pyt_ncf_training.ubuntu.amd.Dockerfile index 96aa74e..f65b232 100644 --- a/docker/pyt_ncf_training.ubuntu.amd.Dockerfile +++ b/docker/pyt_ncf_training.ubuntu.amd.Dockerfile @@ -1,5 +1,5 @@ # CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} -ARG BASE_DOCKER=rocm/pytorch-training:v25.5 +ARG BASE_DOCKER=docker.io/rocm/pytorch-training:v25.5 FROM $BASE_DOCKER diff --git a/docker/pyt_sglang.ubuntu.amd.Dockerfile b/docker/pyt_sglang.ubuntu.amd.Dockerfile index 9fdce6c..1497815 100644 --- a/docker/pyt_sglang.ubuntu.amd.Dockerfile +++ b/docker/pyt_sglang.ubuntu.amd.Dockerfile @@ -24,7 +24,7 @@ # SOFTWARE. 
# ################################################################################# -ARG BASE_DOCKER=lmsysorg/sglang:v0.4.5-rocm630 +ARG BASE_DOCKER=docker.io/lmsysorg/sglang:v0.4.5-rocm630 FROM $BASE_DOCKER diff --git a/docker/pyt_training_huggingface.ubuntu.amd.Dockerfile b/docker/pyt_training_huggingface.ubuntu.amd.Dockerfile index 4cd0248..f69672e 100644 --- a/docker/pyt_training_huggingface.ubuntu.amd.Dockerfile +++ b/docker/pyt_training_huggingface.ubuntu.amd.Dockerfile @@ -1,5 +1,5 @@ # CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} -ARG BASE_DOCKER=rocm/pytorch-training:v25.5 +ARG BASE_DOCKER=docker.io/rocm/pytorch-training:v25.5 FROM $BASE_DOCKER USER root diff --git a/docker/pyt_vllm.ubuntu.amd.Dockerfile b/docker/pyt_vllm.ubuntu.amd.Dockerfile index 4f5d573..d21c29e 100644 --- a/docker/pyt_vllm.ubuntu.amd.Dockerfile +++ b/docker/pyt_vllm.ubuntu.amd.Dockerfile @@ -24,7 +24,7 @@ # SOFTWARE. # ################################################################################# -ARG BASE_DOCKER=rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909 +ARG BASE_DOCKER=docker.io/rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909 FROM $BASE_DOCKER USER root diff --git a/docker/pyt_wan2.1_inference.ubuntu.amd.Dockerfile b/docker/pyt_wan2.1_inference.ubuntu.amd.Dockerfile index 55e0504..3c62148 100644 --- a/docker/pyt_wan2.1_inference.ubuntu.amd.Dockerfile +++ b/docker/pyt_wan2.1_inference.ubuntu.amd.Dockerfile @@ -25,7 +25,7 @@ # ################################################################################# -ARG BASE_DOCKER=rocm/pytorch:latest +ARG BASE_DOCKER=docker.io/rocm/pytorch:latest FROM $BASE_DOCKER USER root ENV WORKSPACE_DIR=/workspace @@ -89,4 +89,3 @@ RUN cd $WORKSPACE_DIR \ # Display installed packages for verification RUN pip list - diff --git a/docker/pytorch_train.ubuntu.amd.Dockerfile b/docker/pytorch_train.ubuntu.amd.Dockerfile index c416510..a519155 100644 --- a/docker/pytorch_train.ubuntu.amd.Dockerfile +++ b/docker/pytorch_train.ubuntu.amd.Dockerfile @@ -24,7 
+24,7 @@ # SOFTWARE. # ################################################################################# -ARG BASE_DOCKER=rocm/pytorch-training:v25.8 +ARG BASE_DOCKER=docker.io/rocm/pytorch-training:v25.7 FROM $BASE_DOCKER USER root diff --git a/docker/sglang_disagg_inference.ubuntu.amd.Dockerfile b/docker/sglang_disagg_inference.ubuntu.amd.Dockerfile index 40681a0..9247579 100644 --- a/docker/sglang_disagg_inference.ubuntu.amd.Dockerfile +++ b/docker/sglang_disagg_inference.ubuntu.amd.Dockerfile @@ -24,7 +24,7 @@ # SOFTWARE. # ################################################################################# -ARG BASE_DOCKER=lmsysorg/sglang:v0.5.2rc1-rocm700-mi30x +ARG BASE_DOCKER=docker.io/lmsysorg/sglang:v0.5.2rc1-rocm700-mi30x FROM $BASE_DOCKER ARG GPU_ARCH=gfx942 diff --git a/requirements.txt b/requirements.txt index bedfd83..6233d1a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -git+https://github.com/ROCm/madengine.git@main +git+https://github.com/danpetreamd/madengine.git@aac_tweaks diff --git a/tools/run_models.py b/tools/run_models.py index 0bbd852..d5d4811 100644 --- a/tools/run_models.py +++ b/tools/run_models.py @@ -26,8 +26,8 @@ ################################################################################# """MAD: Model Automation and Dashboarding -The script builds the Docker image, runs the Docker container, executes training or inference of the LLMs on the container, -and logs the performance metrics. +The script builds the Docker image, runs the Docker container, executes training or inference of the LLMs on the container, +and logs the performance metrics. The script takes the following arguments: --tags: tags to run model. 
@@ -109,6 +109,7 @@ from utils import get_base_docker, get_base_docker_sha from utils import get_perf_metric, update_dict from utils import update_perf_csv +from utils import get_amdsmi_path, get_nvidiasmi_path from utils import Console, Docker, Timeout, RunDetails from version import __version__ from logger import get_logger @@ -180,12 +181,12 @@ def run_model( console: Console ) -> bool: """Run the model application. - + Args: model_info (dict): The model information args (argparse.Namespace): The input arguments console (Console): The console object - + Returns: bool: The status of the run (return code: True for success, False for failure) """ @@ -195,7 +196,7 @@ def run_model( keep_alive = args.keep_alive keep_model_dir = args.keep_model_dir log_level = args.log_level - output = args.output + output = args.output log_file = f"logs/{model_name}.live.log" # Check the log file exist in the directory or not, if not then create the log file, if exist then empty the log file. @@ -225,7 +226,7 @@ def run_model( run_details.machine_name = get_host_name() run_details.host_os = get_host_os() run_details.gpu_architecture = get_system_gpu_arch() - run_details.n_gpus = get_system_gpus() + run_details.n_gpus = get_system_gpus() run_details.pipeline = os.environ.get('pipeline') # Parse the model dictionary @@ -337,7 +338,7 @@ def run_model( docker_opts += get_env_docker_args(run_envs) docker_opts += get_gpu_docker_args() - # docker_opts += get_cpu_docker_args() + # docker_opts += get_cpu_docker_args() mount_data_paths = [] docker_opts += get_mount_docker_args(mount_data_paths) @@ -352,8 +353,8 @@ def run_model( ) docker = Docker( - image=model_docker_image, - container_name=model_docker_container, + image=model_docker_image, + container_name=model_docker_container, docker_opts=docker_opts, keep_alive=keep_alive, console=console @@ -363,9 +364,11 @@ def run_model( # Echo GPU information if re.search("nvidia", dockerfile_gpu_suffix): - docker.sh('/usr/bin/nvidia-smi || true') + 
nvidiasmi_path = get_nvidiasmi_path() + docker.sh(f'{nvidiasmi_path} || true') elif re.search("amd", dockerfile_gpu_suffix): - docker.sh('/opt/rocm/bin/rocm-smi || true') + amdsmi_path = get_amdsmi_path() + docker.sh(f'{amdsmi_path} || true') else: logger.error("No GPU information available") raise ValueError("Unknown GPU type") @@ -388,7 +391,7 @@ def run_model( # echo git commit run_details.git_commit = docker.sh(f"cd {model_dir} && git rev-parse HEAD") logger.info(f"MODEL GIT COMMIT is {run_details.git_commit}") - + if model_url: docker.sh(f"cd {model_dir} && git submodule update --init --recursive") @@ -433,7 +436,7 @@ def run_model( # Clean up the instance of docker del docker sys.exit(1) - + test_duration = time.time() - test_start_time logger.info(f"Test duration: {test_duration} seconds") @@ -473,9 +476,9 @@ def run_model( run_details.performance = multiple_results run_details.generate_json("common_info.json", multiple_results=True) update_perf_csv( - multiple_results=model["multiple_results"], - perf_csv=output, - model_name=run_details.model, + multiple_results=model["multiple_results"], + perf_csv=output, + model_name=run_details.model, common_info="common_info.json" ) else: @@ -496,24 +499,24 @@ def run_model( else: run_details.generate_json("perf_entry.json") update_perf_csv(exception_result="perf_entry.json", perf_csv=output) - + except Exception as e: logger.error(f"Failed to write the run details to the output file: {e}") # Clean up the instance of docker del docker - return_code = True if run_details.status == 'SUCCESS' else False + return_code = True if run_details.status == 'SUCCESS' else False return return_code def main() -> bool: """Main function to run the MAD application. 
- + Returns: bool: The status of the run (return code: True for success, False for failure) - + Raises: ValueError: If the GPU type is unknown """ diff --git a/tools/utils.py b/tools/utils.py index 733a231..7f0e436 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -67,6 +67,7 @@ import re import collections.abc import pandas as pd +import shutil from logger import get_logger @@ -548,6 +549,85 @@ def subprocess_run(cmd: List[str]): return subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) +def get_cmd(cmd, known_paths): + ''' + A function to get the full path to the command. + + Args: + cmd (str): command name. + known_paths (list): list of known paths to search for the command. + + Returns: + full path to the command if found, else throws an exception. + ''' + + cmd_path = shutil.which(cmd) + if cmd_path is not None: + return cmd_path + + for path in known_paths: + if not os.path.isdir(path): + continue + + cmd_path = os.path.join(path, cmd) + if os.path.isfile(cmd_path) and os.access(cmd_path, os.X_OK): + return cmd_path + + # throw exception if command not found. + raise FileNotFoundError(f'{cmd} not found.') + + +def get_rocminfo_path(): + """Get the rocminfo command. + + Returns: + str: The absolute path to rocminfo. + """ + rocm_path = os.environ.get("ROCM_PATH", "/opt/rocm") + known_paths = [os.path.join(rocm_path, "bin")] + + return get_cmd("rocminfo", known_paths) + + +def get_rocmsmi_path(): + """Get the rocm-smi command. + + Returns: + str: The absolute path to rocm-smi. + """ + rocm_path = os.environ.get("ROCM_PATH", "/opt/rocm") + known_paths = [os.path.join(rocm_path, "bin")] + + return get_cmd("rocm-smi", known_paths) + + +def get_amdsmi_path(): + """Get the amd-smi command. + + Returns: + str: The absolute path to amd-smi. 
+ """ + rocm_path = os.environ.get("ROCM_PATH", "/opt/rocm") + known_paths = [os.path.join(rocm_path, "bin")] + + return get_cmd("amd-smi", known_paths) + + +def get_nvidiasmi_path(): + """Get the nvidia-smi command. + + Returns: + str: The absolute path to nvidia-smi. + """ + cuda_path = os.environ.get("CUDA_PATH", "/usr/local/cuda") + known_paths = [ + "/usr/bin", + "/usr/local/bin", + os.path.join(cuda_path, "bin") + ] + + return get_cmd("nvidia-smi", known_paths) + def get_gpu_vendor() -> str: """Get the GPU vendor. @@ -562,11 +642,13 @@ def get_gpu_vendor() -> str: ERRORS = (FileNotFoundError, subprocess.CalledProcessError) try: - _ = subprocess_run(["/usr/bin/nvidia-smi"]) + nvidiasmi_path = get_nvidiasmi_path() + _ = subprocess_run([f"{nvidiasmi_path}"]) except ERRORS as e1: + amdsmi_path = get_amdsmi_path() try: - _ = subprocess_run(["/opt/rocm/bin/rocm-smi"]) + _ = subprocess_run([f"{amdsmi_path}"]) except ERRORS as e2: raise Exception("Unsupported GPU: Neither AMD nor NVIDIA") @@ -671,9 +753,10 @@ def get_system_gpus() -> int: ) ) elif gpu_vendor == "AMD": + amdsmi_path = get_amdsmi_path() number_gpus = int( subprocess.check_output( - "rocm-smi --showid --csv | grep card | wc -l", shell=True + f"{amdsmi_path} --showid --csv | grep card | wc -l", shell=True ) ) else: @@ -765,9 +848,10 @@ def get_system_gpu_arch() -> str: else: raise Exception(f"Failed to get GPU architecture of NVIDIA: {gpu_name}") elif gpu_vendor == "AMD": + rocminfo_path = get_rocminfo_path() gpu_arch = ( subprocess.check_output( - "/opt/rocm/bin/rocminfo |grep -o -m 1 'gfx.*'", shell=True + f"{rocminfo_path} | grep -o -m 1 'gfx.*'", shell=True ) .decode("utf-8") .strip() @@ -1265,7 +1349,7 @@ def update_perf_csv( # Read the perf.csv perf_csv_df = df_strip_columns(pd.read_csv(perf_csv)) logger.info(perf_csv_df) - + # Handle the results if multiple_results: perf_csv_df = handle_multiple_results(