Skip to content

Commit

Permalink
Merge branch 'ko3n1g/ci/ci-output' into 'main'
Browse files (browse the repository at this point in the history)
ci: Better output

See merge request ADLR/megatron-lm!2571
  • Loading branch information
ko3n1g committed Jan 18, 2025
2 parents c7bf403 + 57c392b commit 7ba0d6d
Show file tree
Hide file tree
Showing 6 changed files with 22 additions and 35 deletions.
2 changes: 1 addition & 1 deletion Dockerfile.ci.dev
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,6 @@ FROM main as jet
ARG CACHEBUST=0
RUN --mount=type=secret,id=JET_INDEX_URLS \
JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \
pip install "jet-client<2.0.0" jet-api --upgrade $JET_INDEX_URLS
pip install jet-client jet-api --upgrade $JET_INDEX_URLS
ENV PATH="$PATH:/opt/jet/bin"
###
2 changes: 1 addition & 1 deletion Dockerfile.ci.lts
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,6 @@ FROM main as jet
ARG CACHEBUST=0
RUN --mount=type=secret,id=JET_INDEX_URLS \
JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \
pip install jet-api "jet-client<2.0.0" --upgrade $JET_INDEX_URLS
pip install jet-client jet-api --upgrade $JET_INDEX_URLS
ENV PATH="$PATH:/opt/jet/bin"
###
2 changes: 1 addition & 1 deletion Dockerfile.linting
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,6 @@ FROM main as jet
ARG CACHEBUST=0
RUN --mount=type=secret,id=JET_INDEX_URLS \
JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \
pip install "jet-client<2.0.0" jet-api --upgrade $JET_INDEX_URLS
pip install jet-client jet-api --upgrade $JET_INDEX_URLS
ENV PATH="$PATH:/opt/jet/bin"
###
14 changes: 1 addition & 13 deletions tests/functional_tests/shell_test_utils/_run_training.sh
Original file line number Diff line number Diff line change
Expand Up @@ -102,20 +102,8 @@ DISTRIBUTED_ARGS=(
--node_rank $SLURM_NODEID
--log-dir $OUTPUT_PATH
--tee "0:3"
--redirects "3"
)

# Start training
set -e
EXIT_CODE=0
torchrun ${DISTRIBUTED_ARGS[@]} $TRAINING_SCRIPT_PATH $PARAMS || EXIT_CODE=$?
echo $EXIT_CODE
set +e

find "$OUTPUT_PATH" -type f \( -name "stdout.log" -o -name "stderr.log" \) | while read -r file; do
rank_dir=$(basename "$(dirname "$file")")
mv "$file" "$(dirname "$file")/repeat$REPEAT-run$RUN_NUMBER-rank${rank_dir}-$(basename "$file")"
done

if [[ $EXIT_CODE -ne 0 ]]; then
exit $EXIT_CODE
fi
24 changes: 18 additions & 6 deletions tests/test_utils/python_scripts/launch_jet_workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from tests.test_utils.python_scripts import common

BASE_PATH = pathlib.Path(__file__).parent.resolve()

GITLAB_PREFIX = "assets/basic/tests-unit-tests-data-{environment}-{tag}/"

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -125,11 +125,18 @@ def download_job_assets(logs: List[jet_log.JETLog], iteration: int = 0) -> List[
assets = log.get_assets()
assets_path = assets_base_path / f"restart={restart_idx}"
assets_path.mkdir(parents=True, exist_ok=True)
for log_filename in assets.keys():
with open(assets_path / log_filename, "w") as fh:
for asset in assets:
(assets_path / asset.source_path.removeprefix(GITLAB_PREFIX)).parent.mkdir(
parents=True, exist_ok=True
)
with open(assets_path / asset.source_path.removeprefix(GITLAB_PREFIX), "w") as fh:
dest = pathlib.Path(fh.name)
logger.info("Downloading log %s to %s", log_filename, str(dest))
assets[log_filename].download(dest)
logger.info(
"Downloading log %s to %s",
asset.source_path.removeprefix(GITLAB_PREFIX),
str(dest),
)
asset.download(dest)
return assets


Expand All @@ -139,7 +146,9 @@ def extract_logs_to_string(logs: List[jet_log.JETLog]) -> List[str]:
return [""]

with tempfile.NamedTemporaryFile() as tmp_file:
logs[-1].get_assets()["output_script-0.log"].download(pathlib.Path(tmp_file.name))
assets = logs[-1].get_assets()
asset = [asset for asset in assets if asset.name == "output_script-0.log"][0]
asset.download(pathlib.Path(tmp_file.name))
with open(pathlib.Path(tmp_file.name), "r") as fh:
return fh.readlines()

Expand Down Expand Up @@ -211,6 +220,9 @@ def main(
logging.basicConfig(level=logging.INFO)
logger.info('Started')

global GITLAB_PREFIX
GITLAB_PREFIX = GITLAB_PREFIX.format(environment=environment, tag=tag)

model_config_path = pathlib.Path(
BASE_PATH
/ ".."
Expand Down
13 changes: 0 additions & 13 deletions tests/test_utils/recipes/unit-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,21 +80,8 @@ spec:
for i in $(seq $UNIT_TEST_REPEAT); do
CMD=$(echo torchrun ${{DISTRIBUTED_ARGS[@]}} -m pytest -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail ${{IGNORE_ARGS[@]}} -m "'${{MARKER_ARG}}'" $BUCKET)
set -e
EXIT_CODE=0
eval "$CMD" || EXIT_CODE=$?
echo $EXIT_CODE
set +e
if [[ $EXIT_CODE -ne 0 ]]; then
break
fi
done
find "{assets_dir}" -type f \( -name "stdout.log" -o -name "stderr.log" \) | while read -r file; do
rank_dir=$(basename "$(dirname "$file")")
mv "$file" "$(dirname "$file")/${{rank_dir}}-$(basename "$file")"
done
products:
Expand Down

0 comments on commit 7ba0d6d

Please sign in to comment.