From 4be6cb000f99158bd4b14e34250d0b75c5765b64 Mon Sep 17 00:00:00 2001 From: Jerome Anand <88475913+jerome-habana@users.noreply.github.com> Date: Tue, 27 Aug 2024 12:40:42 +0530 Subject: [PATCH] Enable back fsdp example (#227) * Enable back fsdp example Signed-off-by: Jerome Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .azure/hpu-tests.yml | 19 ++++++++++++------- examples/pytorch/language_model.py | 13 ++++++++++--- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 56128cd1..174bed85 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -160,6 +160,8 @@ jobs: tests/test_pytorch/test_accelerator.py \ tests/test_pytorch/test_compile.py \ tests/test_pytorch/test_profiler.py + # work around to mitigate tenancy issue in G1 for cards 0,1,2,3 + condition: or(eq(variables['HABANA_VISIBLE_MODULES'], '4,5'), eq(variables['HABANA_VISIBLE_MODULES'], '6,7')) displayName: 'Multi card(2) HPU test' # - bash: | @@ -169,14 +171,16 @@ jobs: # PT_HPU_LAZY_MODE: 0 # displayName: 'FSDP PT Multi card(2) HPU test' - # - bash: | - # bash tests/run_standalone_tests.sh --hpus 2 -f \ - # tests/test_fabric/test_fsdp.py - # env: - # PT_HPU_LAZY_MODE: 0 - # displayName: 'FSDP Fabric Multi card(2) HPU test' + - bash: | + bash tests/run_standalone_tests.sh --hpus 2 -f \ + tests/test_fabric/test_fsdp.py + env: + PT_HPU_LAZY_MODE: 0 + condition: or(eq(variables['HABANA_VISIBLE_MODULES'], '4,5'), eq(variables['HABANA_VISIBLE_MODULES'], '6,7')) + displayName: 'FSDP Fabric Multi card(2) HPU test' - bash: pip install ".[examples]" + condition: or(eq(variables['HABANA_VISIBLE_MODULES'], '4,5'), eq(variables['HABANA_VISIBLE_MODULES'], '6,7')) displayName: 'Install extra for examples' - bash: | @@ -185,8 +189,9 @@ jobs: python pytorch/hpu_graphs.py -v train --mode capture_and_replay make_graphed_callables modulecacher python pytorch/hpu_graphs.py -v inference --mode capture_and_replay wrap_in_hpu_graph python pytorch/hpu_graphs.py -v dynamicity --mode dynamic_control_flow dynamic_ops - # PT_HPU_LAZY_MODE=0 python pytorch/language_model.py -s SHARD_GRAD_OP -d 2 + PT_HPU_LAZY_MODE=0 python pytorch/language_model.py -s SHARD_GRAD_OP -d 2 workingDirectory: examples/ + condition: or(eq(variables['HABANA_VISIBLE_MODULES'], '4,5'), eq(variables['HABANA_VISIBLE_MODULES'], '6,7')) displayName: 'Testing HPU examples' - task: PublishTestResults@2 diff --git a/examples/pytorch/language_model.py b/examples/pytorch/language_model.py index ae16a171..04b89f29 100644 --- a/examples/pytorch/language_model.py +++ b/examples/pytorch/language_model.py @@ -101,7 +101,7 @@ def parse_args(): policy = {nn.TransformerEncoderLayer, nn.TransformerDecoderLayer} dataset = WikiText2() - train_dataloader = DataLoader(dataset) + train_dataloader = DataLoader(dataset, num_workers=2) if options.strategy == "DDP": model = LanguageModel(vocab_size=dataset.vocab_size) @@ -120,7 +120,6 @@ def parse_args(): f"Peak Memory alloc using DDP strategy on HPU: {htorch.hpu.max_memory_allocated() / (1024**3)} GB" ) else: - htorch.hpu.reset_peak_memory_stats() model = LanguageModel(vocab_size=dataset.vocab_size) _strategy = HPUFSDPStrategy( parallel_devices=[torch.device("hpu")] * options.devices, @@ -129,9 +128,17 @@ def parse_args(): precision_plugin=HPUFSDPPrecision("bf16-mixed"), ) - trainer = Trainer(accelerator=HPUAccelerator(), strategy=_strategy, fast_dev_run=1, enable_model_summary=True) + trainer = Trainer( + accelerator=HPUAccelerator(), + devices=options.devices, + strategy=_strategy, + fast_dev_run=1, + enable_model_summary=True, + ) trainer.fit(model, train_dataloader) rank_zero_info( f"Peak Memory alloc using FSDP {options.strategy} strategy " f" on HPU: {htorch.hpu.max_memory_allocated() / (1024**3)} GB" ) + + htorch.hpu.reset_peak_memory_stats()