From 4be6cb000f99158bd4b14e34250d0b75c5765b64 Mon Sep 17 00:00:00 2001
From: Jerome Anand <88475913+jerome-habana@users.noreply.github.com>
Date: Tue, 27 Aug 2024 12:40:42 +0530
Subject: [PATCH] Re-enable FSDP example (#227)

* Re-enable FSDP example

Signed-off-by: Jerome <janand@habana.ai>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .azure/hpu-tests.yml               | 19 ++++++++++++-------
 examples/pytorch/language_model.py | 13 ++++++++++---
 2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml
index 56128cd1..174bed85 100644
--- a/.azure/hpu-tests.yml
+++ b/.azure/hpu-tests.yml
@@ -160,6 +160,8 @@ jobs:
           tests/test_pytorch/test_accelerator.py \
           tests/test_pytorch/test_compile.py \
           tests/test_pytorch/test_profiler.py
+      # Workaround to mitigate a tenancy issue on G1 for cards 0,1,2,3: run only when modules 4,5 or 6,7 are visible
+      condition: or(eq(variables['HABANA_VISIBLE_MODULES'], '4,5'), eq(variables['HABANA_VISIBLE_MODULES'], '6,7'))
       displayName: 'Multi card(2) HPU test'
 
     # - bash: |
@@ -169,14 +171,16 @@ jobs:
     #     PT_HPU_LAZY_MODE: 0
     #   displayName: 'FSDP PT Multi card(2) HPU test'
 
-    # - bash: |
-    #     bash tests/run_standalone_tests.sh --hpus 2 -f \
-    #       tests/test_fabric/test_fsdp.py
-    #   env:
-    #     PT_HPU_LAZY_MODE: 0
-    #   displayName: 'FSDP Fabric Multi card(2) HPU test'
+    - bash: |
+        bash tests/run_standalone_tests.sh --hpus 2 -f \
+          tests/test_fabric/test_fsdp.py
+      env:
+        PT_HPU_LAZY_MODE: 0
+      condition: or(eq(variables['HABANA_VISIBLE_MODULES'], '4,5'), eq(variables['HABANA_VISIBLE_MODULES'], '6,7'))
+      displayName: 'FSDP Fabric Multi card(2) HPU test'
 
     - bash: pip install ".[examples]"
+      condition: or(eq(variables['HABANA_VISIBLE_MODULES'], '4,5'), eq(variables['HABANA_VISIBLE_MODULES'], '6,7'))
       displayName: 'Install extra for examples'
 
     - bash: |
@@ -185,8 +189,9 @@ jobs:
         python pytorch/hpu_graphs.py -v train --mode capture_and_replay make_graphed_callables modulecacher
         python pytorch/hpu_graphs.py -v inference --mode capture_and_replay wrap_in_hpu_graph
         python pytorch/hpu_graphs.py -v dynamicity --mode dynamic_control_flow dynamic_ops
-    #    PT_HPU_LAZY_MODE=0 python pytorch/language_model.py -s SHARD_GRAD_OP -d 2
+        PT_HPU_LAZY_MODE=0 python pytorch/language_model.py -s SHARD_GRAD_OP -d 2
       workingDirectory: examples/
+      condition: or(eq(variables['HABANA_VISIBLE_MODULES'], '4,5'), eq(variables['HABANA_VISIBLE_MODULES'], '6,7'))
       displayName: 'Testing HPU examples'
 
     - task: PublishTestResults@2
diff --git a/examples/pytorch/language_model.py b/examples/pytorch/language_model.py
index ae16a171..04b89f29 100644
--- a/examples/pytorch/language_model.py
+++ b/examples/pytorch/language_model.py
@@ -101,7 +101,7 @@ def parse_args():
 
     policy = {nn.TransformerEncoderLayer, nn.TransformerDecoderLayer}
     dataset = WikiText2()
-    train_dataloader = DataLoader(dataset)
+    train_dataloader = DataLoader(dataset, num_workers=2)
 
     if options.strategy == "DDP":
         model = LanguageModel(vocab_size=dataset.vocab_size)
@@ -120,7 +120,6 @@ def parse_args():
             f"Peak Memory alloc using DDP strategy on HPU: {htorch.hpu.max_memory_allocated() / (1024**3)} GB"
         )
     else:
-        htorch.hpu.reset_peak_memory_stats()
         model = LanguageModel(vocab_size=dataset.vocab_size)
         _strategy = HPUFSDPStrategy(
             parallel_devices=[torch.device("hpu")] * options.devices,
@@ -129,9 +128,17 @@ def parse_args():
             precision_plugin=HPUFSDPPrecision("bf16-mixed"),
         )
 
-        trainer = Trainer(accelerator=HPUAccelerator(), strategy=_strategy, fast_dev_run=1, enable_model_summary=True)
+        trainer = Trainer(
+            accelerator=HPUAccelerator(),
+            devices=options.devices,
+            strategy=_strategy,
+            fast_dev_run=1,
+            enable_model_summary=True,
+        )
         trainer.fit(model, train_dataloader)
         rank_zero_info(
             f"Peak Memory alloc using FSDP {options.strategy} strategy "
             f" on HPU: {htorch.hpu.max_memory_allocated() / (1024**3)} GB"
         )
+
+    htorch.hpu.reset_peak_memory_stats()