Skip to content

Commit

Permalink
Enable back fsdp example (#227)
Browse files: browse the repository at this point in the history
* Enable back fsdp example

Signed-off-by: Jerome <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
jerome-habana and pre-commit-ci[bot] authored Aug 27, 2024
1 parent d324ab9 commit 4be6cb0
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 10 deletions.
19 changes: 12 additions & 7 deletions .azure/hpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,8 @@ jobs:
tests/test_pytorch/test_accelerator.py \
tests/test_pytorch/test_compile.py \
tests/test_pytorch/test_profiler.py
# Workaround to mitigate the tenancy issue on G1 for cards 0,1,2,3
condition: or(eq(variables['HABANA_VISIBLE_MODULES'], '4,5'), eq(variables['HABANA_VISIBLE_MODULES'], '6,7'))
displayName: 'Multi card(2) HPU test'
# - bash: |
Expand All @@ -169,14 +171,16 @@ jobs:
# PT_HPU_LAZY_MODE: 0
# displayName: 'FSDP PT Multi card(2) HPU test'

# - bash: |
# bash tests/run_standalone_tests.sh --hpus 2 -f \
# tests/test_fabric/test_fsdp.py
# env:
# PT_HPU_LAZY_MODE: 0
# displayName: 'FSDP Fabric Multi card(2) HPU test'
- bash: |
bash tests/run_standalone_tests.sh --hpus 2 -f \
tests/test_fabric/test_fsdp.py
env:
PT_HPU_LAZY_MODE: 0
condition: or(eq(variables['HABANA_VISIBLE_MODULES'], '4,5'), eq(variables['HABANA_VISIBLE_MODULES'], '6,7'))
displayName: 'FSDP Fabric Multi card(2) HPU test'
- bash: pip install ".[examples]"
condition: or(eq(variables['HABANA_VISIBLE_MODULES'], '4,5'), eq(variables['HABANA_VISIBLE_MODULES'], '6,7'))
displayName: 'Install extra for examples'

- bash: |
Expand All @@ -185,8 +189,9 @@ jobs:
python pytorch/hpu_graphs.py -v train --mode capture_and_replay make_graphed_callables modulecacher
python pytorch/hpu_graphs.py -v inference --mode capture_and_replay wrap_in_hpu_graph
python pytorch/hpu_graphs.py -v dynamicity --mode dynamic_control_flow dynamic_ops
# PT_HPU_LAZY_MODE=0 python pytorch/language_model.py -s SHARD_GRAD_OP -d 2
PT_HPU_LAZY_MODE=0 python pytorch/language_model.py -s SHARD_GRAD_OP -d 2
workingDirectory: examples/
condition: or(eq(variables['HABANA_VISIBLE_MODULES'], '4,5'), eq(variables['HABANA_VISIBLE_MODULES'], '6,7'))
displayName: 'Testing HPU examples'
- task: PublishTestResults@2
Expand Down
13 changes: 10 additions & 3 deletions examples/pytorch/language_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def parse_args():

policy = {nn.TransformerEncoderLayer, nn.TransformerDecoderLayer}
dataset = WikiText2()
train_dataloader = DataLoader(dataset)
train_dataloader = DataLoader(dataset, num_workers=2)

if options.strategy == "DDP":
model = LanguageModel(vocab_size=dataset.vocab_size)
Expand All @@ -120,7 +120,6 @@ def parse_args():
f"Peak Memory alloc using DDP strategy on HPU: {htorch.hpu.max_memory_allocated() / (1024**3)} GB"
)
else:
htorch.hpu.reset_peak_memory_stats()
model = LanguageModel(vocab_size=dataset.vocab_size)
_strategy = HPUFSDPStrategy(
parallel_devices=[torch.device("hpu")] * options.devices,
Expand All @@ -129,9 +128,17 @@ def parse_args():
precision_plugin=HPUFSDPPrecision("bf16-mixed"),
)

trainer = Trainer(accelerator=HPUAccelerator(), strategy=_strategy, fast_dev_run=1, enable_model_summary=True)
trainer = Trainer(
accelerator=HPUAccelerator(),
devices=options.devices,
strategy=_strategy,
fast_dev_run=1,
enable_model_summary=True,
)
trainer.fit(model, train_dataloader)
rank_zero_info(
f"Peak Memory alloc using FSDP {options.strategy} strategy "
f" on HPU: {htorch.hpu.max_memory_allocated() / (1024**3)} GB"
)

htorch.hpu.reset_peak_memory_stats()

0 comments on commit 4be6cb0

Please sign in to comment.