This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Commit c40de9b

y-sq authored and facebook-github-bot committed
Compile test fsdp (#160)
Summary:
Added an option in test_fsdp.py to compile FSDP. With compile mode, the numerics check still passes. However, note that compile currently only works with a workaround, and fullgraph needs to be False; we still need to fix the underlying issue.

When running "./test/test_fsdp.sh", three settings will be tested:
1. Fp8 = False
2. Fp8 = True, Compile = False
3. Fp8 = True, Compile = True (with fullgraph = False)

For example:
```
$ ./test/test_fsdp.sh
launching IS_FP8 False, compile_fsdp False, fullgraph False
-------------------------------------------Mode: generate-------------------------------------------
Success: ✅
------------------------------------------Mode: single_gpu------------------------------------------
Success: ✅
---------------------------------------------Mode: fsdp---------------------------------------------
NCCL version 2.19.3+cuda12.1
-------------------------------------------Mode: analyze--------------------------------------------
output testing single_gpu vs FSDP success
state dict testing single_gpu vs FSDP success
Success: ✅
✅ All Tests Passed ✅

launching IS_FP8 True, compile_fsdp False, fullgraph False
-------------------------------------------Mode: generate-------------------------------------------
Success: ✅
------------------------------------------Mode: single_gpu------------------------------------------
Success: ✅
---------------------------------------------Mode: fsdp---------------------------------------------
NCCL version 2.19.3+cuda12.1
-------------------------------------------Mode: analyze--------------------------------------------
output testing single_gpu vs FSDP success
state dict testing single_gpu vs FSDP success
Success: ✅
✅ All Tests Passed ✅

launching IS_FP8 True, compile_fsdp True, fullgraph False
-------------------------------------------Mode: generate-------------------------------------------
Success: ✅
------------------------------------------Mode: single_gpu------------------------------------------
Success: ✅
---------------------------------------------Mode: fsdp---------------------------------------------
NCCL version 2.19.3+cuda12.1
[rank0]:[2023-12-15 14:49:02,616] [0/0] torch._dynamo.variables.torch: [WARNING] Profiler function <class 'torch.autograd.profiler.record_function'> will be ignored
[rank0]:[2023-12-15 14:49:02,618] [0/0] torch._dynamo.variables.torch: [WARNING] Profiler function <class 'torch.autograd.profiler.record_function'> will be ignored
[rank1]:[2023-12-15 14:49:02,706] [0/0] torch._dynamo.variables.torch: [WARNING] Profiler function <class 'torch.autograd.profiler.record_function'> will be ignored
[rank1]:[2023-12-15 14:49:02,708] [0/0] torch._dynamo.variables.torch: [WARNING] Profiler function <class 'torch.autograd.profiler.record_function'> will be ignored
-------------------------------------------Mode: analyze--------------------------------------------
output testing single_gpu vs FSDP success
state dict testing single_gpu vs FSDP success
Success: ✅
✅ All Tests Passed ✅
```

Pull Request resolved: #160
Reviewed By: vkuzo
Differential Revision: D52224302
Pulled By: y-sq
fbshipit-source-id: 4c29479771f4cd100b8c5a9549d321eb13b49739
1 parent b41006b commit c40de9b
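For context, the workaround described in the summary can be condensed into a short sketch. This is not the test itself, only an illustration of the pattern added in test/test_fsdp.py: wrap the float8 model in FSDP with use_orig_params=True, compile the amax/scale sync function separately, run the first iteration eagerly so the float8 layers initialize their amax history, and only then call torch.compile on the model with fullgraph=False. The toy model, optimizer, and input below are assumptions for illustration; the real test builds its model via get_model and loads reference inputs from disk.

```python
# Minimal sketch of the compile-FSDP workaround (assumes torch.distributed is
# already initialized and float8_experimental is installed; import paths follow
# the repository layout at the time of this commit and may have moved since).
import torch
import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

from float8_experimental.float8_linear import Float8Linear
from float8_experimental.float8_linear_utils import (
    swap_linear_with_float8_linear,
    sync_float8_amax_and_scale_history,
)


def train_fp8_fsdp_compiled(rank: int, n_iter: int = 5, fullgraph: bool = False):
    # Toy model standing in for get_model() in test/test_fsdp.py.
    model = nn.Sequential(nn.Linear(32, 32), nn.ReLU(), nn.Linear(32, 32)).to(rank)
    swap_linear_with_float8_linear(model, Float8Linear, emulate=False)

    # use_orig_params=True is required to compile an FSDP-wrapped module;
    # FSDP(torch.compile(model)) does not work yet (see the TODO in the diff).
    model = FSDP(model, use_orig_params=True)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    # The amax/scale sync pass is compiled separately from the model.
    sync_func = torch.compile(sync_float8_amax_and_scale_history, fullgraph=fullgraph)

    for it in range(n_iter):
        if it == 1:
            # Workaround: iteration 0 runs eagerly so the float8 layers leave
            # their "is_amax_initialized == False" branches; compile only
            # afterwards, and only with fullgraph=False for now.
            model = torch.compile(model, fullgraph=fullgraph)
        optimizer.zero_grad()
        y = model(torch.randn(4, 32, device=rank))
        y.sum().backward()
        sync_func(model)
        optimizer.step()
    return y
```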

File tree

2 files changed (+46, -16 lines)


test/test_fsdp.py

Lines changed: 37 additions & 10 deletions
@@ -46,8 +46,7 @@
 
 B, M, K, N = 8, 8, 32, 32
 lr = 0.01
-N_ITER = 3
-N_ITER = 1
+N_ITER = 5
 
 
 def setup(rank, world_size):
@@ -65,7 +64,9 @@ def cleanup():
 def get_model(K, N, is_fp8, emulate, base_dtype=torch.float32):
     m = nn.Sequential(
         nn.Linear(K, N, dtype=base_dtype),
+        nn.ReLU(),
         nn.Linear(N, N, dtype=base_dtype),
+        nn.ReLU(),
     )
     if is_fp8:
         swap_linear_with_float8_linear(m, Float8Linear, emulate=emulate)
@@ -78,31 +79,52 @@ def fsdp_main(rank, world_size, args):
     setup(rank, world_size)
     torch.cuda.set_device(rank)
 
-    is_fp8, emulate, base_dtype = args
+    # TODO: We set fullgraph as an option. However, fullgraph compile currently doesn't work.
+    # We can investigate and fix it later.
+    is_fp8, emulate, base_dtype, compile, fullgraph = args
     model = get_model(K, N, is_fp8=is_fp8, emulate=emulate, base_dtype=base_dtype).to(
         rank
     )
     model.load_state_dict(torch.load(sd_in_fname))
-    model = FSDP(model)
+    # To compile FSDP, we need to set use_orig_params to True
+    model = FSDP(model, use_orig_params=True)
+    # TODO: The following line doesn't work. We should fix it.
+    # model = FSDP(torch.compile(model), use_orig_params=True)
+
     # Note: we need to multiply by world_size here to match single GPU
     # optimizer update
     optimizer = torch.optim.SGD(model.parameters(), lr=lr * world_size)
 
     ref_input_global = torch.load(input_fname).to(base_dtype)
 
     # basic distributed data sampling
-    bsz_global = ref_input_global.shape[0]
     assert B % world_size == 0
     bsz_local_start = int(rank / world_size * B)
     bsz_local_end = int((rank + 1) / world_size * B)
     ref_input_local = ref_input_global[bsz_local_start:bsz_local_end].to(rank)
 
-    for _ in range(N_ITER):
+    sync_float8_func = sync_float8_amax_and_scale_history
+    if compile:
+        sync_float8_func = torch.compile(
+            sync_float8_amax_and_scale_history, fullgraph=fullgraph
+        )
+
+    def forward_backward(model):
         optimizer.zero_grad()
         y_local = model(ref_input_local)
         y_local.sum().backward()
-        sync_float8_amax_and_scale_history(model)
+        sync_float8_func(model)
         optimizer.step()
+        return y_local
+
+    for iter in range(N_ITER):
+        # We first run one iteration without compile, as a workaround to compile the float8 layers.
+        # In the first iter, float8 layers go to the branches of "self.is_amax_initialized == False";
+        # after that, float8 layers go to the branches of "self.is_amax_initialized == True".
+        # TODO: Need to fix compile to run without this workaround.
+        if iter == 1 and compile:
+            model = torch.compile(model, fullgraph=fullgraph)
+        y_local = forward_backward(model)
 
     # get global y
    y_global = [
@@ -126,7 +148,7 @@ def fsdp_main(rank, world_size, args):
     cleanup()
 
 
-def run(mode: str, is_fp8: bool):
+def run(mode: str, is_fp8: bool, compile_fsdp: bool = False, fullgraph: bool = False):
     print(f"Mode: {mode}".center(100, "-"))
     base_dtype = torch.bfloat16
     if not os.path.exists(data_dir):
@@ -160,19 +182,24 @@ def run(mode: str, is_fp8: bool):
         model.load_state_dict(torch.load(sd_in_fname))
         optimizer = torch.optim.SGD(model.parameters(), lr=lr)
 
-        for _ in range(N_ITER):
+        def forward_backward():
             optimizer.zero_grad()
             y = model(ref_input)
             y.sum().backward()
             sync_float8_amax_and_scale_history(model)
             optimizer.step()
+            return y
+
+        for _ in range(N_ITER):
+            y = forward_backward()
 
         torch.save(y, output_single_gpu_fname)
         torch.save(model.state_dict(), sd_out_single_gpu_fname)
 
     elif mode == "fsdp":
         WORLD_SIZE = torch.cuda.device_count()
-        args = (is_fp8, emulate, base_dtype)
+        # We only compile for fsdp, and compare the numerics with single-gpu no-compile
+        args = (is_fp8, emulate, base_dtype, compile_fsdp, fullgraph)
         mp.spawn(fsdp_main, args=(WORLD_SIZE, args), nprocs=WORLD_SIZE, join=True)
 
     elif mode == "analyze":
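To make the flag plumbing above concrete, the new run() signature can also be driven directly from Python instead of through the CLI. The snippet below is only a hypothetical driver for the "fp8 + compile" setting; the import assumes the repository root is on sys.path and that test/ is importable, while the real tests call test/test_fsdp.py via the --mode/--is_fp8/--compile_fsdp/--fullgraph flags used in test/test_fsdp.sh below.

```python
# Hypothetical Python driver for the "fp8 + compile" setting; the real tests
# go through the CLI flags exercised by test/test_fsdp.sh.
from test.test_fsdp import run  # assumes the repo root is on sys.path

for mode in ("generate", "single_gpu", "fsdp", "analyze"):
    # compile_fsdp only affects the fsdp mode; the single_gpu baseline stays
    # eager, so analyze compares compiled-FSDP numerics against no-compile.
    run(mode, is_fp8=True, compile_fsdp=True, fullgraph=False)
```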

test/test_fsdp.sh

Lines changed: 9 additions & 6 deletions
@@ -4,14 +4,14 @@
 set -e
 
 launch() {
-  echo "launching IS_FP8 $IS_FP8"
+  echo "launching IS_FP8 $IS_FP8, compile_fsdp $COMPILE, fullgraph $FULLGRAPH"
 
   # generate the test data
-  python test/test_fsdp.py --mode generate --is_fp8 $IS_FP8
+  python test/test_fsdp.py --mode generate --is_fp8 $IS_FP8 --compile_fsdp $COMPILE --fullgraph $FULLGRAPH
   echo "Success: ✅"
 
   # generate single GPU model output and updated state dict
-  python test/test_fsdp.py --mode single_gpu --is_fp8 $IS_FP8
+  python test/test_fsdp.py --mode single_gpu --is_fp8 $IS_FP8 --compile_fsdp $COMPILE --fullgraph $FULLGRAPH
   echo "Success: ✅"
 
   # generate FSDP model output and updated state dict
@@ -20,16 +20,19 @@ launch() {
   # the NCCL_NET setting is to work around transient issues on a
   # specific host (`devgpu001.nha2`)
   NCCL_DEBUG=WARN CUDA_VISIBLE_DEVICES=0,1 NCCL_NET=SOCKET python test/test_fsdp.py \
-    --mode fsdp --is_fp8 $IS_FP8
+    --mode fsdp --is_fp8 $IS_FP8 --compile_fsdp $COMPILE --fullgraph $FULLGRAPH
 
   # compare the outputs and state dicts and verify equivalence
-  python test/test_fsdp.py --mode analyze --is_fp8 $IS_FP8
+  python test/test_fsdp.py --mode analyze --is_fp8 $IS_FP8 --compile_fsdp $COMPILE --fullgraph $FULLGRAPH
   echo "Success: ✅"
 
   echo "✅ All Tests Passed ✅"
 }
 
-for IS_FP8 in False True
+# IS_FP8, COMPILE, FULLGRAPH
+for i in False,False,False True,False,False True,True,False
 do
+  IFS=","; set -- $i;
+  IS_FP8=$1; COMPILE=$2; FULLGRAPH=$3
   launch
 done
