@@ -50,8 +50,15 @@ def test_preserves_dtype(self) -> None:
 
 
 class TestFloat8Linear:
-    def _test_linear_impl(self, x, m_ref, linear_type: LinearType, emulate: bool):
-        m_fp8 = get_float8_linear(linear_type, m_ref, emulate)
+    def _test_linear_impl(
+        self,
+        x,
+        m_ref,
+        linear_type: LinearType,
+        emulate: bool,
+        recompute_weight_cast: bool,
+    ):
+        m_fp8 = get_float8_linear(linear_type, m_ref, emulate, recompute_weight_cast)
         for _ in range(2):
             if linear_requires_sync(linear_type):
                 sync_float8_amax_and_scale_history(m_fp8)
@@ -112,7 +119,14 @@ def _test_linear_impl(self, x, m_ref, linear_type: LinearType, emulate: bool):
     @pytest.mark.parametrize("emulate", [True, False])
     @pytest.mark.parametrize("x_shape", [(16, 16), (2, 16, 16), (3, 2, 16, 16)])
     @pytest.mark.parametrize("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC])
-    def test_linear_nobias(self, x_shape, linear_type: LinearType, emulate: bool):
+    @pytest.mark.parametrize("recompute_weight_cast", [True, False])
+    def test_linear_nobias(
+        self,
+        x_shape,
+        linear_type: LinearType,
+        emulate: bool,
+        recompute_weight_cast: bool,
+    ):
         if not emulate:
             if not torch.cuda.is_available():
                 warnings.warn("CUDA not available")
@@ -125,16 +139,22 @@ def test_linear_nobias(self, x_shape, linear_type: LinearType, emulate: bool):
 
         x = torch.randn(*x_shape, device="cuda")
         m_ref = nn.Linear(16, 32, bias=False, device="cuda")
-        self._test_linear_impl(x, m_ref, linear_type, emulate)
+        self._test_linear_impl(x, m_ref, linear_type, emulate, recompute_weight_cast)
 
     @pytest.mark.parametrize("emulate", [True, False])
     @pytest.mark.parametrize("x_shape", [(16, 16), (2, 16, 16), (3, 2, 16, 16)])
     @pytest.mark.parametrize("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC])
     @pytest.mark.parametrize(
         "linear_dtype", [torch.float16, torch.bfloat16, torch.float32]
     )
+    @pytest.mark.parametrize("recompute_weight_cast", [True, False])
     def test_linear_bias(
-        self, x_shape, linear_type: LinearType, emulate: bool, linear_dtype: torch.dtype
+        self,
+        x_shape,
+        linear_type: LinearType,
+        emulate: bool,
+        linear_dtype: torch.dtype,
+        recompute_weight_cast: bool,
     ):
         if not emulate:
             if not torch.cuda.is_available():
@@ -148,10 +168,10 @@ def test_linear_bias(
 
         x = torch.randn(*x_shape, device="cuda", dtype=linear_dtype)
         m_ref = nn.Linear(16, 32, bias=True, device="cuda", dtype=linear_dtype)
-        self._test_linear_impl(x, m_ref, linear_type, emulate)
+        self._test_linear_impl(x, m_ref, linear_type, emulate, recompute_weight_cast)
 
         m = nn.Linear(32, 16, device="cuda", dtype=linear_dtype)
-        m = Float8Linear.from_float(m, emulate)
+        m = Float8Linear.from_float(m, emulate, recompute_weight_cast)
 
         # autocast off
         x = torch.randn(16, 32, device="cuda", dtype=linear_dtype)
@@ -184,7 +204,7 @@ def test_type_cast(self, linear_type: LinearType, linear_dtype: torch.dtype):
 
         x = torch.randn(*x_shape, device="cuda", dtype=linear_dtype)
         m_ref = nn.Linear(16, 32, bias=True, device="cuda", dtype=linear_dtype)
-        self._test_linear_impl(x, m_ref, linear_type, emulate)
+        self._test_linear_impl(x, m_ref, linear_type, emulate, False)
 
         m = nn.Linear(32, 16, device="cuda", dtype=linear_dtype)
         m = Float8Linear.from_float(m, emulate)
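For orientation, here is a minimal usage sketch of the two call paths this diff extends with the new recompute_weight_cast flag. It is only a sketch: the import paths below are assumptions, and the positional-argument usage simply mirrors the call sites visible in the hunks above.

# Sketch only: import paths are assumed; the diff shows only the call sites.
import torch.nn as nn

from float8_experimental.float8_linear import Float8Linear
from float8_experimental.float8_linear_utils import LinearType, get_float8_linear

m_ref = nn.Linear(16, 32, bias=False, device="cuda")

# Path 1: the helper used by _test_linear_impl, now passing the extra flag
# (positional args: linear_type, module, emulate, recompute_weight_cast).
m_fp8 = get_float8_linear(LinearType.DYNAMIC, m_ref, False, True)

# Path 2: direct conversion, mirroring test_linear_bias
# (positional args: module, emulate, recompute_weight_cast).
m_fp8_direct = Float8Linear.from_float(nn.Linear(32, 16, device="cuda"), False, True)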