[feat] Add pytorchvideo encoder wrapper

Ryan-Qiyu-Jiang · Ryan-Qiyu-Jiang · commit 81e2e43b1234 · 2021-11-23T14:56:06.000-08:00
Add an encoder class that constructs any pytorchvideo model from config and uses that model for its forward pass. Depending on the config, it loads either a pretrained model or a randomly initialized one. ghstack-source-id: 2eb33e0 Pull Request resolved: #1156
diff --git a/mmf/modules/encoders.py b/mmf/modules/encoders.py
@@ -1,10 +1,13 @@
 # Copyright (c) Facebook, Inc. and its affiliates.
+import importlib
+import inspect
+import logging
 import os
 import pickle
 import re
 from collections import OrderedDict
 from copy import deepcopy
-from dataclasses import dataclass
+from dataclasses import asdict, dataclass
 from enum import Enum
 from typing import Any
 
@@ -25,13 +28,15 @@
 from transformers.configuration_auto import AutoConfig
 from transformers.modeling_auto import AutoModel
 
-
 try:
     from detectron2.modeling import ShapeSpec, build_resnet_backbone
 except ImportError:
     pass
 
 
+logger = logging.getLogger(__name__)
+
+
 class Encoder(nn.Module):
     @dataclass
     class Config:
@@ -688,6 +693,86 @@ def forward(self, x: Tensor) -> Tensor:
         return out
 
 
+@registry.register_encoder("torchvideo")
+class TorchVideoEncoder(Encoder):
+    """
+    Wrapper exposing a pytorchvideo model, picked by ``model_name``, as an encoder.
+
+    ``random_init=True`` builds ``pytorchvideo.models.create_<model_name>`` from the
+    config keys its signature names; otherwise a pretrained model is loaded from
+    TorchHub. The last ``cls_layer_num`` child modules are dropped (0 keeps all).
+    """
+
+    @dataclass
+    class Config(Encoder.Config):
+        name: str = "torchvideo"
+        random_init: bool = False
+        model_name: str = "slowfast_r50"
+        cls_layer_num: int = 1
+
+    def __init__(self, config: Config):
+        # Bare ``import importlib`` does not reliably expose ``importlib.util``.
+        import importlib.util
+
+        if importlib.util.find_spec("pytorchvideo") is None:
+            raise ImportError("pytorchvideo required for using TorchVideoEncoder")
+        import pytorchvideo.models as models
+
+        super().__init__()
+        config = OmegaConf.create({**asdict(self.Config()), **config})
+        if config.random_init:
+            model_create_fn = getattr(models, f"create_{config.model_name}")
+            # Wrapper-only options must never reach the model constructor.
+            params = dict(**config)
+            for wrapper_key in ("random_init", "model_name", "cls_layer_num"):
+                params.pop(wrapper_key)
+
+            accepted_params, ignored_params = self.filter_dict_to_signature(
+                model_create_fn, params
+            )
+            if ignored_params:
+                logger.warning(
+                    "The following model constructor params were ignored"
+                    + " because they don't match a named param in the constructor: "
+                    + " ".join(ignored_params.keys())
+                )
+            model = model_create_fn(**accepted_params)
+        else:
+            # load weights from TorchHub
+            model = torch.hub.load(
+                "facebookresearch/pytorchvideo:main",
+                model=config.model_name,
+                pretrained=True,
+            )
+
+        if config.cls_layer_num == 0:
+            self.encoder = model
+            return
+
+        # Unwrap a single-child model so the slice applies to the real layer list.
+        modules_list = list(model.children())
+        if len(modules_list) == 1:
+            modules_list = list(modules_list[0].children())
+        self.encoder = nn.Sequential(*modules_list[: -config.cls_layer_num])
+
+    def forward(self, *args, **kwargs):
+        # pass along input to model
+        # assumes caller obeys the dynamic model signature
+        return self.encoder(*args, **kwargs)
+
+    def filter_dict_to_signature(self, callable, params):
+        """Split ``params`` into ``(accepted, ignored)`` dicts, by whether each
+        key names a parameter in the signature of ``callable``."""
+        constructor_params = inspect.signature(callable).parameters
+        accepted_params, ignored_params = {}, {}
+        for param_name, value in params.items():
+            if param_name in constructor_params:
+                accepted_params[param_name] = value
+            else:
+                ignored_params[param_name] = value
+        return accepted_params, ignored_params
+
+
 @registry.register_encoder("r2plus1d_18")
 class R2Plus1D18VideoEncoder(PooledEncoder):
     """
diff --git a/requirements.txt b/requirements.txt
@@ -22,3 +22,5 @@ pytorch-lightning @ git+https://github.com/PyTorchLightning/pytorch-lightning@fa
 torchaudio>=0.6.0, <=0.9.0
 psutil
 pillow==8.3.1
+av>=8.0.3
+pytorchvideo>=0.1.3
diff --git a/tests/modules/test_encoders.py b/tests/modules/test_encoders.py
@@ -6,7 +6,11 @@
 import torch
 from mmf.modules import encoders
 from omegaconf import OmegaConf
-from tests.test_utils import setup_proxy, skip_if_old_transformers
+from tests.test_utils import (
+    setup_proxy,
+    skip_if_old_transformers,
+    skip_if_no_pytorchvideo,
+)
 from torch import nn
 
 
@@ -102,3 +106,30 @@ def test_vit_encoder(self):
         x = torch.rand(32, 197, 768)
         output, _ = encoder(x)
         self.assertEqual(output.size(-1), config.out_dim)
+
+    @skip_if_no_pytorchvideo
+    def test_torchvision_slowfast_r50_encoder(self):
+        # Default config: pretrained slowfast_r50 with its last child module removed.
+        default_config = OmegaConf.structured(encoders.TorchVideoEncoder.Config())
+        video_encoder = encoders.TorchVideoEncoder(default_config)
+        fast, slow = (torch.rand((1, 3, t, 224, 224)) for t in (32, 8))
+        features = video_encoder([slow, fast])
+        self.assertEqual(features.size(1), 2304)
+
+    @skip_if_no_pytorchvideo
+    def test_torchvision_mvit_encoder(self):
+        # Randomly initialised MViT, kept whole (cls_layer_num=0); the extra
+        # keys are candidates for the pytorchvideo constructor's parameters.
+        mvit_options = {
+            "name": "torchvideo",
+            "model_name": "multiscale_vision_transformers",
+            "random_init": True,
+            "cls_layer_num": 0,
+            "spatial_size": 224,
+            "temporal_size": 8,
+            "head": None,
+        }
+        video_encoder = encoders.TorchVideoEncoder(OmegaConf.create(mvit_options))
+        clip = torch.rand((1, 3, 8, 224, 224))
+        features = video_encoder(clip)
+        self.assertEqual(features.shape, torch.Size([1, 12545, 96]))
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -102,6 +102,13 @@ def wrap(testfn, reason="Requires newer version of transformers"):
     return wrap
 
 
+def skip_if_no_pytorchvideo(testfn, reason="Requires pytorchvideo"):
+    """Skip ``testfn`` with ``reason`` unless pytorchvideo can be found."""
+    import importlib.util  # bare ``import importlib`` may not load ``util``
+    pytorchvideo_spec = importlib.util.find_spec("pytorchvideo")
+    return unittest.skipUnless(pytorchvideo_spec is not None, reason)(testfn)
+
+
 def compare_state_dicts(a, b):
     same = True
     same = same and (list(a.keys()) == list(b.keys()))