Implement layernorm eps and acc_aux_loss #10

Open · wants to merge 3 commits into base: main

6 changes: 3 additions & 3 deletions moduleformer/configuration_moduleformer.py
@@ -39,8 +39,6 @@ class ModuleFormerConfig(PretrainedConfig):
             Number of hidden layers in the Transformer encoder.
         n_head (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        rotary_dim (`int`, *optional*, defaults to 64):
-            Number of dimensions in the embedding that Rotary Position Embedding is applied to.
         n_inner (`int`, *optional*, defaults to None):
             Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
         activation_function (`str`, *optional*, defaults to `"gelu_new"`):
@@ -109,6 +107,7 @@ def __init__(
         tie_word_embeddings=False,
         aux_loss_type = 'mi',
         aux_loss_weight=0,
+        acc_aux_loss=False,
         gate_type = "mlp",
         **kwargs,
     ):
@@ -125,7 +124,7 @@ def __init__(
         self.embd_pdrop = embd_pdrop
         self.attn_pdrop = attn_pdrop
         self.moe_pdrop = moe_pdrop
-        self.layer_norm_epsilon = layer_norm_epsilon
+        self.layer_norm_epsilon = float(layer_norm_epsilon)
         self.initializer_range = initializer_range
         self.use_cache = use_cache
         self.sample_topk = sample_topk
@@ -136,6 +135,7 @@ def __init__(
         self.k_mlp = k_mlp
         self.aux_loss_type = aux_loss_type
         self.aux_loss_weight = aux_loss_weight
+        self.acc_aux_loss = acc_aux_loss
         self.gate_type = gate_type
         self.n_ctx = history_length * n_layer

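For reviewers, here is a minimal sketch of how the two configuration changes surface to users, assuming the package exports `ModuleFormerConfig` at the top level (the values below are illustrative, not defaults from this PR): the `float()` cast means an epsilon that arrives as a string, e.g. after a JSON round trip, no longer propagates as `str`, and `acc_aux_loss` is now stored on the config for the modeling code to read.

```python
# Illustrative sketch only; assumes `from moduleformer import ModuleFormerConfig`
# works (the top-level export is not shown in this diff).
from moduleformer import ModuleFormerConfig

config = ModuleFormerConfig(
    layer_norm_epsilon="1e-6",  # hypothetical string input, e.g. from a JSON config
    acc_aux_loss=True,          # new flag introduced by this PR
)

# float() in __init__ normalizes the epsilon, so downstream nn.LayerNorm
# construction receives a float rather than a string.
assert isinstance(config.layer_norm_epsilon, float)
assert config.acc_aux_loss is True
```
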
10 changes: 5 additions & 5 deletions moduleformer/modeling_moduleformer.py
@@ -82,7 +82,7 @@ def __init__(self, config):
             head_size=config.att_hidden,
             num_experts=config.n_att_experts,
             top_k=config.k_att,
-            acc_aux_loss=False,
+            acc_aux_loss=config.acc_aux_loss,
             bias=False,
             gating_dropout=config.moe_pdrop,
             sample_topk=config.sample_topk,
@@ -207,17 +207,17 @@ def __init__(self, config):
             config: Configuration object with model hyperparameters.
         """
         super().__init__()
-        self.ln_1 = nn.LayerNorm(config.n_embd)
+        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
         self.attn = ModuleFormerAttention(config)
-        self.ln_2 = nn.LayerNorm(config.n_embd)
+        self.ln_2 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
         self.mlpf = MoE(
             input_size=config.n_embd,
             head_size=config.ffd_hidden,
             num_experts=config.n_mlp_experts,
             top_k=config.k_mlp,
             bias=False,
             activation=get_activation(config.activation_function),
-            acc_aux_loss=False,
+            acc_aux_loss=config.acc_aux_loss,
             gating_dropout=config.moe_pdrop,
             sample_topk=config.sample_topk,
             gating_size=config.gating_size,
@@ -425,7 +425,7 @@ def __init__(self, config):
         self.wte = nn.Embedding(config.vocab_size, config.n_embd)
         self.drop = nn.Dropout(config.embd_pdrop)
         self.h = nn.ModuleList([ModuleFormerBlock(config) for _ in range(config.n_layer)])
-        self.ln_f = nn.LayerNorm(config.n_embd)
+        self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

         # Initialize weights and apply final processing
         self.post_init()
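A quick way to check the LayerNorm wiring after this change is to walk the model's modules and compare each `eps` against the config. This is only a sketch: it assumes `ModuleFormerConfig` and `ModuleFormerModel` are exported by the package, with the model class name inferred from the file being patched rather than shown in this diff.

```python
# Sketch of a sanity check; ModuleFormerModel is an assumed export,
# inferred from modeling_moduleformer.py rather than shown in this diff.
import torch.nn as nn
from moduleformer import ModuleFormerConfig, ModuleFormerModel

config = ModuleFormerConfig(layer_norm_epsilon=1e-6)
model = ModuleFormerModel(config)

# Before this PR, nn.LayerNorm(config.n_embd) silently fell back to
# PyTorch's default eps of 1e-5, ignoring layer_norm_epsilon.
for name, module in model.named_modules():
    if isinstance(module, nn.LayerNorm):
        assert module.eps == config.layer_norm_epsilon, name
```
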
2 changes: 1 addition & 1 deletion moduleformer/utils/moe.py
@@ -26,7 +26,7 @@ class MoE(nn.Module):
         gating_dropout: a float - dropout rate for gating network
         sample_topk: an integer - how many experts to sample during training
         gating_size: an integer - size of the gating network
-        aux_loss: a string - type of auxiliary loss ('mi' or 'sparse')
+        aux_loss: a string - type of auxiliary loss ('mi' or 'switch')
         gate_type: a string - type of gating mechanism ('mlp' or 'topk')
     """

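The exact semantics of `acc_aux_loss` live inside the MoE implementation and are not part of this diff. The sketch below only illustrates the accumulate-then-drain pattern such a flag typically toggles; the names (`ToyGate`, `get_aux_loss_and_clear`) and the stand-in penalty are hypothetical and are not taken from `moduleformer/utils/moe.py`.

```python
# Hypothetical illustration of an accumulate-then-drain auxiliary loss,
# the pattern a flag like acc_aux_loss usually selects. Names and the
# stand-in penalty are illustrative, not copied from moe.py.
import torch


class ToyGate(torch.nn.Module):
    def __init__(self, acc_aux_loss: bool = False):
        super().__init__()
        self.acc_aux_loss = acc_aux_loss
        self.acc_loss = 0.0

    def forward(self, gate_probs: torch.Tensor) -> torch.Tensor:
        # Stand-in balancing penalty: variance of the mean load per expert.
        aux = gate_probs.mean(dim=0).var()
        if self.acc_aux_loss:
            # Accumulate across layers / micro-batches; the trainer drains
            # one total later instead of adding a term at every call.
            self.acc_loss = self.acc_loss + aux
            return torch.zeros((), device=gate_probs.device)
        return aux  # per-call mode: report the penalty immediately

    def get_aux_loss_and_clear(self) -> torch.Tensor:
        total = torch.as_tensor(self.acc_loss)
        self.acc_loss = 0.0
        return total
```

In per-call mode a trainer adds the returned term to the loss at every MoE layer; in accumulated mode it adds a single drained total after the forward pass, which is why this PR forwards the config-level flag to both the attention and MLP MoE modules.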