From f7a241655860d2465d223552b9f1cb4656052b6b Mon Sep 17 00:00:00 2001
From: Luciferian Ink
Date: Sun, 25 Feb 2024 08:59:43 -0600
Subject: [PATCH 1/3] fix doc

---
 moduleformer/utils/moe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/moduleformer/utils/moe.py b/moduleformer/utils/moe.py
index a8d802a..2ec1797 100644
--- a/moduleformer/utils/moe.py
+++ b/moduleformer/utils/moe.py
@@ -26,7 +26,7 @@ class MoE(nn.Module):
         gating_dropout: a float - dropout rate for gating network
         sample_topk: an integer - how many experts to sample during training
         gating_size: an integer - size of the gating network
-        aux_loss: a string - type of auxiliary loss ('mi' or 'sparse')
+        aux_loss: a string - type of auxiliary loss ('mi' or 'switch')
         gate_type: a string - type of gating mechanism ('mlp' or 'topk')
     """

From 3b256c9c03fa78634446f55825df96cdb1c2b4f7 Mon Sep 17 00:00:00 2001
From: Luciferian Ink
Date: Wed, 20 Mar 2024 10:43:47 -0500
Subject: [PATCH 2/3] implement layernorm eps argument

---
 moduleformer/configuration_moduleformer.py | 2 +-
 moduleformer/modeling_moduleformer.py      | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/moduleformer/configuration_moduleformer.py b/moduleformer/configuration_moduleformer.py
index dbfa9b5..30595d3 100644
--- a/moduleformer/configuration_moduleformer.py
+++ b/moduleformer/configuration_moduleformer.py
@@ -125,7 +125,7 @@ def __init__(
         self.embd_pdrop = embd_pdrop
         self.attn_pdrop = attn_pdrop
         self.moe_pdrop = moe_pdrop
-        self.layer_norm_epsilon = layer_norm_epsilon
+        self.layer_norm_epsilon = float(layer_norm_epsilon)
         self.initializer_range = initializer_range
         self.use_cache = use_cache
         self.sample_topk = sample_topk
diff --git a/moduleformer/modeling_moduleformer.py b/moduleformer/modeling_moduleformer.py
index 7b914a1..d49a6a1 100644
--- a/moduleformer/modeling_moduleformer.py
+++ b/moduleformer/modeling_moduleformer.py
@@ -207,9 +207,9 @@ def __init__(self, config):
             config: Configuration object with model hyperparameters.
         """
         super().__init__()
-        self.ln_1 = nn.LayerNorm(config.n_embd)
+        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
         self.attn = ModuleFormerAttention(config)
-        self.ln_2 = nn.LayerNorm(config.n_embd)
+        self.ln_2 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
         self.mlpf = MoE(
             input_size=config.n_embd,
             head_size=config.ffd_hidden,
@@ -425,7 +425,7 @@ def __init__(self, config):
         self.wte = nn.Embedding(config.vocab_size, config.n_embd)
         self.drop = nn.Dropout(config.embd_pdrop)
         self.h = nn.ModuleList([ModuleFormerBlock(config) for _ in range(config.n_layer)])
-        self.ln_f = nn.LayerNorm(config.n_embd)
+        self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

         # Initialize weights and apply final processing
         self.post_init()

From 22fc34ae482e6276ba3d112f74b9bca0500ca326 Mon Sep 17 00:00:00 2001
From: Luciferian Ink
Date: Wed, 27 Mar 2024 07:56:25 -0500
Subject: [PATCH 3/3] add the option to acc_aux_loss

---
 moduleformer/configuration_moduleformer.py | 4 ++--
 moduleformer/modeling_moduleformer.py      | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/moduleformer/configuration_moduleformer.py b/moduleformer/configuration_moduleformer.py
index 30595d3..5dce6b0 100644
--- a/moduleformer/configuration_moduleformer.py
+++ b/moduleformer/configuration_moduleformer.py
@@ -39,8 +39,6 @@ class ModuleFormerConfig(PretrainedConfig):
             Number of hidden layers in the Transformer encoder.
         n_head (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        rotary_dim (`int`, *optional*, defaults to 64):
-            Number of dimensions in the embedding that Rotary Position Embedding is applied to.
         n_inner (`int`, *optional*, defaults to None):
             Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
         activation_function (`str`, *optional*, defaults to `"gelu_new"`):
@@ -109,6 +107,7 @@ def __init__(
         tie_word_embeddings=False,
         aux_loss_type = 'mi',
         aux_loss_weight=0,
+        acc_aux_loss=False,
         gate_type = "mlp",
         **kwargs,
     ):
@@ -136,6 +135,7 @@
         self.k_mlp = k_mlp
         self.aux_loss_type = aux_loss_type
         self.aux_loss_weight = aux_loss_weight
+        self.acc_aux_loss = acc_aux_loss
         self.gate_type = gate_type

         self.n_ctx = history_length * n_layer
diff --git a/moduleformer/modeling_moduleformer.py b/moduleformer/modeling_moduleformer.py
index d49a6a1..5d2ad15 100644
--- a/moduleformer/modeling_moduleformer.py
+++ b/moduleformer/modeling_moduleformer.py
@@ -82,7 +82,7 @@ def __init__(self, config):
             head_size=config.att_hidden,
             num_experts=config.n_att_experts,
             top_k=config.k_att,
-            acc_aux_loss=False,
+            acc_aux_loss=config.acc_aux_loss,
             bias=False,
             gating_dropout=config.moe_pdrop,
             sample_topk=config.sample_topk,
@@ -217,7 +217,7 @@ def __init__(self, config):
             top_k=config.k_mlp,
             bias=False,
             activation=get_activation(config.activation_function),
-            acc_aux_loss=False,
+            acc_aux_loss=config.acc_aux_loss,
             gating_dropout=config.moe_pdrop,
             sample_topk=config.sample_topk,
             gating_size=config.gating_size,
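
A minimal usage sketch of the two options this series touches, for reference
below the patches. Only ModuleFormerConfig and the parameter/attribute names
visible in the diffs are taken from the source; the import path (derived from
the file layout), the example values, and the assumption that every other
constructor argument has a default are illustrative guesses.

    from moduleformer.configuration_moduleformer import ModuleFormerConfig

    # Patch 2: the config coerces layer_norm_epsilon to float and passes it as
    # `eps` to ln_1, ln_2 and ln_f, so a string value (e.g. read from a JSON
    # config file) no longer reaches nn.LayerNorm unconverted.
    config = ModuleFormerConfig(layer_norm_epsilon="1e-5", acc_aux_loss=True)
    assert isinstance(config.layer_norm_epsilon, float)

    # Patch 3: acc_aux_loss is now a config field (default False) and is
    # forwarded to both MoE constructors instead of being hard-coded to False.
    assert config.acc_aux_loss is True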