From f7a241655860d2465d223552b9f1cb4656052b6b Mon Sep 17 00:00:00 2001
From: Luciferian Ink
Date: Sun, 25 Feb 2024 08:59:43 -0600
Subject: [PATCH 1/3] fix doc

---
 moduleformer/utils/moe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/moduleformer/utils/moe.py b/moduleformer/utils/moe.py
index a8d802a..2ec1797 100644
--- a/moduleformer/utils/moe.py
+++ b/moduleformer/utils/moe.py
@@ -26,7 +26,7 @@ class MoE(nn.Module):
         gating_dropout: a float - dropout rate for gating network
         sample_topk: an integer - how many experts to sample during training
         gating_size: an integer - size of the gating network
-        aux_loss: a string - type of auxiliary loss ('mi' or 'sparse')
+        aux_loss: a string - type of auxiliary loss ('mi' or 'switch')
         gate_type: a string - type of gating mechanism ('mlp' or 'topk')
     """

From 3b256c9c03fa78634446f55825df96cdb1c2b4f7 Mon Sep 17 00:00:00 2001
From: Luciferian Ink
Date: Wed, 20 Mar 2024 10:43:47 -0500
Subject: [PATCH 2/3] implement layernorm eps argument

---
 moduleformer/configuration_moduleformer.py | 2 +-
 moduleformer/modeling_moduleformer.py      | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/moduleformer/configuration_moduleformer.py b/moduleformer/configuration_moduleformer.py
index dbfa9b5..30595d3 100644
--- a/moduleformer/configuration_moduleformer.py
+++ b/moduleformer/configuration_moduleformer.py
@@ -125,7 +125,7 @@ def __init__(
         self.embd_pdrop = embd_pdrop
         self.attn_pdrop = attn_pdrop
         self.moe_pdrop = moe_pdrop
-        self.layer_norm_epsilon = layer_norm_epsilon
+        self.layer_norm_epsilon = float(layer_norm_epsilon)
         self.initializer_range = initializer_range
         self.use_cache = use_cache
         self.sample_topk = sample_topk
diff --git a/moduleformer/modeling_moduleformer.py b/moduleformer/modeling_moduleformer.py
index 7b914a1..d49a6a1 100644
--- a/moduleformer/modeling_moduleformer.py
+++ b/moduleformer/modeling_moduleformer.py
@@ -207,9 +207,9 @@ def __init__(self, config):
             config: Configuration object with model hyperparameters.
         """
         super().__init__()
-        self.ln_1 = nn.LayerNorm(config.n_embd)
+        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
         self.attn = ModuleFormerAttention(config)
-        self.ln_2 = nn.LayerNorm(config.n_embd)
+        self.ln_2 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
         self.mlpf = MoE(
             input_size=config.n_embd,
             head_size=config.ffd_hidden,
@@ -425,7 +425,7 @@ def __init__(self, config):
         self.wte = nn.Embedding(config.vocab_size, config.n_embd)
         self.drop = nn.Dropout(config.embd_pdrop)
         self.h = nn.ModuleList([ModuleFormerBlock(config) for _ in range(config.n_layer)])
-        self.ln_f = nn.LayerNorm(config.n_embd)
+        self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

         # Initialize weights and apply final processing
         self.post_init()

From 22fc34ae482e6276ba3d112f74b9bca0500ca326 Mon Sep 17 00:00:00 2001
From: Luciferian Ink
Date: Wed, 27 Mar 2024 07:56:25 -0500
Subject: [PATCH 3/3] add the option to acc_aux_loss

---
 moduleformer/configuration_moduleformer.py | 4 ++--
 moduleformer/modeling_moduleformer.py      | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/moduleformer/configuration_moduleformer.py b/moduleformer/configuration_moduleformer.py
index 30595d3..5dce6b0 100644
--- a/moduleformer/configuration_moduleformer.py
+++ b/moduleformer/configuration_moduleformer.py
@@ -39,8 +39,6 @@ class ModuleFormerConfig(PretrainedConfig):
             Number of hidden layers in the Transformer encoder.
         n_head (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        rotary_dim (`int`, *optional*, defaults to 64):
-            Number of dimensions in the embedding that Rotary Position Embedding is applied to.
         n_inner (`int`, *optional*, defaults to None):
             Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
         activation_function (`str`, *optional*, defaults to `"gelu_new"`):
@@ -109,6 +107,7 @@ def __init__(
         tie_word_embeddings=False,
         aux_loss_type = 'mi',
         aux_loss_weight=0,
+        acc_aux_loss=False,
         gate_type = "mlp",
         **kwargs,
     ):
@@ -136,6 +135,7 @@
         self.k_mlp = k_mlp
         self.aux_loss_type = aux_loss_type
         self.aux_loss_weight = aux_loss_weight
+        self.acc_aux_loss = acc_aux_loss
         self.gate_type = gate_type

         self.n_ctx = history_length * n_layer
diff --git a/moduleformer/modeling_moduleformer.py b/moduleformer/modeling_moduleformer.py
index d49a6a1..5d2ad15 100644
--- a/moduleformer/modeling_moduleformer.py
+++ b/moduleformer/modeling_moduleformer.py
@@ -82,7 +82,7 @@ def __init__(self, config):
             head_size=config.att_hidden,
             num_experts=config.n_att_experts,
             top_k=config.k_att,
-            acc_aux_loss=False,
+            acc_aux_loss=config.acc_aux_loss,
             bias=False,
             gating_dropout=config.moe_pdrop,
             sample_topk=config.sample_topk,
@@ -217,7 +217,7 @@ def __init__(self, config):
             top_k=config.k_mlp,
             bias=False,
             activation=get_activation(config.activation_function),
-            acc_aux_loss=False,
+            acc_aux_loss=config.acc_aux_loss,
             gating_dropout=config.moe_pdrop,
             sample_topk=config.sample_topk,
             gating_size=config.gating_size,
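
A minimal usage sketch of the two options this series touches, for reference
below the patches. Only ModuleFormerConfig and the parameter/attribute names
visible in the diffs are taken from the source; the import path (derived from
the file layout), the example values, and the assumption that every other
constructor argument has a default are illustrative guesses.

    from moduleformer.configuration_moduleformer import ModuleFormerConfig

    # Patch 2: the config coerces layer_norm_epsilon to float and passes it as
    # `eps` to ln_1, ln_2 and ln_f, so a string value (e.g. read from a JSON
    # config file) no longer reaches nn.LayerNorm unconverted.
    config = ModuleFormerConfig(layer_norm_epsilon="1e-5", acc_aux_loss=True)
    assert isinstance(config.layer_norm_epsilon, float)

    # Patch 3: acc_aux_loss is now a config field (default False) and is
    # forwarded to both MoE constructors instead of being hard-coded to False.
    assert config.acc_aux_loss is True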