import torch
import torch.nn as nn
from typing import Tuple, List, Optional

class LSTMCell(nn.Module):
    """
    LSTM cell implementation with layer normalization.

    Mathematical formulation of the LSTM:

        f_t = σ(W_f · [h_{t-1}, x_t] + b_f)     # Forget gate
        i_t = σ(W_i · [h_{t-1}, x_t] + b_i)     # Input gate
        g_t = tanh(W_g · [h_{t-1}, x_t] + b_g)  # Candidate cell state
        o_t = σ(W_o · [h_{t-1}, x_t] + b_o)     # Output gate

        c_t = f_t ⊙ c_{t-1} + i_t ⊙ g_t         # New cell state
        h_t = o_t ⊙ tanh(c_t)                   # New hidden state

    where:
        - σ is the sigmoid function
        - ⊙ is element-wise multiplication
        - [h_{t-1}, x_t] denotes concatenation
    """
    def __init__(self, input_size: int, hidden_size: int, dropout: float = 0.0):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.dropout = nn.Dropout(dropout) if dropout > 0 else None

        # Combined weight matrices for efficiency:
        # weight_ih holds the [i_t, f_t, g_t, o_t] weights for the input x_t,
        # weight_hh holds the [i_t, f_t, g_t, o_t] weights for the hidden state h_{t-1}
        self.weight_ih = nn.Linear(input_size, 4 * hidden_size)
        self.weight_hh = nn.Linear(hidden_size, 4 * hidden_size)

        # Layer normalization for better training stability
        self.layer_norm_x = nn.LayerNorm(4 * hidden_size)  # Normalize gate pre-activations
        self.layer_norm_h = nn.LayerNorm(hidden_size)      # Normalize hidden state
        self.layer_norm_c = nn.LayerNorm(hidden_size)      # Normalize cell state

        self.init_parameters()
    def init_parameters(self) -> None:
        """
        Initialize parameters using best practices:
        1. Orthogonal initialization for better gradient flow.
        2. Forget gate bias initialized to 1.0 to prevent forgetting early in training.
        """
        for weight in [self.weight_ih.weight, self.weight_hh.weight]:
            nn.init.orthogonal_(weight)

        # Set the forget-gate slice of both biases to 1.0 so the combined
        # forget-gate pre-activation starts positive (helps with learning long sequences)
        nn.init.constant_(self.weight_ih.bias[self.hidden_size:2 * self.hidden_size], 1.0)
        nn.init.constant_(self.weight_hh.bias[self.hidden_size:2 * self.hidden_size], 1.0)
    def forward(self, x: torch.Tensor,
                hidden_state: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Forward pass of the LSTM cell.

        Args:
            x: Input tensor of shape (batch_size, input_size)
            hidden_state: Tuple of (h_{t-1}, c_{t-1}), each of shape (batch_size, hidden_size)

        Returns:
            Tuple of (h_t, c_t), the new hidden and cell states
        """
        h_prev, c_prev = hidden_state

        # Combined matrix multiplication for all gates
        # Shape: (batch_size, 4 * hidden_size)
        gates_x = self.weight_ih(x)       # Transform input
        gates_h = self.weight_hh(h_prev)  # Transform previous hidden state

        # Apply layer normalization to the input projection
        gates_x = self.layer_norm_x(gates_x)
        gates = gates_x + gates_h  # Combined gate pre-activations

        # Split into individual gates
        # Each gate shape: (batch_size, hidden_size)
        i_gate, f_gate, g_gate, o_gate = gates.chunk(4, dim=1)

        # Apply gate non-linearities
        i_t = torch.sigmoid(i_gate)  # Input gate
        f_t = torch.sigmoid(f_gate)  # Forget gate
        g_t = torch.tanh(g_gate)     # Cell state candidate
        o_t = torch.sigmoid(o_gate)  # Output gate

        # Update cell state: c_t = f_t ⊙ c_{t-1} + i_t ⊙ g_t
        c_t = f_t * c_prev + i_t * g_t
        c_t = self.layer_norm_c(c_t)

        # Update hidden state: h_t = o_t ⊙ tanh(c_t)
        h_t = o_t * torch.tanh(c_t)
        h_t = self.layer_norm_h(h_t)

        if self.dropout is not None:
            h_t = self.dropout(h_t)

        return h_t, c_t

    def init_hidden(self, batch_size: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
        """Initialize the hidden state and cell state with zeros."""
        return (torch.zeros(batch_size, self.hidden_size, device=device),
                torch.zeros(batch_size, self.hidden_size, device=device))

class StackedLSTM(nn.Module):
    """
    Stacked LSTM implementation supporting multiple layers.
    Each layer processes the output of the previous layer.
    """
    def __init__(self, input_size: int, hidden_size: int, num_layers: int, dropout: float = 0.0):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        # Create a list of LSTM cells, one for each layer
        self.layers = nn.ModuleList([
            LSTMCell(input_size if i == 0 else hidden_size, hidden_size,
                     dropout if i < num_layers - 1 else 0.0)  # No dropout on the last layer
            for i in range(num_layers)
        ])
    def forward(self, x: torch.Tensor,
                hidden_states: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None
                ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]:
        """
        Process an input sequence through the stacked LSTM layers.

        Args:
            x: Input tensor of shape (batch_size, seq_length, input_size)
            hidden_states: Optional initial hidden states, one (h, c) tuple per layer

        Returns:
            Tuple of (output, hidden_states), where output has shape (batch_size, seq_length, hidden_size)
        """
        batch_size, seq_length, _ = x.size()
        device = x.device

        if hidden_states is None:
            hidden_states = [layer.init_hidden(batch_size, device) for layer in self.layers]

        layer_outputs = []
        for t in range(seq_length):
            input_t = x[:, t, :]
            for i, lstm_cell in enumerate(self.layers):
                input_t, cell_state = lstm_cell(input_t, hidden_states[i])
                hidden_states[i] = (input_t, cell_state)
            layer_outputs.append(input_t)

        # Stack the top layer's outputs along the sequence dimension
        output = torch.stack(layer_outputs, dim=1)
        return output, hidden_states

class LSTMNetwork(nn.Module):
    """
    Complete LSTM network with bidirectional support.

    In bidirectional mode:
    - Forward LSTM processes the sequence from left to right
    - Backward LSTM processes the sequence from right to left
    - Outputs are concatenated for the final prediction
    """
    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 num_layers: int,
                 output_size: int,
                 dropout: float = 0.0,
                 bidirectional: bool = False):
        super().__init__()
        self.bidirectional = bidirectional

        # Forward-direction LSTM
        self.stacked_lstm = StackedLSTM(input_size, hidden_size, num_layers, dropout)

        # Optional backward-direction LSTM for bidirectional processing
        if bidirectional:
            self.reverse_lstm = StackedLSTM(input_size, hidden_size, num_layers, dropout)
            hidden_size *= 2  # Double the hidden size due to concatenation

        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x: torch.Tensor,
                hidden_states: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None) -> torch.Tensor:
        """
        Forward pass of the network.

        For bidirectional processing:
        1. Process the sequence normally with the forward LSTM.
        2. Process the reversed sequence with the backward LSTM.
        3. Concatenate both outputs.
        4. Apply the final linear transformation.

        Args:
            x: Input tensor of shape (batch_size, seq_length, input_size)
            hidden_states: Optional initial hidden states for the forward LSTM

        Returns:
            Output tensor of shape (batch_size, output_size)
        """
        # Forward direction
        output, hidden_states = self.stacked_lstm(x, hidden_states)

        if self.bidirectional:
            # Process the sequence in the reverse direction
            reverse_output, _ = self.reverse_lstm(torch.flip(x, [1]))
            # Flip back to align with the forward sequence
            reverse_output = torch.flip(reverse_output, [1])
            # Concatenate forward and backward outputs along the feature dimension
            output = torch.cat([output, reverse_output], dim=-1)

        # Apply dropout before the final layer
        output = self.dropout(output)
        # Use the final timestep's output for the prediction
        final_output = self.fc(output[:, -1, :])
        return final_output

def create_lstm_model(config: dict) -> LSTMNetwork:
    """
    Factory function to create an LSTM model with the specified configuration.

    Args:
        config: Dictionary containing the model parameters:
            - input_size: Size of the input features
            - hidden_size: Size of the LSTM hidden state
            - num_layers: Number of stacked LSTM layers
            - output_size: Size of the final output
            - dropout: Dropout probability (optional)
            - bidirectional: Whether to use a bidirectional LSTM (optional)
    """
    return LSTMNetwork(
        input_size=config['input_size'],
        hidden_size=config['hidden_size'],
        num_layers=config['num_layers'],
        output_size=config['output_size'],
        dropout=config.get('dropout', 0.0),
        bidirectional=config.get('bidirectional', False)
    )

# Example usage
if __name__ == "__main__":
    # Configuration for a bidirectional LSTM
    config = {
        'input_size': 3,
        'hidden_size': 64,
        'num_layers': 2,
        'output_size': 1,
        'dropout': 0.3,
        'bidirectional': True  # Enable bidirectional processing
    }

    # Create the model
    model = create_lstm_model(config)

    # Generate dummy input
    batch_size, seq_length = 32, 10
    x = torch.randn(batch_size, seq_length, config['input_size'])

    # Forward pass
    output = model(x)
    print(f"Input shape: {x.shape}")
    print(f"Output shape: {output.shape}")
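
    # Minimal single-cell sketch: run one timestep through a standalone LSTMCell
    # to illustrate the (h_t, c_t) interface described in its docstring. The
    # names cell, h0, c0, h1, c1 are illustrative only.
    cell = LSTMCell(input_size=config['input_size'], hidden_size=config['hidden_size'])
    h0, c0 = cell.init_hidden(batch_size, x.device)
    h1, c1 = cell(x[:, 0, :], (h0, c0))
    print(f"LSTMCell step: h_t shape {h1.shape}, c_t shape {c1.shape}")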