Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 128 additions & 1 deletion src/seu_injection/core/base_injector.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,14 @@
"""

from abc import ABC, abstractmethod
from collections.abc import Callable
from collections.abc import Callable, Generator
from typing import Any, Union

import numpy as np
import torch

from ..bitops import bitflip_float32_optimized


class BaseInjector(ABC):
"""Abstract base class for SEU fault injection in PyTorch models.
Expand Down Expand Up @@ -176,3 +178,128 @@ def _get_criterion_score(self) -> float:
return float(self.criterion(self.model, self.data_loader, device=self.device))
else:
return float(self.criterion(self.model, self.X, self.y, device=self.device))

def _iterate_layers(self, layer_name: Union[str, None]) -> Generator[tuple[str, torch.nn.Parameter], None, None]:
"""Iterate through model layers, optionally filtering by name.

Args:
layer_name: Name of specific layer to target (None for all layers).

Yields:
tuple: (layer_name, parameter_tensor) pairs.

"""
for current_layer_name, tensor in self.model.named_parameters():
# Skip layer if specific layer requested and this isn't it
if layer_name and layer_name != current_layer_name:
continue
yield current_layer_name, tensor

def _prepare_tensor_for_injection(self, tensor: torch.nn.Parameter) -> tuple[torch.Tensor, np.ndarray]:
"""Prepare a tensor for injection by cloning and converting to numpy.

Args:
tensor: The parameter tensor to prepare.

Returns:
tuple: (original_tensor, tensor_cpu) where original_tensor is a clone
and tensor_cpu is a numpy array on CPU.

"""
original_tensor = tensor.data.clone()
tensor_cpu = original_tensor.cpu().numpy()
return original_tensor, tensor_cpu

def _inject_and_evaluate(
self,
tensor: torch.nn.Parameter,
idx: tuple,
original_tensor: torch.Tensor,
original_val: float,
bit_i: int,
) -> tuple[float, float]:
"""Inject a fault at a specific location, evaluate, and restore.

Args:
tensor: The parameter tensor to inject into.
idx: The index location for injection.
original_tensor: The original tensor values for restoration.
original_val: The original value at the injection location.
bit_i: The bit position to flip (0-31).

Returns:
tuple: (criterion_score, seu_val) where criterion_score is the model
performance after injection and seu_val is the injected value.

"""
# Perform bitflip
seu_val = bitflip_float32_optimized(original_val, bit_i, inplace=False)

# Inject fault
tensor.data[idx] = torch.tensor(seu_val, device=self.device, dtype=tensor.dtype)

# Evaluate model
criterion_score = self._get_criterion_score()

# Restore original value
tensor.data[idx] = original_tensor[idx]

return criterion_score, seu_val

def _record_injection_result(
self,
results: dict[str, list[Any]],
idx: tuple,
criterion_score: float,
layer_name: str,
original_val: float,
seu_val: float,
) -> None:
"""Record the results of a single injection.

Args:
results: The results dictionary to update.
idx: The index location of the injection.
criterion_score: The model performance score after injection.
layer_name: The name of the layer that was injected.
original_val: The original parameter value.
seu_val: The value after injection.

"""
results["tensor_location"].append(idx)
results["criterion_score"].append(criterion_score)
results["layer_name"].append(layer_name)
results["value_before"].append(original_val)
results["value_after"].append(seu_val)

def _initialize_results(self) -> dict[str, list[Any]]:
"""Initialize the results dictionary structure.

Returns:
dict: Empty results dictionary with required keys.

"""
return {
"tensor_location": [],
"criterion_score": [],
"layer_name": [],
"value_before": [],
"value_after": [],
}

    @abstractmethod
    def _get_injection_indices(self, tensor_shape: tuple, **kwargs) -> np.ndarray:
        """Get the indices where injections should be performed.

        This method defines the injection strategy (exhaustive vs. stochastic).

        Args:
            tensor_shape: The shape of the tensor to inject into.
            **kwargs: Additional strategy-specific parameters (e.g. a sampling
                probability for a stochastic strategy); exhaustive strategies
                may ignore them.

        Returns:
            np.ndarray: Array of indices where injections should occur.
            Shape: (N, len(tensor_shape)) where N is the number of injections.
            Each row is one multi-dimensional index into the tensor.

        """
        ...
85 changes: 35 additions & 50 deletions src/seu_injection/core/exhaustive_seu_injector.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import torch
from tqdm import tqdm

from ..bitops import bitflip_float32_optimized
from .base_injector import BaseInjector


Expand All @@ -32,6 +31,21 @@ class ExhaustiveSEUInjector(BaseInjector):

"""

def _get_injection_indices(self, tensor_shape: tuple, **kwargs) -> np.ndarray:
"""Get all indices for exhaustive injection.

Args:
tensor_shape: Shape of the tensor to inject into.
**kwargs: Unused for exhaustive strategy.

Returns:
np.ndarray: All possible indices in the tensor.

"""
# Generate all indices exhaustively
all_indices = list(np.ndindex(tensor_shape))
return np.array(all_indices)

def _run_injector_impl(self, bit_i: int, layer_name: Union[str, None] = None, **kwargs) -> dict[str, list[Any]]:
"""Perform systematic SEU injection across model parameters.

Expand All @@ -54,66 +68,37 @@ def _run_injector_impl(self, bit_i: int, layer_name: Union[str, None] = None, **
- All injections are reversible; model is restored after each run.

"""
results: dict[str, list[Any]] = {
"tensor_location": [],
"criterion_score": [],
"layer_name": [],
"value_before": [],
"value_after": [],
}
results = self._initialize_results()

with torch.no_grad(): # Disable gradient tracking for efficiency
# Iterate through each layer of the neural network
for current_layer_name, tensor in self.model.named_parameters():
# Skip layer if specific layer requested and this isn't it
if layer_name and layer_name != current_layer_name:
continue

for current_layer_name, tensor in self._iterate_layers(layer_name):
print(f"Testing Layer: {current_layer_name}")

# TODO PERFORMANCE: Unnecessary CPU tensor conversion creates memory bottleneck
# PROBLEM: Converting GPU tensors to CPU numpy arrays for bit manipulation
# INEFFICIENCIES:
# - GPU→CPU memory transfer latency (can be 100s of μs per transfer)
# - CPU numpy processing instead of GPU-accelerated operations
# - Memory duplication (original tensor + CPU copy)
# BETTER APPROACH: Keep tensors on GPU, use torch tensor operations for bit manipulation
# IMPACT: Additional overhead on top of already slow bitflip operations

# Store original tensor values for restoration
original_tensor = tensor.data.clone()
tensor_cpu = original_tensor.cpu().numpy() # <-- MEMORY INEFFICIENCY

# ✅ PERFORMANCE CRITICAL FIXED: Replaced slow bitflip_float32() with optimized version
# IMPROVEMENT: Now uses bitflip_float32_optimized() in performance-critical injection loop
# NEW PERFORMANCE:
# - ResNet-18 (11M params): ~1-2 minutes per bit position (30x faster!)
# - ResNet-50 (25M params): ~3-5 minutes per bit position (20x faster!)
# - Each iteration: O(1) bit operation instead of O(32) string manipulation
# CALCULATIONS: 11M params × 3μs per bitflip = ~30 seconds of pure bit operations
# Add model evaluation overhead = 1-2 minutes total
# FUTURE: Could still vectorize entire tensor at once for even better performance

# Iterate through every parameter in the tensor
# Prepare tensor for injection
original_tensor, tensor_cpu = self._prepare_tensor_for_injection(tensor)

# Get indices for injection (exhaustive strategy)
injection_indices = self._get_injection_indices(tensor_cpu.shape, **kwargs)

# Perform injections for all indices
for idx in tqdm(
np.ndindex(tensor_cpu.shape),
injection_indices,
desc=f"Injecting into {current_layer_name}",
):
# Ensure idx is a tuple for consistent indexing
idx = tuple(idx) if not isinstance(idx, tuple) else idx

original_val = tensor_cpu[idx]
seu_val = bitflip_float32_optimized(
original_val, bit_i, inplace=False
) # <-- PERFORMANCE BOTTLENECK FIXED!

# Inject fault, evaluate, restore
tensor.data[idx] = torch.tensor(seu_val, device=self.device, dtype=tensor.dtype)
criterion_score = self._get_criterion_score()
tensor.data[idx] = original_tensor[idx] # Restore original value
# Inject fault, evaluate, and restore
criterion_score, seu_val = self._inject_and_evaluate(
tensor, idx, original_tensor, original_val, bit_i
)

# Record results
results["tensor_location"].append(idx)
results["criterion_score"].append(criterion_score)
results["layer_name"].append(current_layer_name)
results["value_before"].append(original_val)
results["value_after"].append(seu_val)
self._record_injection_result(
results, idx, criterion_score, current_layer_name, original_val, seu_val
)

return results
92 changes: 45 additions & 47 deletions src/seu_injection/core/stochastic_seu_injector.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import torch
from tqdm import tqdm

from ..bitops import bitflip_float32_optimized
from .base_injector import BaseInjector


Expand All @@ -32,6 +31,38 @@ class StochasticSEUInjector(BaseInjector):

"""

def _get_injection_indices(self, tensor_shape: tuple, **kwargs) -> np.ndarray:
"""Get stochastically selected indices for injection.

Args:
tensor_shape: Shape of the tensor to inject into.
**kwargs: Must include 'p' (probability) and optionally 'run_at_least_one_injection'.

Returns:
np.ndarray: Randomly selected indices based on probability p.

Raises:
ValueError: If p is not in [0.0, 1.0].

"""
p = kwargs.get("p", 0.0)
run_at_least_one_injection = kwargs.get("run_at_least_one_injection", True)

if not (0.0 <= p <= 1.0):
raise ValueError(f"Probability p must be in [0, 1], got {p}")

# Build a boolean mask for stochastic selection
injection_mask = np.random.random(tensor_shape) < p

# Check if at least one injection will occur
if run_at_least_one_injection and not injection_mask.any() and np.prod(tensor_shape) > 0:
# If no injections selected and we need at least one, pick one randomly
random_idx = tuple(np.random.randint(0, dim) for dim in tensor_shape)
injection_mask[random_idx] = True

# Get indices where injections should occur
return np.argwhere(injection_mask)

def _run_injector_impl(self, bit_i: int, layer_name: Union[str, None] = None, **kwargs) -> dict[str, list[Any]]:
"""Randomly inject faults into model parameters using probability p.

Expand All @@ -54,48 +85,18 @@ def _run_injector_impl(self, bit_i: int, layer_name: Union[str, None] = None, **
- All injections are reversible; model is restored after each run.

"""
p = kwargs.get("p", 0.0)
run_at_least_one_injection = kwargs.get("run_at_least_one_injection", True)
if not (0.0 <= p <= 1.0):
raise ValueError(f"Probability p must be in [0, 1], got {p}")

results: dict[str, list[Any]] = {
"tensor_location": [],
"criterion_score": [],
"layer_name": [],
"value_before": [],
"value_after": [],
}
results = self._initialize_results()

with torch.no_grad(): # Disable gradient tracking for efficiency
# Iterate through each layer of the neural network
for current_layer_name, tensor in self.model.named_parameters():
# Skip layer if specific layer requested and this isn't it
if layer_name and layer_name != current_layer_name:
continue

for current_layer_name, tensor in self._iterate_layers(layer_name):
print(f"Testing Layer: {current_layer_name}")

# Store original tensor values for restoration
original_tensor = tensor.data.clone()
tensor_cpu = original_tensor.cpu().numpy()

# ✅ PERFORMANCE: Now uses optimized bitflip function (major improvement)
# IMPROVEMENT: Stochastic sampling now uses bitflip_float32_optimized()
# PERFORMANCE GAIN: ~30x faster per operation (100μs → 3μs per bitflip)
# NEW: Mask-based approach for better performance and cleaner logic

# Build a boolean mask for stochastic selection
injection_mask = np.random.random(tensor_cpu.shape) < p

# Check if at least one injection will occur
if run_at_least_one_injection and not injection_mask.any() and tensor_cpu.size > 0:
# If no injections selected and we need at least one, pick one randomly
random_idx = tuple(np.random.randint(0, dim) for dim in tensor_cpu.shape)
injection_mask[random_idx] = True
# Prepare tensor for injection
original_tensor, tensor_cpu = self._prepare_tensor_for_injection(tensor)

# Get indices where injections should occur
injection_indices = np.argwhere(injection_mask)
# Get indices for injection (stochastic strategy)
injection_indices = self._get_injection_indices(tensor_cpu.shape, **kwargs)

# Perform injections for selected indices
for idx_array in tqdm(
Expand All @@ -104,18 +105,15 @@ def _run_injector_impl(self, bit_i: int, layer_name: Union[str, None] = None, **
):
idx = tuple(idx_array)
original_val = tensor_cpu[idx]
seu_val = bitflip_float32_optimized(original_val, bit_i, inplace=False)

# Inject fault, evaluate, restore
tensor.data[idx] = torch.tensor(seu_val, device=self.device, dtype=tensor.dtype)
criterion_score = self._get_criterion_score()
tensor.data[idx] = original_tensor[idx] # Restore original value
# Inject fault, evaluate, and restore
criterion_score, seu_val = self._inject_and_evaluate(
tensor, idx, original_tensor, original_val, bit_i
)

# Record results
results["tensor_location"].append(idx)
results["criterion_score"].append(criterion_score)
results["layer_name"].append(current_layer_name)
results["value_before"].append(original_val)
results["value_after"].append(seu_val)
self._record_injection_result(
results, idx, criterion_score, current_layer_name, original_val, seu_val
)

return results
Loading
Loading