diff --git a/.github/workflows/Benchmarks.yml b/.github/workflows/Benchmarks.yml index 45b782025..ad95313db 100644 --- a/.github/workflows/Benchmarks.yml +++ b/.github/workflows/Benchmarks.yml @@ -28,7 +28,7 @@ jobs: - uses: actions/checkout@v5 - - uses: hyperlight-dev/ci-setup-workflow@v1.6.0 + - uses: hyperlight-dev/ci-setup-workflow@v1.8.0 with: rust-toolchain: "1.86" env: diff --git a/.github/workflows/CargoPublish.yml b/.github/workflows/CargoPublish.yml index a9e77f08e..0c4ad8304 100644 --- a/.github/workflows/CargoPublish.yml +++ b/.github/workflows/CargoPublish.yml @@ -32,7 +32,7 @@ jobs: fetch-depth: 0 fetch-tags: true - - uses: hyperlight-dev/ci-setup-workflow@v1.6.0 + - uses: hyperlight-dev/ci-setup-workflow@v1.8.0 with: rust-toolchain: "1.86" diff --git a/.github/workflows/CreateRelease.yml b/.github/workflows/CreateRelease.yml index 2adc5069d..07e8e6da2 100644 --- a/.github/workflows/CreateRelease.yml +++ b/.github/workflows/CreateRelease.yml @@ -31,7 +31,7 @@ jobs: steps: - uses: actions/checkout@v5 - - uses: hyperlight-dev/ci-setup-workflow@v1.6.0 + - uses: hyperlight-dev/ci-setup-workflow@v1.8.0 with: rust-toolchain: "1.86" env: @@ -52,7 +52,7 @@ jobs: steps: - uses: actions/checkout@v5 - - uses: hyperlight-dev/ci-setup-workflow@v1.6.0 + - uses: hyperlight-dev/ci-setup-workflow@v1.8.0 with: rust-toolchain: "1.86" env: @@ -112,7 +112,7 @@ jobs: fetch-depth: 0 fetch-tags: true - - uses: hyperlight-dev/ci-setup-workflow@v1.6.0 + - uses: hyperlight-dev/ci-setup-workflow@v1.8.0 with: rust-toolchain: "1.86" env: diff --git a/.github/workflows/copilot-setup-steps.yml b/.github/workflows/copilot-setup-steps.yml index 6abf73e74..e50e7792c 100644 --- a/.github/workflows/copilot-setup-steps.yml +++ b/.github/workflows/copilot-setup-steps.yml @@ -26,7 +26,7 @@ jobs: with: components: rustfmt - - uses: hyperlight-dev/ci-setup-workflow@v1.6.0 + - uses: hyperlight-dev/ci-setup-workflow@v1.8.0 with: rust-toolchain: "1.86" env: diff --git a/.github/workflows/dep_build_guest_binaries.yml b/.github/workflows/dep_build_guest_binaries.yml index 89ebf4dca..9af15774e 100644 --- a/.github/workflows/dep_build_guest_binaries.yml +++ b/.github/workflows/dep_build_guest_binaries.yml @@ -31,7 +31,7 @@ jobs: steps: - uses: actions/checkout@v5 - - uses: hyperlight-dev/ci-setup-workflow@v1.6.0 + - uses: hyperlight-dev/ci-setup-workflow@v1.8.0 with: rust-toolchain: "1.86" env: diff --git a/.github/workflows/dep_fuzzing.yml b/.github/workflows/dep_fuzzing.yml index 95008efd4..4386cc58a 100644 --- a/.github/workflows/dep_fuzzing.yml +++ b/.github/workflows/dep_fuzzing.yml @@ -32,7 +32,7 @@ jobs: - name: Checkout code uses: actions/checkout@v5 - - uses: hyperlight-dev/ci-setup-workflow@v1.6.0 + - uses: hyperlight-dev/ci-setup-workflow@v1.8.0 with: rust-toolchain: "1.86" env: diff --git a/.github/workflows/dep_rust.yml b/.github/workflows/dep_rust.yml index b93042c50..25777ca43 100644 --- a/.github/workflows/dep_rust.yml +++ b/.github/workflows/dep_rust.yml @@ -46,7 +46,7 @@ jobs: steps: - uses: actions/checkout@v5 - - uses: hyperlight-dev/ci-setup-workflow@v1.6.0 + - uses: hyperlight-dev/ci-setup-workflow@v1.8.0 with: rust-toolchain: "1.86" env: @@ -98,7 +98,7 @@ jobs: with: components: rustfmt - - uses: hyperlight-dev/ci-setup-workflow@v1.6.0 + - uses: hyperlight-dev/ci-setup-workflow@v1.8.0 with: rust-toolchain: "1.86" env: diff --git a/CHANGELOG.md b/CHANGELOG.md index a774369a4..24def0858 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,16 @@ The format is based on [Keep a 
Changelog](https://keepachangelog.com/en/1.1.0/). ## [Prerelease] - Unreleased +## [v0.9.1] - 2025-10-29 + +### Fixed + +- Fix race condition when killing sandboxes by @simongdavies in https://github.com/hyperlight-dev/hyperlight/pull/959 and https://github.com/hyperlight-dev/hyperlight/pull/994 + +### Changed + +- Added poison sandbox detection by @ludfjig in https://github.com/hyperlight-dev/hyperlight/pull/931 + ## [v0.9.0] - 2025-08-28 ### Fixed @@ -188,7 +198,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). The Initial Hyperlight Release šŸŽ‰ -[Prerelease]: +[Prerelease]: +[v0.9.1]: [v0.9.0]: [v0.8.0]: [v0.7.0]: diff --git a/Cargo.lock b/Cargo.lock index 9770b25e7..7049437ba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1344,7 +1344,7 @@ dependencies = [ [[package]] name = "hyperlight-common" -version = "0.9.0" +version = "0.9.1" dependencies = [ "anyhow", "arbitrary", @@ -1357,7 +1357,7 @@ dependencies = [ [[package]] name = "hyperlight-component-macro" -version = "0.9.0" +version = "0.9.1" dependencies = [ "env_logger", "hyperlight-component-util", @@ -1371,7 +1371,7 @@ dependencies = [ [[package]] name = "hyperlight-component-util" -version = "0.9.0" +version = "0.9.1" dependencies = [ "itertools 0.14.0", "log", @@ -1393,7 +1393,7 @@ dependencies = [ [[package]] name = "hyperlight-guest" -version = "0.9.0" +version = "0.9.1" dependencies = [ "anyhow", "flatbuffers", @@ -1404,7 +1404,7 @@ dependencies = [ [[package]] name = "hyperlight-guest-bin" -version = "0.9.0" +version = "0.9.1" dependencies = [ "buddy_system_allocator", "cc", @@ -1419,7 +1419,7 @@ dependencies = [ [[package]] name = "hyperlight-guest-tracing" -version = "0.9.0" +version = "0.9.1" dependencies = [ "hyperlight-common", "hyperlight-guest-tracing-macro", @@ -1428,7 +1428,7 @@ dependencies = [ [[package]] name = "hyperlight-guest-tracing-macro" -version = "0.9.0" +version = "0.9.1" dependencies = [ "proc-macro2", "quote", @@ -1437,7 +1437,7 @@ dependencies = [ [[package]] name = "hyperlight-host" -version = "0.9.0" +version = "0.9.1" dependencies = [ "anyhow", "bitflags 2.9.3", @@ -1527,7 +1527,7 @@ dependencies = [ [[package]] name = "hyperlight_guest_capi" -version = "0.9.0" +version = "0.9.1" dependencies = [ "cbindgen", "hyperlight-common", diff --git a/Cargo.toml b/Cargo.toml index d5f9a4092..c714bc62d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,7 +28,7 @@ exclude = [ ] [workspace.package] -version = "0.9.0" +version = "0.9.1" edition = "2024" rust-version = "1.86" license = "Apache-2.0" diff --git a/src/hyperlight_host/src/error.rs b/src/hyperlight_host/src/error.rs index d2fb382a0..1192737e4 100644 --- a/src/hyperlight_host/src/error.rs +++ b/src/hyperlight_host/src/error.rs @@ -196,6 +196,27 @@ pub enum HyperlightError { #[error("Failure processing PE File {0:?}")] PEFileProcessingFailure(#[from] goblin::error::Error), + /// The sandbox becomes **poisoned** when the guest is not run to completion, leaving it in + /// an inconsistent state that could compromise memory safety, data integrity, or security. + /// + /// ### When Does Poisoning Occur? 
+ /// + /// Poisoning happens when guest execution is interrupted before normal completion: + /// + /// - **Guest panics or aborts** - When a guest function panics, crashes, or calls `abort()`, + /// the normal cleanup and unwinding process is interrupted + /// - **Invalid memory access** - Attempts to read/write/execute memory outside allowed regions + /// - **Stack overflow** - Guest exhausts its stack space during execution + /// - **Heap exhaustion** - Guest runs out of heap memory + /// - **Host-initiated cancellation** - Calling [`InterruptHandle::kill()`] to forcefully + /// terminate an in-progress guest function + /// + /// ### Recovery + /// + /// Use [`crate::MultiUseSandbox::restore()`] to recover from a poisoned sandbox. + #[error("The sandbox was poisoned")] + PoisonedSandbox, + /// Raw pointer is less than base address #[error("Raw pointer ({0:?}) was less than the base address ({1})")] RawPointerLessThanBaseAddress(RawPtr, u64), @@ -301,6 +322,96 @@ impl From>> for HyperlightError { } } +impl HyperlightError { + /// Internal helper to determine whether the given error has the potential to poison the sandbox. + /// + /// Errors that poison the sandbox are those that can leave the sandbox in an inconsistent + /// state where memory, resources, or data structures may be corrupted or leaked, usually + /// because the guest did not run to completion. + /// + /// If this method returns `true`, the sandbox will be poisoned and all further operations + /// will fail until the sandbox is restored from a non-poisoned snapshot using + /// [`crate::MultiUseSandbox::restore()`]. + pub(crate) fn is_poison_error(&self) -> bool { + // A wildcard `_` arm (or `matches!`) is deliberately not used here, so that new error + // variants must be explicitly considered for poisoning behavior. + match self { + // These errors poison the sandbox because they can leave it in an inconsistent state due + // to the guest not running to completion. + HyperlightError::GuestAborted(_, _) + | HyperlightError::ExecutionCanceledByHost() + | HyperlightError::PoisonedSandbox + | HyperlightError::ExecutionAccessViolation(_) + | HyperlightError::StackOverflow() + | HyperlightError::MemoryAccessViolation(_, _, _) => true, + #[cfg(all(feature = "seccomp", target_os = "linux"))] + HyperlightError::DisallowedSyscall => true, + + // All other errors do not poison the sandbox.
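The exhaustive match in `is_poison_error` is a deliberate Rust pattern: because there is no wildcard `_` arm, adding a new `HyperlightError` variant makes this function fail to compile until the new variant is explicitly classified; the remaining, non-poisoning variants are then listed exhaustively below. A minimal, self-contained sketch of the same technique; the `FaultKind` enum and its variants are invented for illustration and are not Hyperlight types:

```rust
/// Hypothetical error type, invented purely to illustrate the pattern.
#[derive(Debug)]
enum FaultKind {
    GuestAborted,
    CancelledByHost,
    HostFunctionNotFound,
}

/// No `_` arm: adding a variant to `FaultKind` is a compile error here
/// until someone decides whether the new variant poisons the sandbox.
fn poisons_sandbox(fault: &FaultKind) -> bool {
    match fault {
        // The guest did not run to completion; its state may be inconsistent.
        FaultKind::GuestAborted | FaultKind::CancelledByHost => true,
        // The call failed before any guest state was touched.
        FaultKind::HostFunctionNotFound => false,
    }
}

fn main() {
    assert!(poisons_sandbox(&FaultKind::GuestAborted));
    assert!(!poisons_sandbox(&FaultKind::HostFunctionNotFound));
}
```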
+ HyperlightError::AnyhowError(_) + | HyperlightError::BoundsCheckFailed(_, _) + | HyperlightError::CheckedAddOverflow(_, _) + | HyperlightError::CStringConversionError(_) + | HyperlightError::Error(_) + | HyperlightError::FailedToGetValueFromParameter() + | HyperlightError::FieldIsMissingInGuestLogData(_) + | HyperlightError::GuestError(_, _) + | HyperlightError::GuestExecutionHungOnHostFunctionCall() + | HyperlightError::GuestFunctionCallAlreadyInProgress() + | HyperlightError::GuestInterfaceUnsupportedType(_) + | HyperlightError::GuestOffsetIsInvalid(_) + | HyperlightError::HostFunctionNotFound(_) + | HyperlightError::IOError(_) + | HyperlightError::IntConversionFailure(_) + | HyperlightError::InvalidFlatBuffer(_) + | HyperlightError::JsonConversionFailure(_) + | HyperlightError::LockAttemptFailed(_) + | HyperlightError::MemoryAllocationFailed(_) + | HyperlightError::MemoryProtectionFailed(_) + | HyperlightError::MemoryRequestTooBig(_, _) + | HyperlightError::MetricNotFound(_) + | HyperlightError::MmapFailed(_) + | HyperlightError::MprotectFailed(_) + | HyperlightError::NoHypervisorFound() + | HyperlightError::NoMemorySnapshot + | HyperlightError::ParameterValueConversionFailure(_, _) + | HyperlightError::PEFileProcessingFailure(_) + | HyperlightError::RawPointerLessThanBaseAddress(_, _) + | HyperlightError::RefCellBorrowFailed(_) + | HyperlightError::RefCellMutBorrowFailed(_) + | HyperlightError::ReturnValueConversionFailure(_, _) + | HyperlightError::SnapshotSandboxMismatch + | HyperlightError::SystemTimeError(_) + | HyperlightError::TryFromSliceError(_) + | HyperlightError::UnexpectedNoOfArguments(_, _) + | HyperlightError::UnexpectedParameterValueType(_, _) + | HyperlightError::UnexpectedReturnValueType(_, _) + | HyperlightError::UTF8StringConversionFailure(_) + | HyperlightError::VectorCapacityIncorrect(_, _, _) => false, + + #[cfg(target_os = "windows")] + HyperlightError::CrossBeamReceiveError(_) => false, + #[cfg(target_os = "windows")] + HyperlightError::CrossBeamSendError(_) => false, + #[cfg(target_os = "windows")] + HyperlightError::WindowsAPIError(_) => false, + #[cfg(target_os = "linux")] + HyperlightError::VmmSysError(_) => false, + #[cfg(kvm)] + HyperlightError::KVMError(_) => false, + #[cfg(mshv)] + HyperlightError::MSHVError(_) => false, + #[cfg(gdb)] + HyperlightError::TranslateGuestAddress(_) => false, + #[cfg(all(feature = "seccomp", target_os = "linux"))] + HyperlightError::SeccompFilterError(_) => false, + + #[cfg(all(feature = "seccomp", target_os = "linux"))] + HyperlightError::SeccompFilterBackendError(_) => false, + } + } +} + /// Creates a `HyperlightError::Error` from a string literal or format string #[macro_export] macro_rules! 
new_error { diff --git a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs index 1907d8e0b..075a53385 100644 --- a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs +++ b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs @@ -70,7 +70,7 @@ use super::{ CR0_AM, CR0_ET, CR0_MP, CR0_NE, CR0_PE, CR0_PG, CR0_WP, CR4_OSFXSR, CR4_OSXMMEXCPT, CR4_PAE, EFER_LMA, EFER_LME, EFER_NX, EFER_SCE, }; -use super::{HyperlightExit, Hypervisor, InterruptHandle, LinuxInterruptHandle, VirtualCPU}; +use super::{HyperlightExit, Hypervisor, LinuxInterruptHandle, VirtualCPU}; #[cfg(gdb)] use crate::HyperlightError; use crate::hypervisor::get_memory_access_violation; @@ -419,7 +419,8 @@ impl HypervLinuxDriver { let interrupt_handle = Arc::new(LinuxInterruptHandle { running: AtomicU64::new(0), - cancel_requested: AtomicBool::new(false), + cancel_requested: AtomicU64::new(0), + call_active: AtomicBool::new(false), #[cfg(gdb)] debug_interrupt: AtomicBool::new(false), #[cfg(all( @@ -761,17 +762,18 @@ impl Hypervisor for HypervLinuxDriver { self.interrupt_handle .tid - .store(unsafe { libc::pthread_self() as u64 }, Ordering::Relaxed); - // Note: if a `InterruptHandle::kill()` called while this thread is **here** - // Then this is fine since `cancel_requested` is set to true, so we will skip the `VcpuFd::run()` call - self.interrupt_handle - .set_running_and_increment_generation() - .map_err(|e| { - new_error!( - "Error setting running state and incrementing generation: {}", - e - ) - })?; + .store(unsafe { libc::pthread_self() as u64 }, Ordering::Release); + // Cast to the internal trait for access to internal methods + let interrupt_handle_internal = + self.interrupt_handle.as_ref() as &dyn super::InterruptHandleInternal; + + // Note: if `InterruptHandle::kill()` is called while this thread is **here** + // (after set_running_bit but before checking cancel_requested): + // - kill() will stamp cancel_requested with the current generation + // - We will check cancel_requested below and skip the VcpuFd::run() call + // - This is the desired behavior - the kill takes effect immediately + let generation = interrupt_handle_internal.set_running_bit(); + #[cfg(not(gdb))] let debug_interrupt = false; #[cfg(gdb)] @@ -780,14 +782,15 @@ let debug_interrupt = self .interrupt_handle .debug_interrupt .load(Ordering::Relaxed); - // Don't run the vcpu if `cancel_requested` is true + // Don't run the vcpu if `cancel_requested` is set for our generation // - // Note: if a `InterruptHandle::kill()` called while this thread is **here** - // Then this is fine since `cancel_requested` is set to true, so we will skip the `VcpuFd::run()` call - let exit_reason = if self - .interrupt_handle - .cancel_requested - .load(Ordering::Relaxed) + // Note: if `InterruptHandle::kill()` is called while this thread is **here** + // (after checking cancel_requested but before vcpu.run()): + // - kill() will stamp cancel_requested with the current generation + // - We will proceed with vcpu.run(), but signals will be sent to interrupt it + // - The vcpu will be interrupted and return EINTR (handled below) + let exit_reason = if interrupt_handle_internal + .is_cancel_requested_for_generation(generation) || debug_interrupt { Err(mshv_ioctls::MshvError::from(libc::EINTR)) } else { @@ -813,27 +816,31 @@ #[cfg(mshv3)] self.vcpu_fd.run() }; - // Note: if a `InterruptHandle::kill()` called while this thread is **here** - // Then signals will be sent to this thread until `running` is set to
false. - // This is fine since the signal handler is a no-op. - let cancel_requested = self - .interrupt_handle - .cancel_requested - .load(Ordering::Relaxed); + // Note: if `InterruptHandle::kill()` is called while this thread is **here** + // (after vcpu.run() returns but before clear_running_bit): + // - kill() continues sending signals to this thread (running bit is still set) + // - The signals are harmless (no-op handler), we just need to check cancel_requested + // - We load cancel_requested below to determine if this run was cancelled + let cancel_requested = + interrupt_handle_internal.is_cancel_requested_for_generation(generation); #[cfg(gdb)] let debug_interrupt = self .interrupt_handle .debug_interrupt .load(Ordering::Relaxed); - // Note: if a `InterruptHandle::kill()` called while this thread is **here** - // Then `cancel_requested` will be set to true again, which will cancel the **next vcpu run**. - // Additionally signals will be sent to this thread until `running` is set to false. - // This is fine since the signal handler is a no-op. - self.interrupt_handle.clear_running_bit(); - // At this point, `running` is false so no more signals will be sent to this thread, - // but we may still receive async signals that were sent before this point. - // To prevent those signals from interrupting subsequent calls to `run()`, - // we make sure to check `cancel_requested` before cancelling (see `libc::EINTR` match-arm below). + // Note: if `InterruptHandle::kill()` is called while this thread is **here** + // (after loading cancel_requested but before clear_running_bit): + // - kill() stamps cancel_requested with the CURRENT generation (not the one we just loaded) + // - kill() continues sending signals until running bit is cleared + // - The newly stamped cancel_requested will affect the NEXT vcpu.run() call + // - Signals sent now are harmless (no-op handler) + interrupt_handle_internal.clear_running_bit(); + // At this point, running bit is clear so kill() will stop sending signals. + // However, we may still receive delayed signals that were sent before clear_running_bit. + // These stale signals are harmless because: + // - The signal handler is a no-op + // - We check generation matching in cancel_requested before treating EINTR as cancellation + // - If generation doesn't match, we return Retry instead of Cancelled let result = match exit_reason { Ok(m) => match m.header.message_type { HALT_MESSAGE => { @@ -913,14 +920,16 @@ impl Hypervisor for HypervLinuxDriver { } }, Err(e) => match e.errno() { - // we send a signal to the thread to cancel execution this results in EINTR being returned by KVM so we return Cancelled + // We send a signal (SIGRTMIN+offset) to interrupt the vcpu, which causes EINTR libc::EINTR => { - // If cancellation was not requested for this specific vm, the vcpu was interrupted because of debug interrupt or - // a stale signal that meant to be delivered to a previous/other vcpu on this same thread, so let's ignore it + // Check if cancellation was requested for THIS specific generation. + // If not, the EINTR came from: + // - A debug interrupt (if GDB is enabled) + // - A stale signal from a previous guest call (generation mismatch) + // - A signal meant for a different sandbox on the same thread + // In these cases, we return Retry to continue execution. 
if cancel_requested { - self.interrupt_handle - .cancel_requested - .store(false, Ordering::Relaxed); + interrupt_handle_internal.clear_cancel_requested(); HyperlightExit::Cancelled() } else { #[cfg(gdb)] @@ -956,7 +965,7 @@ self as &mut dyn Hypervisor } - fn interrupt_handle(&self) -> Arc<dyn InterruptHandle> { + fn interrupt_handle(&self) -> Arc<dyn InterruptHandleInternal> { self.interrupt_handle.clone() } diff --git a/src/hyperlight_host/src/hypervisor/hyperv_windows.rs b/src/hyperlight_host/src/hypervisor/hyperv_windows.rs index f8b4727d1..05e7b65e7 100644 --- a/src/hyperlight_host/src/hypervisor/hyperv_windows.rs +++ b/src/hyperlight_host/src/hypervisor/hyperv_windows.rs @@ -17,7 +17,7 @@ limitations under the License. */ use std::fmt; use std::fmt::{Debug, Formatter}; use std::string::String; -use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use log::LevelFilter; @@ -55,8 +55,8 @@ use super::{ }; use super::{HyperlightExit, Hypervisor, InterruptHandle, VirtualCPU}; use crate::hypervisor::fpu::FP_CONTROL_WORD_DEFAULT; -use crate::hypervisor::get_memory_access_violation; use crate::hypervisor::wrappers::WHvGeneralRegisters; +use crate::hypervisor::{InterruptHandleInternal, get_memory_access_violation}; use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags}; use crate::mem::ptr::{GuestPtr, RawPtr}; use crate::mem::shared_mem::HostSharedMemory; @@ -344,10 +344,11 @@ impl HypervWindowsDriver { }; let interrupt_handle = Arc::new(WindowsInterruptHandle { - running: AtomicBool::new(false), - cancel_requested: AtomicBool::new(false), + running: AtomicU64::new(0), + cancel_requested: AtomicU64::new(0), #[cfg(gdb)] debug_interrupt: AtomicBool::new(false), + call_active: AtomicBool::new(false), partition_handle, dropped: AtomicBool::new(false), }); @@ -736,7 +737,12 @@ impl Hypervisor for HypervWindowsDriver { #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")] fn run(&mut self) -> Result<HyperlightExit> { - self.interrupt_handle.running.store(true, Ordering::Relaxed); + // Cast to internal trait for access to internal methods + let interrupt_handle_internal = + self.interrupt_handle.as_ref() as &dyn super::InterruptHandleInternal; + + // Get current generation and set running bit + let generation = interrupt_handle_internal.set_running_bit(); #[cfg(not(gdb))] let debug_interrupt = false; #[cfg(gdb)] @@ -746,11 +752,9 @@ let debug_interrupt = self .interrupt_handle .debug_interrupt .load(Ordering::Relaxed); - // Don't run the vcpu if `cancel_requested` is true - let exit_context = if self - .interrupt_handle - .cancel_requested - .load(Ordering::Relaxed) + // Check if cancellation was requested for THIS generation + let exit_context = if interrupt_handle_internal + .is_cancel_requested_for_generation(generation) || debug_interrupt { WHV_RUN_VP_EXIT_CONTEXT { @@ -770,12 +774,20 @@ } self.processor.run()?
}; - self.interrupt_handle - .cancel_requested - .store(false, Ordering::Relaxed); - self.interrupt_handle - .running - .store(false, Ordering::Relaxed); + + // Clear running bit + interrupt_handle_internal.clear_running_bit(); + + let is_canceled = exit_context.ExitReason == WHV_RUN_VP_EXIT_REASON(8193i32); // WHvRunVpExitReasonCanceled + + // Check if this was a manual cancellation (vs internal Windows cancellation) + let cancel_was_requested_manually = + interrupt_handle_internal.is_cancel_requested_for_generation(generation); + + // Only clear cancel_requested if we're actually processing a cancellation for this generation + if is_canceled && cancel_was_requested_manually { + interrupt_handle_internal.clear_cancel_requested(); + } #[cfg(gdb)] let debug_interrupt = self @@ -851,12 +863,32 @@ // return a special exit reason so that the gdb thread can handle it // and resume execution HyperlightExit::Debug(VcpuStopReason::Interrupt) + } else if !cancel_was_requested_manually { + // This was an internal cancellation + // The virtualization stack can use this function to return the control + // of a virtual processor back to the virtualization stack in case it + // needs to change the state of a VM or to inject an event into the processor + // see https://learn.microsoft.com/en-us/virtualization/api/hypervisor-platform/funcs/whvcancelrunvirtualprocessor#remarks + debug!("Internal cancellation detected, returning Retry error"); + HyperlightExit::Retry() } else { HyperlightExit::Cancelled() } #[cfg(not(gdb))] - HyperlightExit::Cancelled() + { + if !cancel_was_requested_manually { + // This was an internal cancellation + // The virtualization stack can use this function to return the control + // of a virtual processor back to the virtualization stack in case it + // needs to change the state of a VM or to inject an event into the processor + // see https://learn.microsoft.com/en-us/virtualization/api/hypervisor-platform/funcs/whvcancelrunvirtualprocessor#remarks + debug!("Internal cancellation detected, returning Retry error"); + HyperlightExit::Retry() + } else { + HyperlightExit::Cancelled() + } + } } #[cfg(gdb)] WHV_RUN_VP_EXIT_REASON(4098i32) => { @@ -885,7 +917,7 @@ Ok(result) } - fn interrupt_handle(&self) -> Arc<dyn InterruptHandle> { + fn interrupt_handle(&self) -> Arc<dyn InterruptHandleInternal> { self.interrupt_handle.clone() } @@ -1123,30 +1155,80 @@ impl Drop for HypervWindowsDriver { #[derive(Debug)] pub struct WindowsInterruptHandle { - // `WHvCancelRunVirtualProcessor()` will return Ok even if the vcpu is not running, which is the reason we need this flag. - running: AtomicBool, - cancel_requested: AtomicBool, + /// Combined running flag (bit 63) and generation counter (bits 0-62). + /// + /// The generation increments with each guest function call to prevent + /// stale cancellations from affecting new calls (ABA problem). + /// + /// Layout: `[running:1 bit][generation:63 bits]` + running: AtomicU64, + + /// Combined cancel_requested flag (bit 63) and generation counter (bits 0-62). + /// + /// When kill() is called, this stores the current generation along with + /// the cancellation flag. The VCPU only honors the cancellation if the + /// generation matches its current generation.
+ /// + /// Layout: `[cancel_requested:1 bit][generation:63 bits]` + cancel_requested: AtomicU64, + // This is used to signal the GDB thread to stop the vCPU #[cfg(gdb)] debug_interrupt: AtomicBool, + /// Flag indicating whether a guest function call is currently in progress. + /// + /// **true**: A guest function call is active (between call start and completion) + /// **false**: No guest function call is active + /// + /// # Purpose + /// + /// This flag prevents kill() from having any effect when called outside of a + /// guest function call. This solves the "kill-in-advance" problem where kill() + /// could be called before a guest function starts and would incorrectly cancel it. + call_active: AtomicBool, partition_handle: WHV_PARTITION_HANDLE, dropped: AtomicBool, } impl InterruptHandle for WindowsInterruptHandle { fn kill(&self) -> bool { - self.cancel_requested.store(true, Ordering::Relaxed); - self.running.load(Ordering::Relaxed) - && unsafe { WHvCancelRunVirtualProcessor(self.partition_handle, 0, 0).is_ok() } + // Check if a call is actually active first + if !self.call_active.load(Ordering::Acquire) { + return false; + } + + // Get the current running state and generation + let (running, generation) = self.get_running_and_generation(); + + // Set cancel_requested with the current generation + self.set_cancel_requested(generation); + + // Only call WHvCancelRunVirtualProcessor if VCPU is actually running in guest mode + running && unsafe { WHvCancelRunVirtualProcessor(self.partition_handle, 0, 0).is_ok() } } + #[cfg(gdb)] fn kill_from_debugger(&self) -> bool { self.debug_interrupt.store(true, Ordering::Relaxed); - self.running.load(Ordering::Relaxed) - && unsafe { WHvCancelRunVirtualProcessor(self.partition_handle, 0, 0).is_ok() } + let (running, _) = self.get_running_and_generation(); + running && unsafe { WHvCancelRunVirtualProcessor(self.partition_handle, 0, 0).is_ok() } } fn dropped(&self) -> bool { self.dropped.load(Ordering::Relaxed) } } + +impl InterruptHandleInternal for WindowsInterruptHandle { + fn get_call_active(&self) -> &AtomicBool { + &self.call_active + } + + fn get_running(&self) -> &AtomicU64 { + &self.running + } + + fn get_cancel_requested(&self) -> &AtomicU64 { + &self.cancel_requested + } +} diff --git a/src/hyperlight_host/src/hypervisor/kvm.rs b/src/hyperlight_host/src/hypervisor/kvm.rs index be76129fb..75b6841b8 100644 --- a/src/hyperlight_host/src/hypervisor/kvm.rs +++ b/src/hyperlight_host/src/hypervisor/kvm.rs @@ -37,7 +37,7 @@ use super::{ CR0_AM, CR0_ET, CR0_MP, CR0_NE, CR0_PE, CR0_PG, CR0_WP, CR4_OSFXSR, CR4_OSXMMEXCPT, CR4_PAE, EFER_LMA, EFER_LME, EFER_NX, EFER_SCE, }; -use super::{HyperlightExit, Hypervisor, InterruptHandle, LinuxInterruptHandle, VirtualCPU}; +use super::{HyperlightExit, Hypervisor, LinuxInterruptHandle, VirtualCPU}; #[cfg(gdb)] use crate::HyperlightError; use crate::hypervisor::get_memory_access_violation; @@ -356,7 +356,8 @@ impl KVMDriver { let interrupt_handle = Arc::new(LinuxInterruptHandle { running: AtomicU64::new(0), - cancel_requested: AtomicBool::new(false), + cancel_requested: AtomicU64::new(0), + call_active: AtomicBool::new(false), #[cfg(gdb)] debug_interrupt: AtomicBool::new(false), #[cfg(all( @@ -664,17 +665,18 @@ impl Hypervisor for KVMDriver { fn run(&mut self) -> Result<HyperlightExit> { self.interrupt_handle .tid - .store(unsafe { libc::pthread_self() as u64 }, Ordering::Relaxed); - // Note: if a `InterruptHandle::kill()` called while this thread is **here** - // Then this is fine since `cancel_requested` is set to true, so we
will skip the `VcpuFd::run()` call - self.interrupt_handle - .set_running_and_increment_generation() - .map_err(|e| { - new_error!( - "Error setting running state and incrementing generation: {}", - e - ) - })?; + .store(unsafe { libc::pthread_self() as u64 }, Ordering::Release); + // Cast to the internal trait for access to internal methods + let interrupt_handle_internal = + self.interrupt_handle.as_ref() as &dyn super::InterruptHandleInternal; + + // Note: if `InterruptHandle::kill()` is called while this thread is **here** + // (after set_running_bit but before checking cancel_requested): + // - kill() will stamp cancel_requested with the current generation + // - We will check cancel_requested below and skip the VcpuFd::run() call + // - This is the desired behavior - the kill takes effect immediately + let generation = interrupt_handle_internal.set_running_bit(); + #[cfg(not(gdb))] let debug_interrupt = false; #[cfg(gdb)] @@ -682,14 +684,15 @@ let debug_interrupt = self .interrupt_handle .debug_interrupt .load(Ordering::Relaxed); - // Don't run the vcpu if `cancel_requested` is true + // Don't run the vcpu if `cancel_requested` is set for our generation // - // Note: if a `InterruptHandle::kill()` called while this thread is **here** - // Then this is fine since `cancel_requested` is set to true, so we will skip the `VcpuFd::run()` call - let exit_reason = if self - .interrupt_handle - .cancel_requested - .load(Ordering::Relaxed) + // Note: if `InterruptHandle::kill()` is called while this thread is **here** + // (after checking cancel_requested but before vcpu.run()): + // - kill() will stamp cancel_requested with the current generation + // - We will proceed with vcpu.run(), but signals will be sent to interrupt it + // - The vcpu will be interrupted and return EINTR (handled below) + let exit_reason = if interrupt_handle_internal + .is_cancel_requested_for_generation(generation) || debug_interrupt { Err(kvm_ioctls::Error::new(libc::EINTR)) } else { @@ -703,34 +706,39 @@ Some(hyperlight_guest_tracing::invariant_tsc::read_tsc()); } - // Note: if a `InterruptHandle::kill()` called while this thread is **here** - // Then the vcpu will run, but we will keep sending signals to this thread - // to interrupt it until `running` is set to false. The `vcpu_fd::run()` call will - // return either normally with an exit reason, or from being "kicked" by out signal handler, with an EINTR error, - // both of which are fine. + // Note: if `InterruptHandle::kill()` is called while this thread is **here** + // (during vcpu.run() execution): + // - kill() stamps cancel_requested with the current generation + // - kill() sends signals (SIGRTMIN+offset) to this thread repeatedly + // - The signal handler is a no-op, but it causes vcpu.run() to return EINTR + // - We check cancel_requested below and return Cancelled if generation matches self.vcpu_fd.run() }; - // Note: if a `InterruptHandle::kill()` called while this thread is **here** - // Then signals will be sent to this thread until `running` is set to false. - // This is fine since the signal handler is a no-op.
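The "signals keep coming until the running bit clears" behavior referenced in these comments is implemented by a retry loop around `pthread_kill`, shown later in this diff as `send_signal`. A simplified, Linux-only sketch of that loop follows; the struct is a stand-in for `LinuxInterruptHandle`, not the crate's actual type:

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Duration;

const RUNNING_BIT: u64 = 1 << 63;

/// Simplified stand-in for LinuxInterruptHandle, for illustration only.
struct Handle {
    /// Bit 63 = vcpu running, bits 0-62 = generation.
    running: AtomicU64,
    /// pthread id of the vcpu thread.
    tid: AtomicU64,
    retry_delay: Duration,
    sig_rt_min_offset: u8,
}

impl Handle {
    /// Keep nudging the vcpu thread until it leaves guest mode. The signal
    /// handler is a no-op; its only purpose is to make the blocking
    /// vcpu-run ioctl return EINTR on the target thread.
    fn send_signal_until_stopped(&self) -> bool {
        let signal_number = libc::SIGRTMIN() + self.sig_rt_min_offset as libc::c_int;
        let mut sent = false;
        while self.running.load(Ordering::Acquire) & RUNNING_BIT != 0 {
            sent = true;
            unsafe {
                libc::pthread_kill(self.tid.load(Ordering::Acquire) as _, signal_number);
            }
            std::thread::sleep(self.retry_delay);
        }
        sent
    }
}
```

The real `send_signal` additionally stamps the generation into `cancel_requested` before signalling and bails out early when no guest call is active.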
- let cancel_requested = self - .interrupt_handle - .cancel_requested - .load(Ordering::Relaxed); + // Note: if `InterruptHandle::kill()` is called while this thread is **here** + // (after vcpu.run() returns but before clear_running_bit): + // - kill() continues sending signals to this thread (running bit is still set) + // - The signals are harmless (no-op handler), we just need to check cancel_requested + // - We load cancel_requested below to determine if this run was cancelled + let cancel_requested = + interrupt_handle_internal.is_cancel_requested_for_generation(generation); #[cfg(gdb)] let debug_interrupt = self .interrupt_handle .debug_interrupt .load(Ordering::Relaxed); - // Note: if a `InterruptHandle::kill()` called while this thread is **here** - // Then `cancel_requested` will be set to true again, which will cancel the **next vcpu run**. - // Additionally signals will be sent to this thread until `running` is set to false. - // This is fine since the signal handler is a no-op. - self.interrupt_handle.clear_running_bit(); - // At this point, `running` is false so no more signals will be sent to this thread, - // but we may still receive async signals that were sent before this point. - // To prevent those signals from interrupting subsequent calls to `run()` (on other vms!), - // we make sure to check `cancel_requested` before cancelling (see `libc::EINTR` match-arm below). + // Note: if `InterruptHandle::kill()` is called while this thread is **here** + // (after loading cancel_requested but before clear_running_bit): + // - kill() stamps cancel_requested with the CURRENT generation (not the one we just loaded) + // - kill() continues sending signals until running bit is cleared + // - The newly stamped cancel_requested will affect the NEXT vcpu.run() call + // - Signals sent now are harmless (no-op handler) + interrupt_handle_internal.clear_running_bit(); + // At this point, running bit is clear so kill() will stop sending signals. + // However, we may still receive delayed signals that were sent before clear_running_bit. + // These stale signals are harmless because: + // - The signal handler is a no-op + // - We check generation matching in cancel_requested before treating EINTR as cancellation + // - If generation doesn't match, we return Retry instead of Cancelled let result = match exit_reason { Ok(VcpuExit::Hlt) => { crate::debug!("KVM - Halt Details : {:#?}", &self); @@ -779,14 +787,16 @@ impl Hypervisor for KVMDriver { } }, Err(e) => match e.errno() { - // we send a signal to the thread to cancel execution this results in EINTR being returned by KVM so we return Cancelled + // We send a signal (SIGRTMIN+offset) to interrupt the vcpu, which causes EINTR libc::EINTR => { - // If cancellation was not requested for this specific vm, the vcpu was interrupted because of debug interrupt or - // a stale signal that meant to be delivered to a previous/other vcpu on this same thread, so let's ignore it + // Check if cancellation was requested for THIS specific generation. + // If not, the EINTR came from: + // - A debug interrupt (if GDB is enabled) + // - A stale signal from a previous guest call (generation mismatch) + // - A signal meant for a different sandbox on the same thread + // In these cases, we return Retry to continue execution. 
if cancel_requested { - self.interrupt_handle - .cancel_requested - .store(false, Ordering::Relaxed); + interrupt_handle_internal.clear_cancel_requested(); HyperlightExit::Cancelled() } else { #[cfg(gdb)] @@ -827,7 +837,7 @@ self as &mut dyn Hypervisor } - fn interrupt_handle(&self) -> Arc<dyn InterruptHandle> { + fn interrupt_handle(&self) -> Arc<dyn InterruptHandleInternal> { self.interrupt_handle.clone() } diff --git a/src/hyperlight_host/src/hypervisor/mod.rs b/src/hyperlight_host/src/hypervisor/mod.rs index 4e55e7ac5..7969bb44a 100644 --- a/src/hyperlight_host/src/hypervisor/mod.rs +++ b/src/hyperlight_host/src/hypervisor/mod.rs @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -use log::LevelFilter; +use log::{LevelFilter, debug}; use tracing::{Span, instrument}; use crate::HyperlightError::StackOverflow; @@ -61,7 +61,6 @@ pub(crate) mod crashdump; use std::fmt::Debug; use std::str::FromStr; -#[cfg(any(kvm, mshv))] use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; #[cfg(any(kvm, mshv))] @@ -112,7 +111,9 @@ pub enum HyperlightExit { Cancelled(), /// The vCPU has exited for a reason that is not handled by Hyperlight Unknown(String), - /// The operation should be retried, for example this can happen on Linux where a call to run the CPU can return EAGAIN + /// The operation should be retried. + /// On Linux this can happen when a call to run the CPU returns EAGAIN. + /// On Windows the platform can internally cancel the VM run. Retry(), } @@ -186,8 +187,8 @@ pub(crate) trait Hypervisor: Debug + Send { /// Run the vCPU fn run(&mut self) -> Result<HyperlightExit>; - /// Get InterruptHandle to underlying VM - fn interrupt_handle(&self) -> Arc<dyn InterruptHandle>; + /// Get InterruptHandle to underlying VM (returns internal trait) + fn interrupt_handle(&self) -> Arc<dyn InterruptHandleInternal>; /// Get the logging level to pass to the guest entrypoint fn get_max_log_level(&self) -> u32 { @@ -350,7 +351,10 @@ impl VirtualCPU { log_then_return!("Unexpected VM Exit {:?}", reason); } - Ok(HyperlightExit::Retry()) => continue, + Ok(HyperlightExit::Retry()) => { + debug!("[VCPU] Retry - continuing VM run loop"); + continue; + } Err(e) => { #[cfg(crashdump)] crashdump::generate_crashdump(hv)?; @@ -368,17 +372,54 @@ } } -/// A trait for handling interrupts to a sandbox's vcpu +/// A trait for handling interrupts to a sandbox's vcpu (public API) pub trait InterruptHandle: Debug + Send + Sync { /// Interrupt the corresponding sandbox from running. /// - /// - If this is called while the vcpu is running, then it will interrupt the vcpu and return `true`. - /// - If this is called while the vcpu is not running, (for example during a host call), the - /// vcpu will not immediately be interrupted, but will prevent the vcpu from running **the next time** - /// it's scheduled, and returns `false`. + /// This method attempts to cancel a currently executing guest function call by sending + /// a signal to the VCPU thread. It uses generation tracking and the call_active flag to + /// ensure the interruption is safe and precise. + /// + /// # Behavior + /// + /// - **Guest function running**: If called while a guest function is executing (VCPU running + /// or in a host function call), this stamps the current generation into cancel_requested + /// and sends a signal to interrupt the VCPU. Returns `true`. + /// + /// - **No active call**: If called when no guest function call is in progress (call_active=false), + /// this has no effect and returns `false`.
This prevents "kill-in-advance" where kill() + /// is called before a guest function starts. + /// + /// - **During host function**: If the guest call is currently executing a host function + /// (VCPU not running but call_active=true), this stamps cancel_requested. When the + /// host function returns and attempts to re-enter the guest, the cancellation will + /// be detected and the call will abort. Returns `true`. + /// + /// # Generation Tracking + /// + /// The method stamps the current generation number along with the cancellation request. + /// This ensures that: + /// - Stale signals from previous calls are ignored (generation mismatch) + /// - Only the intended guest function call is affected + /// - Multiple rapid kill() calls on the same generation are idempotent + /// + /// # Blocking Behavior + /// + /// This function will block while attempting to deliver the signal to the VCPU thread, + /// retrying until either: + /// - The signal is successfully delivered (VCPU transitions from running to not running) + /// - The VCPU stops running for another reason (e.g., call completes normally) + /// + /// # Returns + /// + /// - `true`: Cancellation request was stamped (kill will take effect) + /// - `false`: No active call, cancellation request was not stamped (no effect) /// /// # Note - /// This function will block for the duration of the time it takes for the vcpu thread to be interrupted. + /// + /// To reliably interrupt a guest call, ensure `kill()` is called while the guest + /// function is actually executing. Calling kill() before call_guest_function() will + /// have no effect. fn kill(&self) -> bool; /// Used by a debugger to interrupt the corresponding sandbox from running. @@ -393,94 +434,309 @@ pub trait InterruptHandle: Debug + Send + Sync { #[cfg(gdb)] fn kill_from_debugger(&self) -> bool; - /// Returns true if the corresponding sandbox has been dropped + /// Check if the corresponding VM has been dropped. fn dropped(&self) -> bool; } +/// Internal trait for interrupt handle implementation details (private, cross-platform). +/// +/// This trait contains all the internal atomics access methods and helper functions +/// that are shared between Linux and Windows implementations. It extends InterruptHandle +/// to inherit the public API. +/// +/// This trait should NOT be used outside of hypervisor implementations. +pub(crate) trait InterruptHandleInternal: InterruptHandle { + /// Returns the call_active atomic bool reference for internal implementations. + fn get_call_active(&self) -> &AtomicBool; + + /// Returns the running atomic u64 reference for internal implementations. + fn get_running(&self) -> &AtomicU64; + + /// Returns the cancel_requested atomic u64 reference for internal implementations. + fn get_cancel_requested(&self) -> &AtomicU64; + + /// Set call_active - increments generation and sets flag. + /// + /// Increments the generation counter and sets the call_active flag to true, + /// indicating that a guest function call is now in progress. This allows + /// kill() to stamp cancel_requested with the correct generation. + /// + /// Must be called at the start of call_guest_function_by_name_no_reset(), + /// before any VCPU execution begins. + /// + /// Returns true if call_active was already set (indicating a guard already exists), + /// false otherwise. + fn set_call_active(&self) -> bool { + self.increment_generation(); + self.get_call_active().swap(true, Ordering::AcqRel) + } + + /// Clear call_active - clears the call_active flag. 
+ /// + /// Clears the call_active flag, indicating that no guest function call is + /// in progress. After this, kill() will have no effect and will return false. + /// + /// Must be called at the end of call_guest_function_by_name_no_reset(), + /// after the guest call has fully completed (whether successfully or with error). + fn clear_call_active(&self) { + self.get_call_active().store(false, Ordering::Release) + } + + /// Set cancel_requested to true with the given generation. + /// + /// This stamps the cancellation request with the current generation number, + /// ensuring that only the VCPU running with this exact generation will honor + /// the cancellation. + fn set_cancel_requested(&self, generation: u64) { + const CANCEL_REQUESTED_BIT: u64 = 1 << 63; + const MAX_GENERATION: u64 = CANCEL_REQUESTED_BIT - 1; + let value = CANCEL_REQUESTED_BIT | (generation & MAX_GENERATION); + self.get_cancel_requested().store(value, Ordering::Release); + } + + /// Clear cancel_requested (reset to no cancellation). + /// + /// This is called after a cancellation has been processed to reset the + /// cancellation flag for the next guest call. + fn clear_cancel_requested(&self) { + self.get_cancel_requested().store(0, Ordering::Release); + } + + /// Check if cancel_requested is set for the given generation. + /// + /// Returns true only if BOTH: + /// - The cancellation flag is set + /// - The stored generation matches the provided generation + /// + /// This prevents stale cancellations from affecting new guest calls. + fn is_cancel_requested_for_generation(&self, generation: u64) -> bool { + const CANCEL_REQUESTED_BIT: u64 = 1 << 63; + const MAX_GENERATION: u64 = CANCEL_REQUESTED_BIT - 1; + let raw = self.get_cancel_requested().load(Ordering::Acquire); + let is_set = raw & CANCEL_REQUESTED_BIT != 0; + let stored_generation = raw & MAX_GENERATION; + is_set && stored_generation == generation + } + + /// Set running bit to true, return current generation. + /// + /// This is called when the VCPU is about to enter guest mode. It atomically + /// sets the running flag while preserving the generation counter. + fn set_running_bit(&self) -> u64 { + const RUNNING_BIT: u64 = 1 << 63; + self.get_running() + .fetch_update(Ordering::Release, Ordering::Acquire, |raw| { + Some(raw | RUNNING_BIT) + }) + .map(|raw| raw & !RUNNING_BIT) // Return the current generation + .unwrap_or(0) + } + + /// Increment the generation for a new guest function call. + /// + /// The generation counter wraps around at MAX_GENERATION (2^63 - 1). + /// This is called at the start of each new guest function call to provide + /// a unique identifier that prevents ABA problems with stale cancellations. + /// + /// Returns the NEW generation number (after incrementing). + fn increment_generation(&self) -> u64 { + const RUNNING_BIT: u64 = 1 << 63; + const MAX_GENERATION: u64 = RUNNING_BIT - 1; + self.get_running() + .fetch_update(Ordering::Release, Ordering::Acquire, |raw| { + let current_generation = raw & !RUNNING_BIT; + let running_bit = raw & RUNNING_BIT; + if current_generation == MAX_GENERATION { + // Restart generation from 0 + return Some(running_bit); + } + Some((current_generation + 1) | running_bit) + }) + .map(|raw| { + let previous_generation = raw & !RUNNING_BIT; + if previous_generation == MAX_GENERATION { + 0 // The counter wrapped, so the NEW generation is 0 + } else { + previous_generation + 1 // Return the NEW generation + } + }) + .unwrap_or(0) // Unreachable: the closure above always returns Some + } + + /// Get the current running state and generation counter.
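Before `get_running_and_generation`, whose doc comment continues below, it may help to see the bit arithmetic these helpers share as a worked example over plain `u64` values (the concrete generation numbers are invented for illustration):

```rust
fn main() {
    const RUNNING_BIT: u64 = 1 << 63;
    const MAX_GENERATION: u64 = RUNNING_BIT - 1;

    // A vcpu on generation 41 enters guest mode: set bit 63, keep bits 0-62.
    let running_raw = 41u64 | RUNNING_BIT;
    assert_eq!(running_raw & !RUNNING_BIT, 41); // generation is preserved
    assert_ne!(running_raw & RUNNING_BIT, 0); // running flag is set

    // kill() stamps cancel_requested with the same generation.
    // In that word, bit 63 means "cancel requested" rather than "running".
    let cancel_raw = RUNNING_BIT | (41 & MAX_GENERATION);
    assert_eq!(cancel_raw & MAX_GENERATION, 41);

    // A new guest call increments the generation to 42; the old stamp
    // no longer matches, so the stale kill is ignored.
    let next_generation = (running_raw & !RUNNING_BIT) + 1;
    assert_ne!(cancel_raw & MAX_GENERATION, next_generation);
}
```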
+ /// + /// Returns a tuple of (running, generation) where: + /// - running: true if VCPU is currently in guest mode + /// - generation: current generation counter value + fn get_running_and_generation(&self) -> (bool, u64) { + const RUNNING_BIT: u64 = 1 << 63; + let raw = self.get_running().load(Ordering::Acquire); + let running = raw & RUNNING_BIT != 0; + let generation = raw & !RUNNING_BIT; + (running, generation) + } + + /// Clear the running bit and return the old value. + /// + /// This is called when the VCPU exits from guest mode back to host mode. + /// The return value (which includes the generation and the old running bit) + /// is currently unused by all callers. + fn clear_running_bit(&self) -> u64 { + const RUNNING_BIT: u64 = 1 << 63; + self.get_running() + .fetch_and(!RUNNING_BIT, Ordering::Release) + } +} + #[cfg(any(kvm, mshv))] #[derive(Debug)] pub(super) struct LinuxInterruptHandle { - /// Invariant: vcpu is running => most significant bit (63) of `running` is set. (Neither converse nor inverse is true) + /// Atomic flag combining running state and generation counter. /// - /// Additionally, bit 0-62 tracks how many times the VCPU has been run. Incremented each time `run()` is called. + /// **Bit 63**: VCPU running state (1 = running, 0 = not running) + /// **Bits 0-62**: Generation counter (incremented once per guest function call) /// - /// This prevents an ABA problem where: - /// 1. The VCPU is running (generation N), - /// 2. It gets cancelled, - /// 3. Then quickly restarted (generation N+1), - /// before the original thread has observed that it was cancelled. + /// # Generation Tracking /// - /// Without this generation counter, the interrupt logic might assume the VCPU is still - /// in the *original* run (generation N), see that it's `running`, and re-send the signal. - /// But the new VCPU run (generation N+1) would treat this as a stale signal and ignore it, - /// potentially causing an infinite loop where no effective interrupt is delivered. + /// The generation counter is incremented once at the start of each guest function call + /// and remains constant throughout that call, even if the VCPU is run multiple times + /// (due to host function calls, retries, etc.). This design solves the race condition + /// where a kill() from a previous call could spuriously cancel a new call. /// - /// Invariant: If the VCPU is running, `run_generation[bit 0-62]` matches the current run's generation. + /// ## Why Generations Are Needed + /// + /// Consider this scenario WITHOUT generation tracking: + /// 1. Thread A starts guest call 1, VCPU runs + /// 2. Thread B calls kill(), sends signal to Thread A + /// 3. Guest call 1 completes before signal arrives + /// 4. Thread A starts guest call 2, VCPU runs again + /// 5. Stale signal from step 2 arrives and incorrectly cancels call 2 + /// + /// WITH generation tracking: + /// 1. Thread A starts guest call 1 (generation N), VCPU runs + /// 2. Thread B calls kill(), stamps cancel_requested with generation N + /// 3. Guest call 1 completes, signal may or may not have arrived yet + /// 4. Thread A starts guest call 2 (generation N+1), VCPU runs again + /// 5. If stale signal arrives, signal handler checks: cancel_requested.generation (N) != current generation (N+1) + /// 6. 
Stale signal is ignored, call 2 continues normally + /// + /// ## Per-Call vs Per-Run Generation + /// + /// It's critical that generation is incremented per GUEST FUNCTION CALL, not per vcpu.run(): + /// - A single guest function call may invoke vcpu.run() multiple times (host calls, retries) + /// - All run() calls within the same guest call must share the same generation + /// - This ensures kill() affects the entire guest function call atomically + /// + /// # Invariants + /// + /// - If VCPU is running: bit 63 is set (neither converse nor inverse holds) + /// - If VCPU is running: bits 0-62 match the current guest call's generation running: AtomicU64, - /// Invariant: vcpu is running => `tid` is the thread on which it is running. - /// Note: multiple vms may have the same `tid`, but at most one vm will have `running` set to true. + + /// Thread ID where the VCPU is currently running. + /// + /// # Invariants + /// + /// - If VCPU is running: tid contains the thread ID of the executing thread + /// - Multiple VMs may share the same tid, but at most one will have running=true tid: AtomicU64, - /// True when an "interruptor" has requested the VM to be cancelled. Set immediately when - /// `kill()` is called, and cleared when the vcpu is no longer running. - /// This is used to - /// 1. make sure stale signals do not interrupt the - /// the wrong vcpu (a vcpu may only be interrupted iff `cancel_requested` is true), - /// 2. ensure that if a vm is killed while a host call is running, - /// the vm will not re-enter the guest after the host call returns. - cancel_requested: AtomicBool, - /// True when the debugger has requested the VM to be interrupted. Set immediately when - /// `kill_from_debugger()` is called, and cleared when the vcpu is no longer running. - /// This is used to make sure stale signals do not interrupt the the wrong vcpu - /// (a vcpu may only be interrupted by a debugger if `debug_interrupt` is true), + + /// Generation-aware cancellation request flag. + /// + /// **Bit 63**: Cancellation requested flag (1 = kill requested, 0 = no kill) + /// **Bits 0-62**: Generation number when cancellation was requested + /// + /// # Purpose + /// + /// This flag serves three critical functions: + /// + /// 1. **Prevent stale signals**: A VCPU may only be interrupted if cancel_requested + /// is set AND the generation matches the current call's generation + /// + /// 2. **Handle host function calls**: If kill() is called while a host function is + /// executing (VCPU not running but call is active), cancel_requested is stamped + /// with the current generation. When the host function returns and the VCPU + /// attempts to re-enter the guest, it will see the cancellation and abort. + /// + /// 3. **Detect stale kills**: If cancel_requested.generation doesn't match the + /// current generation, it's from a previous call and should be ignored + /// + /// # States and Transitions + /// + /// - **No cancellation**: cancel_requested = 0 (bit 63 clear) + /// - **Cancellation for generation N**: cancel_requested = (1 << 63) | N + /// - Signal handler checks: (cancel_requested & 0x7FFFFFFFFFFFFFFF) == current_generation + cancel_requested: AtomicU64, + + /// Flag indicating whether a guest function call is currently in progress. 
+ /// + /// **true**: A guest function call is active (between call start and completion) + /// **false**: No guest function call is active + /// + /// # Purpose + /// + /// This flag prevents kill() from having any effect when called outside of a + /// guest function call. This solves the "kill-in-advance" problem where kill() + /// could be called before a guest function starts and would incorrectly cancel it. + /// + /// # Behavior + /// + /// - Set to true at the start of call_guest_function_by_name_no_reset() + /// - Cleared at the end of call_guest_function_by_name_no_reset() + /// - kill() only stamps cancel_requested if call_active is true + /// - If kill() is called when call_active=false, it returns false and has no effect + /// + /// # Why AtomicBool is Safe + /// + /// Although there's a theoretical race where: + /// 1. Thread A checks call_active (false) + /// 2. Thread B sets call_active (true) and starts guest call + /// 3. Thread A's kill() returns false (no effect) + /// + /// This is acceptable because the generation tracking provides an additional + /// safety layer. Even if a stale kill somehow stamped cancel_requested, the + /// generation mismatch would cause it to be ignored. + call_active: AtomicBool, + + /// Debugger interrupt request flag (GDB only). + /// + /// Set when kill_from_debugger() is called, cleared when VCPU stops running. + /// Used to distinguish debugger interrupts from normal kill() interrupts. #[cfg(gdb)] debug_interrupt: AtomicBool, - /// Whether the corresponding vm is dropped + + /// Whether the corresponding VM has been dropped. dropped: AtomicBool, - /// Retry delay between signals sent to the vcpu thread + + /// Delay between retry attempts when sending signals to the VCPU thread. retry_delay: Duration, - /// The offset of the SIGRTMIN signal used to interrupt the vcpu thread + + /// Offset from SIGRTMIN for the signal used to interrupt the VCPU thread. sig_rt_min_offset: u8, } #[cfg(any(kvm, mshv))] impl LinuxInterruptHandle { - const RUNNING_BIT: u64 = 1 << 63; - const MAX_GENERATION: u64 = Self::RUNNING_BIT - 1; - - // set running to true and increment the generation. Generation will wrap around at `MAX_GENERATION`. 
- fn set_running_and_increment_generation(&self) -> std::result::Result<u64, u64> { - self.running - .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |raw| { - let generation = raw & !Self::RUNNING_BIT; - if generation == Self::MAX_GENERATION { - // restart generation from 0 - return Some(Self::RUNNING_BIT); - } - Some((generation + 1) | Self::RUNNING_BIT) - }) - } - - // clear the running bit and return the generation - fn clear_running_bit(&self) -> u64 { - self.running - .fetch_and(!Self::RUNNING_BIT, Ordering::Relaxed) - } - - fn get_running_and_generation(&self) -> (bool, u64) { - let raw = self.running.load(Ordering::Relaxed); - let running = raw & Self::RUNNING_BIT != 0; - let generation = raw & !Self::RUNNING_BIT; - (running, generation) - } - - fn send_signal(&self) -> bool { + fn send_signal(&self, stamp_generation: bool) -> bool { let signal_number = libc::SIGRTMIN() + self.sig_rt_min_offset as libc::c_int; let mut sent_signal = false; let mut target_generation: Option<u64> = None; loop { + if !self.call_active.load(Ordering::Acquire) { + // No active call, so no need to send signal + break; + } + let (running, generation) = self.get_running_and_generation(); + // Stamp generation into cancel_requested if requested and this is the first iteration + // We stamp even when running=false to support killing during host function calls + // The generation tracking will prevent stale kills from affecting new calls + // Only stamp if a call is actually active (call_active=true) + if stamp_generation + && target_generation.is_none() + && self.call_active.load(Ordering::Acquire) + { + self.set_cancel_requested(generation); + target_generation = Some(generation); + } + + // If not running, we've stamped the generation (if requested), so we're done + // This handles the host function call scenario + if !running { break; } @@ -495,7 +751,7 @@ log::info!("Sending signal to kill vcpu thread..."); sent_signal = true; unsafe { - libc::pthread_kill(self.tid.load(Ordering::Relaxed) as _, signal_number); + libc::pthread_kill(self.tid.load(Ordering::Acquire) as _, signal_number); } std::thread::sleep(self.retry_delay); } @@ -507,20 +763,42 @@ #[cfg(any(kvm, mshv))] impl InterruptHandle for LinuxInterruptHandle { fn kill(&self) -> bool { - self.cancel_requested.store(true, Ordering::Relaxed); + if !(self.call_active.load(Ordering::Acquire)) { + // No active call, so no effect + return false; + } - self.send_signal() + // send_signal will stamp the generation into cancel_requested + // right before sending each signal, ensuring they're always in sync + self.send_signal(true) } + #[cfg(gdb)] fn kill_from_debugger(&self) -> bool { self.debug_interrupt.store(true, Ordering::Relaxed); - self.send_signal() + self.send_signal(false) } + fn dropped(&self) -> bool { self.dropped.load(Ordering::Relaxed) } } +#[cfg(any(kvm, mshv))] +impl InterruptHandleInternal for LinuxInterruptHandle { + fn get_call_active(&self) -> &AtomicBool { + &self.call_active + } + + fn get_running(&self) -> &AtomicU64 { + &self.running + } + + fn get_cancel_requested(&self) -> &AtomicU64 { + &self.cancel_requested + } +} + #[cfg(all(test, any(target_os = "windows", kvm)))] pub(crate) mod tests { use std::sync::{Arc, Mutex}; diff --git a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs index 82a920158..708f5b0a5 100644 --- a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs +++
diff --git a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs index 82a920158..708f5b0a5 100644 --- a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs +++ b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs @@ -45,18 +45,90 @@ use crate::mem::ptr::RawPtr; use crate::mem::shared_mem::HostSharedMemory; use crate::metrics::maybe_time_and_emit_guest_call; use crate::sandbox::mem_mgr::MemMgrWrapper; -use crate::{Result, log_then_return}; +use crate::{Result, log_then_return, new_error}; /// Global counter for assigning unique IDs to sandboxes static SANDBOX_ID_COUNTER: AtomicU64 = AtomicU64::new(0); +/// RAII guard that automatically calls `clear_call_active()` when dropped. +/// +/// This ensures that the call_active flag is always cleared when a guest function +/// call completes, even if the function returns early due to an error. +/// +/// Only one guard can exist per interrupt handle at a time - attempting to create +/// a second guard will return an error. +struct CallActiveGuard { + interrupt_handle: Arc<dyn InterruptHandleInternal>, +} + +impl CallActiveGuard { + /// Creates a new guard and marks a guest function call as active. + /// + /// # Errors + /// + /// Returns an error if `call_active` is already true (i.e., another guard already exists). + fn new(interrupt_handle: Arc<dyn InterruptHandleInternal>) -> Result<Self> { + // Atomically check that call_active is false and set it to true. + // This prevents creating multiple guards for the same interrupt handle. + let was_active = interrupt_handle.set_call_active(); + if was_active { + return Err(new_error!( + "Attempted to create CallActiveGuard when a call is already active" + )); + } + Ok(Self { interrupt_handle }) + } +} + +impl Drop for CallActiveGuard { + fn drop(&mut self) { + self.interrupt_handle.clear_call_active(); + } +}
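This is the standard RAII-guard idiom: `Drop` runs on every exit path, including early `?` returns and unwinding panics, so the flag cannot be left set by a missed cleanup call. A self-contained sketch of the same idiom with invented names (`FlagGuard` is not part of the crate):

```rust
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};

struct FlagGuard {
    flag: Arc<AtomicBool>,
}

impl FlagGuard {
    fn new(flag: Arc<AtomicBool>) -> Result<Self, &'static str> {
        // swap() atomically sets the flag and reports whether it was already
        // set, which is how a second, overlapping guard is rejected.
        if flag.swap(true, Ordering::AcqRel) {
            return Err("a guard is already active");
        }
        Ok(Self { flag })
    }
}

impl Drop for FlagGuard {
    fn drop(&mut self) {
        self.flag.store(false, Ordering::Release);
    }
}

fn main() {
    let flag = Arc::new(AtomicBool::new(false));
    {
        let _guard = FlagGuard::new(flag.clone()).unwrap();
        assert!(FlagGuard::new(flag.clone()).is_err()); // double entry rejected
    } // _guard dropped here: the flag is cleared on scope exit
    assert!(!flag.load(Ordering::Acquire));
}
```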
/// A fully initialized sandbox that can execute guest functions multiple times. /// /// Guest functions can be called repeatedly while maintaining state between calls. /// The sandbox supports creating snapshots and restoring to previous states. +/// +/// ## Sandbox Poisoning +/// +/// The sandbox becomes **poisoned** when the guest is not run to completion, leaving it in +/// an inconsistent state that could compromise memory safety, data integrity, or security. +/// +/// ### When Does Poisoning Occur? +/// +/// Poisoning happens when guest execution is interrupted before normal completion: +/// +/// - **Guest panics or aborts** - When a guest function panics, crashes, or calls `abort()`, +/// the normal cleanup and unwinding process is interrupted +/// - **Invalid memory access** - Attempts to read/write/execute memory outside allowed regions +/// - **Stack overflow** - Guest exhausts its stack space during execution +/// - **Heap exhaustion** - Guest runs out of heap memory +/// - **Host-initiated cancellation** - Calling [`InterruptHandle::kill()`] to forcefully +/// terminate an in-progress guest function +/// +/// ### Why This Is Unsafe +/// +/// When guest execution doesn't complete normally, critical cleanup operations are skipped: +/// +/// - **Memory leaks** - Heap allocations remain unreachable because the guest's cleanup code +/// never ran +/// - **Corrupted allocator state** - Memory allocator metadata (free lists, heap headers) +/// left inconsistent +/// - **Locked resources** - Mutexes or other synchronization primitives remain locked +/// - **Partial state updates** - Data structures left half-modified (corrupted linked lists, +/// inconsistent hash tables, etc.) +/// +/// ### Recovery +/// +/// Use [`restore()`](Self::restore) with a snapshot taken before poisoning occurred. +/// This is the **only safe way** to recover - it completely replaces all memory state, +/// eliminating any inconsistencies. See [`restore()`](Self::restore) for details. pub struct MultiUseSandbox { /// Unique identifier for this sandbox instance id: u64, + /// Whether this sandbox is poisoned + poisoned: bool, // We need to keep a reference to the host functions, even if the compiler marks it as unused. The compiler cannot detect our dynamic usages of the host function in `HyperlightFunction::call`. pub(super) _host_funcs: Arc<Mutex<FunctionRegistry>>, pub(crate) mem_mgr: MemMgrWrapper<HostSharedMemory>, @@ -85,6 +157,7 @@ impl MultiUseSandbox { ) -> MultiUseSandbox { Self { id: SANDBOX_ID_COUNTER.fetch_add(1, Ordering::Relaxed), + poisoned: false, _host_funcs: host_funcs, mem_mgr: mgr, vm, @@ -100,6 +173,11 @@ impl MultiUseSandbox { /// The snapshot is tied to this specific sandbox instance and can only be /// restored to the same sandbox it was created from. /// + /// ## Poisoned Sandbox + /// + /// This method will return [`crate::HyperlightError::PoisonedSandbox`] if the sandbox + /// is currently poisoned. Snapshots can only be taken from non-poisoned sandboxes. + /// /// # Examples /// /// ```no_run /// @@ -120,6 +198,10 @@ impl MultiUseSandbox { /// ``` #[instrument(err(Debug), skip_all, parent = Span::current())] pub fn snapshot(&mut self) -> Result<Snapshot> { + if self.poisoned { + return Err(crate::HyperlightError::PoisonedSandbox); + } + if let Some(snapshot) = &self.snapshot { return Ok(snapshot.clone()); } @@ -141,6 +223,22 @@ impl MultiUseSandbox { /// Attempting to restore a snapshot from a different sandbox will return /// a [`SnapshotSandboxMismatch`](crate::HyperlightError::SnapshotSandboxMismatch) error. /// + /// ## Poison State Recovery + /// + /// This method automatically clears any poison state when successful. This is safe because: + /// - Snapshots can only be taken from non-poisoned sandboxes + /// - Restoration completely replaces all memory state, eliminating any inconsistencies + /// caused by incomplete guest execution + /// + /// ### What Gets Fixed During Restore + /// + /// When a poisoned sandbox is restored, the memory state is completely reset: + /// - **Leaked heap memory** - All allocations from interrupted execution are discarded + /// - **Corrupted allocator metadata** - Free lists and heap headers restored to consistent state + /// - **Locked mutexes** - All lock state is reset + /// - **Partial updates** - Data structures restored to their pre-execution state + /// + /// /// # Examples /// /// ```no_run /// @@ -166,6 +264,35 @@ /// # Ok(()) /// # } /// ``` + /// + /// ## Recovering from Poison + /// + /// ```no_run + /// # use hyperlight_host::{MultiUseSandbox, UninitializedSandbox, GuestBinary, HyperlightError}; + /// # fn example() -> Result<(), Box<dyn std::error::Error>> { + /// let mut sandbox: MultiUseSandbox = UninitializedSandbox::new( + /// GuestBinary::FilePath("guest.bin".into()), + /// None + /// )?.evolve()?; + /// + /// // Take snapshot before potentially poisoning operation + /// let snapshot = sandbox.snapshot()?; + /// + /// // This might poison the sandbox (guest not run to completion) + /// let result = sandbox.call::<()>("guest_panic", ()); + /// if result.is_err() { + /// if sandbox.poisoned() { + /// // Restore from snapshot to clear poison + /// sandbox.restore(&snapshot)?; + /// assert!(!sandbox.poisoned()); + /// + /// // Sandbox is now usable again + /// sandbox.call::<String>("Echo", "hello".to_string())?; + /// } + /// } + /// # Ok(()) + /// # } + /// ``` #[instrument(err(Debug), skip_all, parent = Span::current())] pub fn restore(&mut self, snapshot: &Snapshot) -> Result<()> { if let Some(snap) =
&self.snapshot { @@ -200,6 +327,17 @@ impl MultiUseSandbox { // The restored snapshot is now our most current snapshot self.snapshot = Some(snapshot.clone()); + // Clear poison state when successfully restoring from snapshot. + // + // # Safety: + // This is safe because: + // 1. Snapshots can only be taken from non-poisoned sandboxes (verified at snapshot creation) + // 2. Restoration completely replaces all memory state, eliminating: + // - All leaked heap allocations (memory is restored to snapshot state) + // - All corrupted data structures (overwritten with consistent snapshot data) + // - All inconsistent global state (reset to snapshot values) + self.poisoned = false; + Ok(()) } @@ -207,6 +345,11 @@ impl MultiUseSandbox { /// /// Changes made to the sandbox during execution are *not* persisted. /// + /// ## Poisoned Sandbox + /// + /// This method will return [`crate::HyperlightError::PoisonedSandbox`] if the sandbox + /// is currently poisoned. Use [`restore()`](Self::restore) to recover from a poisoned state. + /// /// # Examples /// /// ```no_run /// @@ -245,6 +388,9 @@ func_name: &str, args: impl ParameterTuple, ) -> Result<Output> { + if self.poisoned { + return Err(crate::HyperlightError::PoisonedSandbox); + } let snapshot = self.snapshot()?; let res = self.call(func_name, args); self.restore(&snapshot)?;
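The split between the two public entry points is persistence only: `call_guest_function_by_name` wraps `call` in the snapshot/restore pair shown above, so guest-side state changes are discarded, while `call` keeps them. A hedged usage sketch, assuming an `Echo`-style guest binary as in the surrounding doc examples:

```rust
use hyperlight_host::{GuestBinary, MultiUseSandbox, UninitializedSandbox};

fn example() -> Result<(), Box<dyn std::error::Error>> {
    let mut sandbox: MultiUseSandbox =
        UninitializedSandbox::new(GuestBinary::FilePath("guest.bin".into()), None)?.evolve()?;

    // Stateless: any guest-side mutation is rolled back once the call returns.
    #[allow(deprecated)]
    let echoed: String = sandbox.call_guest_function_by_name("Echo", "hi".to_string())?;

    // Stateful: guest-side mutations survive into subsequent calls.
    let echoed_again: String = sandbox.call("Echo", echoed)?;
    assert_eq!(echoed_again, "hi");
    Ok(())
}
```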
@@ -255,6 +401,22 @@ /// /// Changes made to the sandbox during execution are persisted. /// + /// ## Poisoned Sandbox + /// + /// This method will return [`crate::HyperlightError::PoisonedSandbox`] if the sandbox + /// is already poisoned before the call. Use [`restore()`](Self::restore) to recover from + /// a poisoned state. + /// + /// ## Sandbox Poisoning + /// + /// If this method returns an error, the sandbox may be poisoned if the guest was not run + /// to completion (due to panic, abort, memory violation, stack/heap exhaustion, or forced + /// termination). Use [`poisoned()`](Self::poisoned) to check the poison state and + /// [`restore()`](Self::restore) to recover if needed. + /// + /// If this method returns `Ok`, the sandbox is guaranteed to **not** be poisoned - the guest + /// function completed successfully and the sandbox state is consistent. + /// /// # Examples /// /// ```no_run /// @@ -282,12 +444,44 @@ /// # Ok(()) /// # } /// ``` + /// + /// ## Handling Potential Poisoning + /// + /// ```no_run + /// # use hyperlight_host::{MultiUseSandbox, UninitializedSandbox, GuestBinary}; + /// # fn example() -> Result<(), Box<dyn std::error::Error>> { + /// let mut sandbox: MultiUseSandbox = UninitializedSandbox::new( + /// GuestBinary::FilePath("guest.bin".into()), + /// None + /// )?.evolve()?; + /// + /// // Take snapshot before risky operation + /// let snapshot = sandbox.snapshot()?; + /// + /// // Call potentially unsafe guest function + /// let result = sandbox.call::<String>("RiskyOperation", "input".to_string()); + /// + /// // Check if the call failed and poisoned the sandbox + /// if let Err(e) = result { + /// eprintln!("Guest function failed: {}", e); + /// + /// if sandbox.poisoned() { + /// eprintln!("Sandbox was poisoned, restoring from snapshot"); + /// sandbox.restore(&snapshot)?; + /// } + /// } + /// # Ok(()) + /// # } + /// ``` #[instrument(err(Debug), skip(self, args), parent = Span::current())] pub fn call<Output: SupportedReturnType>( &mut self, func_name: &str, args: impl ParameterTuple, ) -> Result<Output> { + if self.poisoned { + return Err(crate::HyperlightError::PoisonedSandbox); + } // Reset snapshot since we are mutating the sandbox state self.snapshot = None; maybe_time_and_emit_guest_call(func_name, || { @@ -306,12 +500,20 @@ impl MultiUseSandbox { /// (typically page-aligned). The `region_type` field is ignored as guest /// page table entries are not created. /// + /// ## Poisoned Sandbox + /// + /// This method will return [`crate::HyperlightError::PoisonedSandbox`] if the sandbox + /// is currently poisoned. Use [`restore()`](Self::restore) to recover from a poisoned state. + /// /// # Safety /// /// The caller must ensure the host memory region remains valid and unmodified /// for the lifetime of `self`. #[instrument(err(Debug), skip(self, rgn), parent = Span::current())] pub unsafe fn map_region(&mut self, rgn: &MemoryRegion) -> Result<()> { + if self.poisoned { + return Err(crate::HyperlightError::PoisonedSandbox); + } if rgn.flags.contains(MemoryRegionFlags::STACK_GUARD) { // Stack guard pages are an internal implementation detail // (which really should be moved into the guest) @@ -333,9 +535,16 @@ impl MultiUseSandbox { /// Map the contents of a file into the guest at a particular address /// /// Returns the length of the mapping in bytes. - #[allow(dead_code)] + /// + /// ## Poisoned Sandbox + /// + /// This method will return [`crate::HyperlightError::PoisonedSandbox`] if the sandbox + /// is currently poisoned. Use [`restore()`](Self::restore) to recover from a poisoned state. #[instrument(err(Debug), skip(self, _fp, _guest_base), parent = Span::current())] pub fn map_file_cow(&mut self, _fp: &Path, _guest_base: u64) -> Result<u64> { + if self.poisoned { + return Err(crate::HyperlightError::PoisonedSandbox); + } #[cfg(windows)] log_then_return!("mmap'ing a file into the guest is not yet supported on Windows"); #[cfg(unix)]
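The next hunk adds a fuzzing-only, type-erased variant. With the reconstructed signature (a runtime `ReturnType` plus a `Vec<ParameterValue>`), an invocation would look roughly like the sketch below; the import paths and the `Echo` guest function are assumptions rather than confirmed API:

```rust
use hyperlight_common::flatbuffer_wrappers::function_types::{
    ParameterValue, ReturnType, ReturnValue,
};
use hyperlight_host::{MultiUseSandbox, Result};

// Sketch: the signature is described at runtime instead of via generics,
// which is what fuzzers need when generating arbitrary call shapes.
#[cfg(feature = "fuzzing")]
fn echo_type_erased(sandbox: &mut MultiUseSandbox) -> Result<ReturnValue> {
    sandbox.call_type_erased_guest_function_by_name(
        "Echo",
        ReturnType::String,
        vec![ParameterValue::String("hello".to_string())],
    )
}
```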
@@ -373,6 +582,11 @@ /// Calls a guest function with type-erased parameters and return values. /// /// This function is used for fuzz testing parameter and return type handling. + /// + /// ## Poisoned Sandbox + /// + /// This method will return [`crate::HyperlightError::PoisonedSandbox`] if the sandbox + /// is currently poisoned. Use [`restore()`](Self::restore) to recover from a poisoned state. #[cfg(feature = "fuzzing")] #[instrument(err(Debug), skip(self, args), parent = Span::current())] pub fn call_type_erased_guest_function_by_name( @@ -381,6 +595,9 @@ ret_type: ReturnType, args: Vec<ParameterValue>, ) -> Result<ReturnValue> { + if self.poisoned { + return Err(crate::HyperlightError::PoisonedSandbox); + } // Reset snapshot since we are mutating the sandbox state self.snapshot = None; maybe_time_and_emit_guest_call(func_name, || { @@ -394,6 +611,14 @@ return_type: ReturnType, args: Vec<ParameterValue>, ) -> Result<ReturnValue> { + if self.poisoned { + return Err(crate::HyperlightError::PoisonedSandbox); + } + // Mark that a guest function call is now active + // (This also increments the generation counter internally) + // The guard will automatically clear call_active when dropped + let _guard = CallActiveGuard::new(self.vm.interrupt_handle())?; + let res = (|| { let estimated_capacity = estimate_flatbuffer_capacity(function_name, &args); @@ -428,6 +653,13 @@ impl MultiUseSandbox { // TODO: Do we want to allow re-entrant guest function calls? self.get_mgr_wrapper_mut().as_mut().clear_io_buffers(); + if let Err(e) = &res { + // Determine if we should poison the sandbox. + self.poisoned |= e.is_poison_error(); + } + + // Note: clear_call_active() is automatically called when _guard is dropped here + res } @@ -463,6 +695,46 @@ impl MultiUseSandbox { pub fn interrupt_handle(&self) -> Arc<dyn InterruptHandle> { self.vm.interrupt_handle() } +
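A typical pattern around `interrupt_handle()`: hand the handle to a watchdog thread and kill the call if it overruns a deadline. A sketch only; the `Spin` guest function and the five-second budget stand in for whatever long-running work the guest does (the integration tests later in this diff use the same shape):

```rust
use std::thread;
use std::time::Duration;

use hyperlight_host::{GuestBinary, MultiUseSandbox, UninitializedSandbox};

fn example() -> Result<(), Box<dyn std::error::Error>> {
    let mut sandbox: MultiUseSandbox =
        UninitializedSandbox::new(GuestBinary::FilePath("guest.bin".into()), None)?.evolve()?;
    let snapshot = sandbox.snapshot()?;

    // The handle can outlive the borrow of the sandbox, so a watchdog
    // thread can hold it while the call runs.
    let handle = sandbox.interrupt_handle();
    let watchdog = thread::spawn(move || {
        thread::sleep(Duration::from_secs(5));
        handle.kill(); // no-op (returns false) if the call already finished
    });

    let result = sandbox.call::<String>("Spin", ());
    if result.is_err() && sandbox.poisoned() {
        sandbox.restore(&snapshot)?; // clear the poison before reuse
    }
    watchdog.join().unwrap();
    Ok(())
}
```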
+ /// Returns whether the sandbox is currently poisoned. + /// + /// A poisoned sandbox is in an inconsistent state due to the guest not running to completion. + /// All operations will be rejected until the sandbox is restored from a non-poisoned snapshot. + /// + /// ## Causes of Poisoning + /// + /// The sandbox becomes poisoned when guest execution is interrupted: + /// - **Panics/Aborts** - Guest code panics or calls `abort()` + /// - **Invalid Memory Access** - Read/write/execute violations + /// - **Stack Overflow** - Guest exhausts stack space + /// - **Heap Exhaustion** - Guest runs out of heap memory + /// - **Forced Termination** - [`InterruptHandle::kill()`] called during execution + /// + /// ## Recovery + /// + /// To clear the poison state, use [`restore()`](Self::restore) with a snapshot + /// that was taken before the sandbox became poisoned. + /// + /// # Examples + /// + /// ```no_run + /// # use hyperlight_host::{MultiUseSandbox, UninitializedSandbox, GuestBinary}; + /// # fn example() -> Result<(), Box<dyn std::error::Error>> { + /// let mut sandbox: MultiUseSandbox = UninitializedSandbox::new( + /// GuestBinary::FilePath("guest.bin".into()), + /// None + /// )?.evolve()?; + /// + /// // Check if sandbox is poisoned + /// if sandbox.poisoned() { + /// println!("Sandbox is poisoned and needs attention"); + /// } + /// # Ok(()) + /// # } + /// ``` + pub fn poisoned(&self) -> bool { + self.poisoned + } } impl Callable for MultiUseSandbox { @@ -471,6 +743,9 @@ func_name: &str, args: impl ParameterTuple, ) -> Result<Output> { + if self.poisoned { + return Err(crate::HyperlightError::PoisonedSandbox); + } self.call(func_name, args) } } @@ -497,6 +772,7 @@ mod tests { use std::sync::{Arc, Barrier}; use std::thread; + use hyperlight_common::flatbuffer_wrappers::guest_error::ErrorCode; use hyperlight_testing::simple_guest_as_string; #[cfg(target_os = "linux")] @@ -506,6 +782,96 @@ mod tests { use crate::sandbox::SandboxConfiguration; use crate::{GuestBinary, HyperlightError, MultiUseSandbox, Result, UninitializedSandbox}; + #[test] + fn poison() { + let mut sbox: MultiUseSandbox = { + let path = simple_guest_as_string().unwrap(); + let u_sbox = UninitializedSandbox::new(GuestBinary::FilePath(path), None).unwrap(); + u_sbox.evolve() + } + .unwrap(); + let snapshot = sbox.snapshot().unwrap(); + + // poison on purpose + let res = sbox + .call::<()>("guest_panic", "hello".to_string()) + .unwrap_err(); + assert!( + matches!(res, HyperlightError::GuestAborted(code, context) if code == ErrorCode::UnknownError as u8 && context.contains("hello")) + ); + assert!(sbox.poisoned()); + + // guest calls should fail when poisoned + let res = sbox + .call::<()>("guest_panic", "hello2".to_string()) + .unwrap_err(); + assert!(matches!(res, HyperlightError::PoisonedSandbox)); + + // snapshot should fail when poisoned + if let Err(e) = sbox.snapshot() { + assert!(sbox.poisoned()); + assert!(matches!(e, HyperlightError::PoisonedSandbox)); + } else { + panic!("Snapshot should fail"); + } + + // map_region should fail when poisoned + #[cfg(target_os = "linux")] + { + let map_mem = allocate_guest_memory(); + let guest_base = 0x0; + let region = region_for_memory(&map_mem, guest_base, MemoryRegionFlags::READ); + let res = unsafe { sbox.map_region(&region) }.unwrap_err(); + assert!(matches!(res, HyperlightError::PoisonedSandbox)); + } + + // map_file_cow should fail when poisoned + #[cfg(target_os = "linux")] + { + let temp_file = std::env::temp_dir().join("test_poison_map_file.bin"); + let res = sbox.map_file_cow(&temp_file, 0x0).unwrap_err(); + assert!(matches!(res, HyperlightError::PoisonedSandbox)); + std::fs::remove_file(&temp_file).ok(); // Clean up + } + + // call_guest_function_by_name (deprecated) should fail when poisoned + #[allow(deprecated)] + let res = sbox + .call_guest_function_by_name::<String>("Echo", "test".to_string()) + .unwrap_err(); + assert!(matches!(res, HyperlightError::PoisonedSandbox)); + + // restore to non-poisoned snapshot should work and clear poison + sbox.restore(&snapshot).unwrap(); + assert!(!sbox.poisoned()); + + // guest calls should work again after restore + let res = sbox.call::<String>("Echo", "hello2".to_string()).unwrap(); + assert_eq!(res, "hello2".to_string()); + assert!(!sbox.poisoned()); + + // re-poison on purpose + let res = sbox + .call::<()>("guest_panic", "hello".to_string()) + .unwrap_err(); + assert!( + matches!(res,
HyperlightError::GuestAborted(code, context) if code == ErrorCode::UnknownError as u8 && context.contains("hello")) ); + assert!(sbox.poisoned()); + + // restore to non-poisoned snapshot should work again + sbox.restore(&snapshot).unwrap(); + assert!(!sbox.poisoned()); + + // guest calls should work again + let res = sbox.call::<String>("Echo", "hello3".to_string()).unwrap(); + assert_eq!(res, "hello3".to_string()); + assert!(!sbox.poisoned()); + + // snapshot should work again + let _ = sbox.snapshot().unwrap(); + } + /// Tests that call_guest_function_by_name restores the state correctly #[test] fn test_call_guest_function_by_name() { diff --git a/src/hyperlight_host/tests/integration_test.rs b/src/hyperlight_host/tests/integration_test.rs index eceabd479..26d4c5467 100644 --- a/src/hyperlight_host/tests/integration_test.rs +++ b/src/hyperlight_host/tests/integration_test.rs @@ -60,6 +60,7 @@ fn interrupt_host_call() { .unwrap(); let mut sandbox: MultiUseSandbox = usbox.evolve().unwrap(); + let snapshot = sandbox.snapshot().unwrap(); let interrupt_handle = sandbox.interrupt_handle(); assert!(!interrupt_handle.dropped()); // not yet dropped @@ -72,6 +73,11 @@ fn interrupt_host_call() { let result = sandbox.call::("CallHostSpin", ()).unwrap_err(); assert!(matches!(result, HyperlightError::ExecutionCanceledByHost())); + assert!(sandbox.poisoned()); + + // Restore from snapshot to clear poison + sandbox.restore(&snapshot).unwrap(); + assert!(!sandbox.poisoned()); thread.join().unwrap(); } @@ -80,6 +86,7 @@ #[test] fn interrupt_in_progress_guest_call() { let mut sbox1: MultiUseSandbox = new_uninit_rust().unwrap().evolve().unwrap(); + let snapshot = sbox1.snapshot().unwrap(); let barrier = Arc::new(Barrier::new(2)); let barrier2 = barrier.clone(); let interrupt_handle = sbox1.interrupt_handle(); @@ -96,6 +103,11 @@ let res = sbox1.call::("Spin", ()).unwrap_err(); assert!(matches!(res, HyperlightError::ExecutionCanceledByHost())); + assert!(sbox1.poisoned()); + + // Restore from snapshot to clear poison + sbox1.restore(&snapshot).unwrap(); + assert!(!sbox1.poisoned()); barrier.wait(); // Make sure we can still call guest functions after the VM was interrupted @@ -107,7 +119,7 @@ thread.join().expect("Thread should finish"); } -/// Makes sure interrupting a vm before the guest call has started also prevents the guest call from being executed +/// Makes sure interrupting a vm before the guest call has started does not prevent the guest call from running #[test] fn interrupt_guest_call_in_advance() { let mut sbox1: MultiUseSandbox = new_uninit_rust().unwrap().evolve().unwrap(); @@ -125,10 +137,16 @@ }); barrier.wait(); // wait until `kill()` is called before starting the guest call - let res = sbox1.call::("Spin", ()).unwrap_err(); - assert!(matches!(res, HyperlightError::ExecutionCanceledByHost())); + match sbox1.call::<String>("Echo", "hello".to_string()) { + Ok(_) => {} + Err(HyperlightError::ExecutionCanceledByHost()) => { + panic!("Unexpected Cancellation Error"); + } + Err(_) => {} + } - // Make sure we can still call guest functions after the VM was interrupted + // Make sure we can still call guest functions after the VM was interrupted early + // i.e. make sure we don't kill the next iteration.
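+ // kill() above ran while call_active was false, so it returned without + // stamping cancel_requested, leaving nothing behind that could cancel + // the call below.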
sbox1.call::<String>("Echo", "hello".to_string()).unwrap(); // drop vm to make sure other thread can detect it @@ -150,6 +168,7 @@ fn interrupt_same_thread() { let mut sbox1: MultiUseSandbox = new_uninit_rust().unwrap().evolve().unwrap(); let mut sbox2: MultiUseSandbox = new_uninit_rust().unwrap().evolve().unwrap(); + let snapshot2 = sbox2.snapshot().unwrap(); let mut sbox3: MultiUseSandbox = new_uninit_rust().unwrap().evolve().unwrap(); let barrier = Arc::new(Barrier::new(2)); @@ -174,12 +193,14 @@ .call::<String>("Echo", "hello".to_string()) .expect("Only sandbox 2 is allowed to be interrupted"); match sbox2.call::<String>("Echo", "hello".to_string()) { - Ok(_) | Err(HyperlightError::ExecutionCanceledByHost()) => { - // Only allow successful calls or interrupted. - // The call can be successful in case the call is finished before kill() is called. - } + // Only allow successful calls or interrupted. + // The call can be successful in case the call is finished before kill() is called. + Ok(_) | Err(HyperlightError::ExecutionCanceledByHost()) => {} _ => panic!("Unexpected return"), }; + if sbox2.poisoned() { + sbox2.restore(&snapshot2).unwrap(); + } sbox3 .call::<String>("Echo", "hello".to_string()) .expect("Only sandbox 2 is allowed to be interrupted"); @@ -192,6 +213,7 @@ fn interrupt_same_thread_no_barrier() { let mut sbox1: MultiUseSandbox = new_uninit_rust().unwrap().evolve().unwrap(); let mut sbox2: MultiUseSandbox = new_uninit_rust().unwrap().evolve().unwrap(); + let snapshot2 = sbox2.snapshot().unwrap(); let mut sbox3: MultiUseSandbox = new_uninit_rust().unwrap().evolve().unwrap(); let barrier = Arc::new(Barrier::new(2)); @@ -218,12 +240,14 @@ .call::<String>("Echo", "hello".to_string()) .expect("Only sandbox 2 is allowed to be interrupted"); match sbox2.call::<String>("Echo", "hello".to_string()) { - Ok(_) | Err(HyperlightError::ExecutionCanceledByHost()) => { - // Only allow successful calls or interrupted. - // The call can be successful in case the call is finished before kill() is called. - } + // Only allow successful calls or interrupted. + // The call can be successful in case the call is finished before kill() is called.
+ Ok(_) | Err(HyperlightError::ExecutionCanceledByHost()) => {} _ => panic!("Unexpected return"), }; + if sbox2.poisoned() { + sbox2.restore(&snapshot2).unwrap(); + } sbox3 .call::<String>("Echo", "hello".to_string()) .expect("Only sandbox 2 is allowed to be interrupted"); @@ -237,6 +261,7 @@ #[test] fn interrupt_moved_sandbox() { let mut sbox1: MultiUseSandbox = new_uninit_rust().unwrap().evolve().unwrap(); + let snapshot1 = sbox1.snapshot().unwrap(); let mut sbox2: MultiUseSandbox = new_uninit_rust().unwrap().evolve().unwrap(); let interrupt_handle = sbox1.interrupt_handle(); @@ -249,6 +274,9 @@ barrier2.wait(); let res = sbox1.call::("Spin", ()).unwrap_err(); assert!(matches!(res, HyperlightError::ExecutionCanceledByHost())); + assert!(sbox1.poisoned()); + sbox1.restore(&snapshot1).unwrap(); + assert!(!sbox1.poisoned()); }); let thread2 = thread::spawn(move || { @@ -287,6 +315,7 @@ fn interrupt_custom_signal_no_and_retry_delay() { .evolve() .unwrap(); + let snapshot1 = sbox1.snapshot().unwrap(); let interrupt_handle = sbox1.interrupt_handle(); assert!(!interrupt_handle.dropped()); // not yet dropped @@ -303,8 +332,11 @@ for _ in 0..NUM_ITERS { let res = sbox1.call::("Spin", ()).unwrap_err(); assert!(matches!(res, HyperlightError::ExecutionCanceledByHost())); + assert!(sbox1.poisoned()); // immediately reenter another guest function call after having been cancelled, // so that the vcpu is running again before the interruptor-thread has a chance to see that the vcpu is not running + sbox1.restore(&snapshot1).unwrap(); + assert!(!sbox1.poisoned()); } thread.join().expect("Thread should finish"); } @@ -737,3 +769,467 @@ fn log_test_messages(levelfilter: Option<LevelFilter>) { .unwrap(); } } + +/// Test that validates interrupt behavior with random kill timing under concurrent load +/// Uses a pool of 100 sandboxes, 100 threads, and 500 iterations per thread. +/// Randomly decides to kill some calls at random times during execution.
+/// Validates that: +/// - Calls we chose to kill can end in any state (including some cancelled) +/// - Calls we did NOT choose to kill NEVER return ExecutionCanceledByHost +/// - We get a mix of killed and non-killed outcomes (not 100% or 0%) +#[test] +fn interrupt_random_kill_stress_test() { + // Wrapper to hold a sandbox and its snapshot together + struct SandboxWithSnapshot { + sandbox: MultiUseSandbox, + snapshot: Snapshot, + } + + use std::collections::VecDeque; + use std::sync::Mutex; + use std::sync::atomic::AtomicUsize; + + use hyperlight_host::sandbox::snapshot::Snapshot; + use log::{error, trace}; + + const POOL_SIZE: usize = 100; + const NUM_THREADS: usize = 100; + const ITERATIONS_PER_THREAD: usize = 500; + const KILL_PROBABILITY: f64 = 0.5; // 50% chance to attempt kill + const GUEST_CALL_DURATION_MS: u32 = 10; // SpinForMs duration + + // Create a pool of POOL_SIZE sandboxes, each paired with a snapshot + println!("Creating pool of {} sandboxes...", POOL_SIZE); + let mut sandbox_pool: Vec<SandboxWithSnapshot> = Vec::with_capacity(POOL_SIZE); + for i in 0..POOL_SIZE { + let mut sandbox = new_uninit_rust().unwrap().evolve().unwrap(); + // Create a snapshot for this sandbox + let snapshot = sandbox.snapshot().unwrap(); + if (i + 1) % 10 == 0 { + println!("Created {}/{} sandboxes", i + 1, POOL_SIZE); + } + sandbox_pool.push(SandboxWithSnapshot { sandbox, snapshot }); + } + + // Wrap the pool in Arc<Mutex<VecDeque<SandboxWithSnapshot>>> for thread-safe access + let pool = Arc::new(Mutex::new(VecDeque::from(sandbox_pool))); + + // Counters for statistics + let total_iterations = Arc::new(AtomicUsize::new(0)); + let kill_attempted_count = Arc::new(AtomicUsize::new(0)); // We chose to kill + let actually_killed_count = Arc::new(AtomicUsize::new(0)); // Got ExecutionCanceledByHost + let not_killed_completed_ok = Arc::new(AtomicUsize::new(0)); + let not_killed_error = Arc::new(AtomicUsize::new(0)); // Non-cancelled errors + let killed_but_completed_ok = Arc::new(AtomicUsize::new(0)); + let killed_but_error = Arc::new(AtomicUsize::new(0)); // Non-cancelled errors + let unexpected_cancelled = Arc::new(AtomicUsize::new(0)); // CRITICAL: non-killed calls that got cancelled + let sandbox_replaced_count = Arc::new(AtomicUsize::new(0)); // Sandboxes replaced due to restore failure + + println!( + "Starting {} threads with {} iterations each...", + NUM_THREADS, ITERATIONS_PER_THREAD + ); + + // Spawn worker threads + let mut thread_handles = vec![]; + for thread_id in 0..NUM_THREADS { + let pool_clone = Arc::clone(&pool); + let total_iterations_clone = Arc::clone(&total_iterations); + let kill_attempted_count_clone = Arc::clone(&kill_attempted_count); + let actually_killed_count_clone = Arc::clone(&actually_killed_count); + let not_killed_completed_ok_clone = Arc::clone(&not_killed_completed_ok); + let not_killed_error_clone = Arc::clone(&not_killed_error); + let killed_but_completed_ok_clone = Arc::clone(&killed_but_completed_ok); + let killed_but_error_clone = Arc::clone(&killed_but_error); + let unexpected_cancelled_clone = Arc::clone(&unexpected_cancelled); + let sandbox_replaced_count_clone = Arc::clone(&sandbox_replaced_count); + + let handle = thread::spawn(move || { + // Derive a per-thread RNG seed from thread_id. RandomState adds + // process-level entropy, so sequences are distinct per thread but + // not reproducible across runs. + use std::collections::hash_map::RandomState; + use std::hash::BuildHasher; + + let mut rng_state = RandomState::new().hash_one(thread_id); + + // Simple linear congruential generator for cheap per-thread randomness
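+ // (The multiplier below is Knuth's MMIX LCG constant, also used by PCG. + // With an odd increment the sequence has full period 2^64, which is more + // than enough for randomizing kill timings in a test.)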
+ let mut next_random = || -> u64 { rng_state = rng_state.wrapping_mul(6364136223846793005).wrapping_add(1); + rng_state + }; + + for iteration in 0..ITERATIONS_PER_THREAD { + // === START OF ITERATION === + // Get a sandbox from the pool for this iteration + let sandbox_with_snapshot = loop { + let mut pool_guard = pool_clone.lock().unwrap(); + if let Some(sb) = pool_guard.pop_front() { + break sb; + } + // Pool is empty, release lock and wait + drop(pool_guard); + trace!( + "[THREAD-{}] Iteration {}: Pool empty, waiting for sandbox...", + thread_id, iteration + ); + thread::sleep(Duration::from_millis(1)); + }; + + // Use a guard struct to ensure sandbox is always returned to pool + struct SandboxGuard<'a> { + sandbox_with_snapshot: Option<SandboxWithSnapshot>, + pool: &'a Arc<Mutex<VecDeque<SandboxWithSnapshot>>>, + } + + impl Drop for SandboxGuard<'_> { + fn drop(&mut self) { + if let Some(sb) = self.sandbox_with_snapshot.take() { + let mut pool_guard = self.pool.lock().unwrap(); + pool_guard.push_back(sb); + trace!( + "[GUARD] Returned sandbox to pool, pool size now: {}", + pool_guard.len() + ); + } + } + } + + let mut guard = SandboxGuard { + sandbox_with_snapshot: Some(sandbox_with_snapshot), + pool: &pool_clone, + }; + + // Decide randomly: should we attempt to kill this call? + let should_kill = (next_random() as f64 / u64::MAX as f64) < KILL_PROBABILITY; + + if should_kill { + kill_attempted_count_clone.fetch_add(1, Ordering::Relaxed); + } + + let sandbox_wrapper = guard.sandbox_with_snapshot.as_mut().unwrap(); + let sandbox = &mut sandbox_wrapper.sandbox; + let interrupt_handle = sandbox.interrupt_handle(); + + // If we decided to kill, spawn a thread that will kill at a random time + // Use a barrier to ensure the killer thread waits until we're about to call the guest + let killer_thread = if should_kill { + use std::sync::{Arc, Barrier}; + + let barrier = Arc::new(Barrier::new(2)); + let barrier_clone = Arc::clone(&barrier); + + // Generate random delay here before moving into thread + let kill_delay_ms = next_random() % 16; + let thread_id_clone = thread_id; + let iteration_clone = iteration; + let handle = thread::spawn(move || { + trace!( + "[KILLER-{}-{}] Waiting at barrier...", + thread_id_clone, iteration_clone + ); + // Wait at the barrier until the main thread is ready to call the guest + barrier_clone.wait(); + trace!( + "[KILLER-{}-{}] Passed barrier, sleeping for {}ms...", + thread_id_clone, iteration_clone, kill_delay_ms + ); + // Random delay between 0 and 15ms (guest runs for ~10ms) + thread::sleep(Duration::from_millis(kill_delay_ms)); + trace!( + "[KILLER-{}-{}] Calling kill()...", + thread_id_clone, iteration_clone + ); + interrupt_handle.kill(); + trace!( + "[KILLER-{}-{}] kill() returned, exiting thread", + thread_id_clone, iteration_clone + ); + }); + Some((handle, barrier)) + } else { + None + }; + + // Call the guest function + trace!( + "[THREAD-{}] Iteration {}: Calling guest function (should_kill={})...", + thread_id, iteration, should_kill + ); + + // Release the barrier just before calling the guest function + if let Some((_, ref barrier)) = killer_thread { + trace!( + "[THREAD-{}] Iteration {}: Main thread waiting at barrier...", + thread_id, iteration + ); + barrier.wait(); + trace!( + "[THREAD-{}] Iteration {}: Main thread passed barrier, calling guest...", + thread_id, iteration + ); + } + + let result = sandbox.call::<u64>("SpinForMs", GUEST_CALL_DURATION_MS); + trace!( + "[THREAD-{}] Iteration {}: Guest call returned: {:?}", + thread_id, + iteration, + result + .as_ref() + .map(|_| "Ok") + .map_err(|e| format!("{:?}", e)) + ); + + // Wait for killer
thread to finish if it was spawned + if let Some((kt, _)) = killer_thread { + trace!( + "[THREAD-{}] Iteration {}: Waiting for killer thread to join...", + thread_id, iteration + ); + let _ = kt.join(); + } + + // Process the result based on whether we attempted to kill + match result { + Err(HyperlightError::ExecutionCanceledByHost()) => { + // Restore the sandbox from the snapshot + trace!( + "[THREAD-{}] Iteration {}: Restoring sandbox from snapshot after ExecutionCanceledByHost...", + thread_id, iteration + ); + let sandbox_wrapper = guard.sandbox_with_snapshot.as_mut().unwrap(); + + // Make sure the sandbox is poisoned + assert!(sandbox_wrapper.sandbox.poisoned()); + + // Try to restore the snapshot + if let Err(e) = sandbox_wrapper.sandbox.restore(&sandbox_wrapper.snapshot) { + error!( + "CRITICAL: Thread {} iteration {}: Failed to restore snapshot: {:?}", + thread_id, iteration, e + ); + trace!( + "[THREAD-{}] Iteration {}: Creating new sandbox to replace failed one...", + thread_id, iteration + ); + + // Create a new sandbox with snapshot + match new_uninit_rust().and_then(|uninit| uninit.evolve()) { + Ok(mut new_sandbox) => { + match new_sandbox.snapshot() { + Ok(new_snapshot) => { + // Replace the failed sandbox with the new one + sandbox_wrapper.sandbox = new_sandbox; + sandbox_wrapper.snapshot = new_snapshot; + sandbox_replaced_count_clone + .fetch_add(1, Ordering::Relaxed); + trace!( + "[THREAD-{}] Iteration {}: Successfully replaced sandbox", + thread_id, iteration + ); + } + Err(snapshot_err) => { + error!( + "CRITICAL: Thread {} iteration {}: Failed to create snapshot for new sandbox: {:?}", + thread_id, iteration, snapshot_err + ); + // Still use the new sandbox even without snapshot + sandbox_wrapper.sandbox = new_sandbox; + sandbox_replaced_count_clone + .fetch_add(1, Ordering::Relaxed); + } + } + } + Err(create_err) => { + error!( + "CRITICAL: Thread {} iteration {}: Failed to create new sandbox: {:?}", + thread_id, iteration, create_err + ); + // Continue with the broken sandbox - it will be removed from pool eventually + } + } + } + + if should_kill { + // We attempted to kill and it was cancelled - SUCCESS + actually_killed_count_clone.fetch_add(1, Ordering::Relaxed); + } else { + // We did NOT attempt to kill but got cancelled - CRITICAL FAILURE + unexpected_cancelled_clone.fetch_add(1, Ordering::Relaxed); + error!( + "CRITICAL: Thread {} iteration {}: Got ExecutionCanceledByHost but did NOT attempt kill!", + thread_id, iteration + ); + } + } + Ok(_) => { + if should_kill { + // We attempted to kill but it completed OK - acceptable race condition + killed_but_completed_ok_clone.fetch_add(1, Ordering::Relaxed); + } else { + // We did NOT attempt to kill and it completed OK - EXPECTED + not_killed_completed_ok_clone.fetch_add(1, Ordering::Relaxed); + } + } + Err(_other_error) => { + // Log the other error so we can see what it is + error!( + "Thread {} iteration {}: Got non-cancellation error: {:?}", + thread_id, iteration, _other_error + ); + if should_kill { + // We attempted to kill and got some other error - acceptable + killed_but_error_clone.fetch_add(1, Ordering::Relaxed); + } else { + // We did NOT attempt to kill and got some other error - acceptable + not_killed_error_clone.fetch_add(1, Ordering::Relaxed); + } + } + } + + total_iterations_clone.fetch_add(1, Ordering::Relaxed); + + // Progress reporting + let current_total = total_iterations_clone.load(Ordering::Relaxed); + if current_total % 5000 == 0 { + println!( + "Progress: {}/{} iterations completed", + 
current_total, + NUM_THREADS * ITERATIONS_PER_THREAD + ); + } + + // === END OF ITERATION === + // SandboxGuard will automatically return sandbox to pool when it goes out of scope + } + + trace!( + "[THREAD-{}] Completed all {} iterations!", + thread_id, ITERATIONS_PER_THREAD + ); + }); + + thread_handles.push(handle); + } + + trace!( + "All {} worker threads spawned, waiting for completion...", + NUM_THREADS + ); + + // Wait for all threads to complete + for (idx, handle) in thread_handles.into_iter().enumerate() { + trace!("Waiting for thread {} to join...", idx); + handle.join().unwrap(); + trace!("Thread {} joined successfully", idx); + } + + trace!("All threads joined successfully!"); + + // Collect final statistics + let total = total_iterations.load(Ordering::Relaxed); + let kill_attempted = kill_attempted_count.load(Ordering::Relaxed); + let actually_killed = actually_killed_count.load(Ordering::Relaxed); + let not_killed_ok = not_killed_completed_ok.load(Ordering::Relaxed); + let not_killed_err = not_killed_error.load(Ordering::Relaxed); + let killed_but_ok = killed_but_completed_ok.load(Ordering::Relaxed); + let killed_but_err = killed_but_error.load(Ordering::Relaxed); + let unexpected_cancel = unexpected_cancelled.load(Ordering::Relaxed); + let sandbox_replaced = sandbox_replaced_count.load(Ordering::Relaxed); + + let no_kill_attempted = total - kill_attempted; + + // Print detailed statistics + println!("\n=== Interrupt Random Kill Stress Test Statistics ==="); + println!("Total iterations: {}", total); + println!(); + println!( + "Kill Attempts: {} ({:.1}%)", + kill_attempted, + (kill_attempted as f64 / total as f64) * 100.0 + ); + println!( + " - Actually killed (ExecutionCanceledByHost): {}", + actually_killed + ); + println!(" - Completed OK despite kill attempt: {}", killed_but_ok); + println!( + " - Error (non-cancelled) despite kill attempt: {}", + killed_but_err + ); + if kill_attempted > 0 { + println!( + " - Kill success rate: {:.1}%", + (actually_killed as f64 / kill_attempted as f64) * 100.0 + ); + } + println!(); + println!( + "No Kill Attempts: {} ({:.1}%)", + no_kill_attempted, + (no_kill_attempted as f64 / total as f64) * 100.0 + ); + println!(" - Completed OK: {}", not_killed_ok); + println!(" - Error (non-cancelled): {}", not_killed_err); + println!( + " - Cancelled (SHOULD BE 0): {} {}", + unexpected_cancel, + if unexpected_cancel == 0 { + "āœ…" + } else { + "āŒ FAILURE" + } + ); + println!(); + println!("Sandbox Management:"); + println!( + " - Sandboxes replaced due to restore failure: {}", + sandbox_replaced + ); + + // CRITICAL VALIDATIONS + assert_eq!( + unexpected_cancel, 0, + "FAILURE: {} non-killed calls returned ExecutionCanceledByHost! This indicates false kills.", + unexpected_cancel + ); + + assert!( + actually_killed > 0, + "FAILURE: No calls were actually killed despite {} kill attempts!", + kill_attempted + ); + + assert!( + kill_attempted > 0, + "FAILURE: No kill attempts were made (expected ~50% of {} iterations)!", + total + ); + + assert!( + kill_attempted < total, + "FAILURE: All {} iterations were kill attempts (expected ~50%)!", + total + ); + + // Verify total accounting + assert_eq!( + total, + actually_killed + + not_killed_ok + + not_killed_err + + killed_but_ok + + killed_but_err + + unexpected_cancel, + "Iteration accounting mismatch!" 
+ ); + + assert_eq!( + total, + NUM_THREADS * ITERATIONS_PER_THREAD, + "Not all iterations completed" + ); + + println!("\nāœ… All validations passed!"); +} diff --git a/src/tests/rust_guests/simpleguest/src/main.rs b/src/tests/rust_guests/simpleguest/src/main.rs index c3b17ad58..93c513b82 100644 --- a/src/tests/rust_guests/simpleguest/src/main.rs +++ b/src/tests/rust_guests/simpleguest/src/main.rs @@ -564,7 +564,44 @@ fn spin(_: &FunctionCall) -> Result<Vec<u8>> { Ok(get_flatbuffer_result(())) } -#[hyperlight_guest_tracing::trace_function] +/// Spins the CPU for approximately the specified number of milliseconds +fn spin_for_ms(fc: &FunctionCall) -> Result<Vec<u8>> { + let milliseconds = if let ParameterValue::UInt(ms) = fc.parameters.clone().unwrap()[0].clone() { + ms + } else { + return Err(HyperlightGuestError::new( + ErrorCode::GuestFunctionParameterTypeMismatch, + "Expected UInt parameter".to_string(), + )); + }; + + // Simple busy-wait loop - not precise but good enough for testing + // Different iteration counts for debug vs release mode to ensure reasonable CPU usage + #[cfg(debug_assertions)] + // Debug mode - less optimized. The value of 120,000 iterations per millisecond was chosen + // empirically so that, in the interrupt stress test, roughly half of the kill attempts land + // while the guest is still spinning. This stresses the host's cancellation and resource + // management logic; the value may need adjustment for other test environments or hardware. + let iterations_per_ms = 120_000; + + #[cfg(not(debug_assertions))] + let iterations_per_ms = 1_000_000; // Release mode - highly optimized + + let total_iterations = milliseconds * iterations_per_ms; + + let mut counter: u64 = 0; + for _ in 0..total_iterations { + // Prevent the compiler from optimizing away the loop + counter = counter.wrapping_add(1); + core::hint::black_box(counter); + } + + // Calculate the actual number of milliseconds spun for, based on the counter and iterations per ms + let ms_spun = counter / iterations_per_ms as u64; + Ok(get_flatbuffer_result(ms_spun)) +} + fn test_abort(function_call: &FunctionCall) -> Result<Vec<u8>> { if let ParameterValue::Int(code) = function_call.parameters.clone().unwrap()[0].clone() { abort_with_code(&[code as u8]); @@ -1215,6 +1252,14 @@ pub extern "C" fn hyperlight_main() { ); register_function(spin_def); + let spin_for_ms_def = GuestFunctionDefinition::new( + "SpinForMs".to_string(), + Vec::from(&[ParameterType::UInt]), + ReturnType::ULong, + spin_for_ms as usize, + ); + register_function(spin_for_ms_def); + let abort_def = GuestFunctionDefinition::new( "GuestAbortWithCode".to_string(), Vec::from(&[ParameterType::Int]),