Prevent an ABA problem where the vcpu is successfully interrupted, but a new function call is scheduled before the interruptor thread has time to observe that the vcpu was interrupted

ludfjig · ludfjig · commit de5c32872c0c · 2025-06-03T13:26:23.000-07:00
Signed-off-by: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com>
diff --git a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs
@@ -396,7 +396,7 @@ impl HypervLinuxDriver {
             entrypoint: entrypoint_ptr.absolute()?,
             orig_rsp: rsp_ptr,
             interrupt_handle: Arc::new(LinuxInterruptHandle {
-                running: AtomicBool::new(false),
+                running: AtomicU64::new(0),
                 cancel_requested: AtomicBool::new(false),
                 tid: AtomicU64::new(unsafe { libc::pthread_self() }),
                 retry_delay: config.get_interrupt_retry_delay(),
@@ -591,7 +591,14 @@ impl Hypervisor for HypervLinuxDriver {
             .store(unsafe { libc::pthread_self() as u64 }, Ordering::Relaxed);
         // Note: if a `InterruptHandle::kill()` called while this thread is **here**
         // Then this is fine since `cancel_requested` is set to true, so we will skip the `VcpuFd::run()` call
-        self.interrupt_handle.running.store(true, Ordering::Relaxed);
+        self.interrupt_handle
+            .set_running_and_increment_generation()
+            .map_err(|e| {
+                new_error!(
+                    "Error setting running state and incrementing generation: {}",
+                    e
+                )
+            })?;
         // Don't run the vcpu if `cancel_requested` is true
         //
         // Note: if a `InterruptHandle::kill()` called while this thread is **here**
@@ -629,9 +636,7 @@ impl Hypervisor for HypervLinuxDriver {
         // Then `cancel_requested` will be set to true again, which will cancel the **next vcpu run**.
         // Additionally signals will be sent to this thread until `running` is set to false.
         // This is fine since the signal handler is a no-op.
-        self.interrupt_handle
-            .running
-            .store(false, Ordering::Relaxed);
+        self.interrupt_handle.clear_running_bit();
         // At this point, `running` is false so no more signals will be sent to this thread,
         // but we may still receive async signals that were sent before this point.
         // To prevent those signals from interrupting subsequent calls to `run()`,
diff --git a/src/hyperlight_host/src/hypervisor/kvm.rs b/src/hyperlight_host/src/hypervisor/kvm.rs
@@ -351,7 +351,7 @@ impl KVMDriver {
             orig_rsp: rsp_gp,
             mem_regions,
             interrupt_handle: Arc::new(LinuxInterruptHandle {
-                running: AtomicBool::new(false),
+                running: AtomicU64::new(0),
                 cancel_requested: AtomicBool::new(false),
                 tid: AtomicU64::new(unsafe { libc::pthread_self() }),
                 retry_delay: config.get_interrupt_retry_delay(),
@@ -526,7 +526,14 @@ impl Hypervisor for KVMDriver {
             .store(unsafe { libc::pthread_self() as u64 }, Ordering::Relaxed);
         // Note: if a `InterruptHandle::kill()` called while this thread is **here**
         // Then this is fine since `cancel_requested` is set to true, so we will skip the `VcpuFd::run()` call
-        self.interrupt_handle.running.store(true, Ordering::Relaxed);
+        self.interrupt_handle
+            .set_running_and_increment_generation()
+            .map_err(|e| {
+                new_error!(
+                    "Error setting running state and incrementing generation: {}",
+                    e
+                )
+            })?;
         // Don't run the vcpu if `cancel_requested` is true
         //
         // Note: if a `InterruptHandle::kill()` called while this thread is **here**
@@ -558,9 +565,7 @@ impl Hypervisor for KVMDriver {
         // Then `cancel_requested` will be set to true again, which will cancel the **next vcpu run**.
         // Additionally signals will be sent to this thread until `running` is set to false.
         // This is fine since the signal handler is a no-op.
-        self.interrupt_handle
-            .running
-            .store(false, Ordering::Relaxed);
+        self.interrupt_handle.clear_running_bit();
         // At this point, `running` is false so no more signals will be sent to this thread,
         // but we may still receive async signals that were sent before this point.
         // To prevent those signals from interrupting subsequent calls to `run()` (on other vms!),
diff --git a/src/hyperlight_host/src/hypervisor/mod.rs b/src/hyperlight_host/src/hypervisor/mod.rs
@@ -338,8 +338,23 @@ pub trait InterruptHandle: Send + Sync {
 #[cfg(any(kvm, mshv))]
 #[derive(Debug)]
 pub(super) struct LinuxInterruptHandle {
-    /// Invariant: vcpu is running => `running` is true. (Neither converse nor inverse is true)
-    running: AtomicBool,
+    /// Invariant: vcpu is running => most significant bit (63) of `running` is set. (Neither converse nor inverse is true)
+    ///
+    /// Additionally, bits 0-62 hold a generation counter that is incremented each time `run()` is called (wrapping to 0 after `MAX_GENERATION`).
+    ///
+    /// This prevents an ABA problem where:
+    /// 1. The VCPU is running (generation N),
+    /// 2. It gets cancelled,
+    /// 3. Then quickly restarted (generation N+1),
+    ///     before the interruptor thread has observed that it was cancelled.
+    ///
+    /// Without this generation counter, the interrupt logic might assume the VCPU is still
+    /// in the *original* run (generation N), see that it's `running`, and re-send the signal.
+    /// But the new VCPU run (generation N+1) would treat this as a stale signal and ignore it,
+    /// potentially causing an infinite loop where no effective interrupt is delivered.
+    ///
+    /// Invariant: If the VCPU is running, bits 0-62 of `running` match the current run's generation.
+    running: AtomicU64,
     /// Invariant: vcpu is running => `tid` is the thread on which it is running.
     /// Note: multiple vms may have the same `tid`, but at most one vm will have `running` set to true.
     tid: AtomicU64,
@@ -359,15 +374,61 @@ pub(super) struct LinuxInterruptHandle {
     sig_rt_min_offset: u8,
 }
 
+#[cfg(any(kvm, mshv))]
+impl LinuxInterruptHandle {
+    const RUNNING_BIT: u64 = 1 << 63;
+    const MAX_GENERATION: u64 = Self::RUNNING_BIT - 1;
+
+    // Sets the running bit and increments the generation (wrapping to 0 after `MAX_GENERATION`). Returns the previous raw value; never `Err`, because the update closure always returns `Some`.
+    fn set_running_and_increment_generation(&self) -> std::result::Result<u64, u64> {
+        self.running
+            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |raw| {
+                let generation = raw & !Self::RUNNING_BIT;
+                if generation == Self::MAX_GENERATION {
+                    // restart generation from 0
+                    return Some(Self::RUNNING_BIT);
+                }
+                Some((generation + 1) | Self::RUNNING_BIT)
+            })
+    }
+
+    // Clears the running bit and returns the previous raw value (generation in bits 0-62, plus the running bit if it was set).
+    fn clear_running_bit(&self) -> u64 {
+        self.running
+            .fetch_and(!Self::RUNNING_BIT, Ordering::Relaxed)
+    }
+
+    fn get_running_and_generation(&self) -> (bool, u64) {
+        let raw = self.running.load(Ordering::Relaxed);
+        let running = raw & Self::RUNNING_BIT != 0;
+        let generation = raw & !Self::RUNNING_BIT;
+        (running, generation)
+    }
+}
+
 #[cfg(any(kvm, mshv))]
 impl InterruptHandle for LinuxInterruptHandle {
     fn kill(&self) -> bool {
         self.cancel_requested.store(true, Ordering::Relaxed);
 
         let signal_number = libc::SIGRTMIN() + self.sig_rt_min_offset as libc::c_int;
         let mut sent_signal = false;
+        let mut target_generation: Option<u64> = None;
+
+        loop {
+            let (running, generation) = self.get_running_and_generation();
+
+            if !running {
+                break;
+            }
+
+            match target_generation {
+                None => target_generation = Some(generation),
+                // prevent ABA problem
+                Some(expected) if expected != generation => break,
+                _ => {}
+            }
 
-        while self.running.load(Ordering::Relaxed) {
             log::info!("Sending signal to kill vcpu thread...");
             sent_signal = true;
             unsafe {
diff --git a/src/hyperlight_host/tests/integration_test.rs b/src/hyperlight_host/tests/integration_test.rs
@@ -283,10 +283,14 @@ fn interrupt_moved_sandbox() {
     thread2.join().expect("Thread should finish");
 }
 
+/// This test exercises the behavior of killing the vcpu with a long retry delay.
+/// It triggers the ABA problem, where the vcpu is successfully interrupted
+/// but restarted before the interruptor thread has a chance to see that the vcpu was killed.
+///
+/// The ABA-problem is solved by introducing run-generation on the vcpu.
 #[test]
 #[cfg(target_os = "linux")]
 fn interrupt_custom_signal_no_and_retry_delay() {
-    env_logger::builder().filter_level(LevelFilter::Info).init();
     let mut config = SandboxConfiguration::default();
     config.set_interrupt_vcpu_sigrtmin_offset(0).unwrap();
     config.set_interrupt_retry_delay(Duration::from_secs(1));
@@ -301,26 +305,24 @@ fn interrupt_custom_signal_no_and_retry_delay() {
 
     let interrupt_handle = sbox1.interrupt_handle();
     assert!(!interrupt_handle.dropped()); // not yet dropped
-    let barrier = Arc::new(Barrier::new(2));
-    let barrier2 = barrier.clone();
 
     const NUM_ITERS: usize = 3;
 
     let thread = thread::spawn(move || {
         for _ in 0..NUM_ITERS {
-            barrier2.wait();
             // wait for the guest call to start
             thread::sleep(Duration::from_millis(1000));
             interrupt_handle.kill();
         }
     });
 
     for _ in 0..NUM_ITERS {
-        barrier.wait();
         let res = sbox1
             .call_guest_function_by_name::<i32>("Spin", ())
             .unwrap_err();
         assert!(matches!(res, HyperlightError::ExecutionCanceledByHost()));
+        // immediately re-enter another guest function call after having been cancelled,
+        // so that the vcpu is running again before the interruptor thread has a chance to see that the vcpu is not running
     }
     thread.join().expect("Thread should finish");
 }