Skip to content

patch(main): Live Migration Support #11

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 68 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
68 commits
Select commit Hold shift + click to select a range
cfed8de
feat: Add live migration and PVM support
pojntfx Aug 31, 2024
13bed1c
refactor: Drop PVM-specific changes
pojntfx Aug 31, 2024
d4dd430
Merge remote-tracking branch 'origin/main' into main-live-migration-pvm
pojntfx Sep 13, 2024
2702dcd
Merge branch 'main-live-migration-pvm' into main-live-migration
pojntfx Sep 13, 2024
be76c9b
Merge branch 'main' into main-live-migration-pvm
pojntfx Sep 19, 2024
1d74d63
Merge branch 'main-live-migration-pvm' into main-live-migration
pojntfx Sep 19, 2024
af2a30f
Merge branch 'main' into main-live-migration
ShivanshVij Oct 9, 2024
36dadb7
Merge branch 'main' into main-live-migration-pvm
ShivanshVij Oct 9, 2024
350ea99
Merge branch 'main' into main-live-migration
ShivanshVij Nov 28, 2024
cf7c0f6
Merge branch 'main' into main-live-migration-pvm
ShivanshVij Nov 28, 2024
072bd40
build: Switch from from-scratch builds to using AWS-vendored build im…
pojntfx Nov 28, 2024
e65e30f
build: Stage distro-independent binaries
pojntfx Nov 28, 2024
29497ec
Merge branch 'main-live-migration-pvm' into main-live-migration
pojntfx Nov 28, 2024
ff49ea8
refactor: Disable TSC scaling for PVM
pojntfx Dec 4, 2024
ac0927f
Merge branch 'main' into main-live-migration-pvm
ShivanshVij Dec 19, 2024
fbbb92c
Merge branch 'main' into main-live-migration
ShivanshVij Dec 19, 2024
82fe941
Merge commit '80b4cb4491273c76ec7c93746a36da9c7edd2388' into main-liv…
pojntfx Feb 4, 2025
e42e0bf
Merge commit '7b01bdc1e44d5eb4f62aa89c2a79a05331a7ee1e' into main-liv…
pojntfx Feb 4, 2025
6a8d274
Merge commit 'c18254467b695a71631d9896df13dcf3a9fd4db8' into main-liv…
pojntfx Feb 4, 2025
a4bc4c3
Merge commit '3afaf949ed220570cafe0ba84ce7580fe469884b' into main-liv…
pojntfx Feb 4, 2025
00df8ef
Merge commit '9cf1e6deeacfe2e6f0b85a501213cbbdb4434698' into main-liv…
pojntfx Feb 4, 2025
33ee81c
Merge commit '3793b9949774cf1d11abb2ef664664b8f40958a0' into main-liv…
pojntfx Feb 4, 2025
725c1cc
Merge commit '3fb06e940d8ad0508c31984cf0f8cb947282430b' into main-liv…
pojntfx Feb 4, 2025
cb42196
Merge commit '0efae509cc815774934e67f658bad143841f7cdd' into main-liv…
pojntfx Feb 4, 2025
c72c6a2
Merge commit '525e68639d18aa59820380179774a5489495f60a' into main-liv…
pojntfx Feb 4, 2025
75ac76c
Merge commit '43247e4226c4c7a9414f35e7bdbdf976b1de15b4' into main-liv…
pojntfx Feb 4, 2025
4e3b198
Merge commit '1bb9d1846b9c808e0a51f0a9bb773d20ba95fe69' into main-liv…
pojntfx Feb 4, 2025
56c0282
Merge remote-tracking branch 'origin/main' into main-live-migration-pvm
pojntfx Feb 4, 2025
5974b3b
Merge branch 'main-live-migration-pvm' into main-live-migration
pojntfx Feb 4, 2025
6a7433e
fix: Bump build image
pojntfx Feb 4, 2025
65154d7
Merge branch 'main-live-migration-pvm' into main-live-migration
pojntfx Feb 4, 2025
585de05
Merge branch 'main' into main-live-migration
pojntfx Feb 11, 2025
9b84c1d
Merge branch 'main' into main-live-migration-pvm
pojntfx Feb 11, 2025
3150ded
Adds upload of release builds to S3 (#17)
SuperManifolds Feb 12, 2025
4499b7f
Merge branch 'main-live-migration-pvm' into main-live-migration
pojntfx Feb 12, 2025
c42f8ef
Merge branch 'main' into main-live-migration
ShivanshVij Feb 20, 2025
9e61617
Merge branch 'main' into main-live-migration-pvm
ShivanshVij Feb 20, 2025
ef1eed8
Merge branch 'main' into main-live-migration
ShivanshVij Feb 25, 2025
5095e19
Merge branch 'main' into main-live-migration-pvm
ShivanshVij Feb 25, 2025
6e58971
Merge branch 'main' into main-live-migration
pojntfx Feb 26, 2025
1045bd9
Merge branch 'main' into main-live-migration-pvm
pojntfx Feb 26, 2025
2022804
ci: reduce IAM session time on publish (#18)
lgfa29 Mar 8, 2025
3ae6d09
Merge remote-tracking branch 'origin/main-live-migration-pvm' into ma…
pojntfx Mar 8, 2025
7083e95
Merge branch 'main' into main-live-migration
ShivanshVij Mar 10, 2025
bcb01ee
Merge branch 'main' into main-live-migration-pvm
ShivanshVij Mar 10, 2025
6d14fcf
Merge branch 'main' into main-live-migration
ShivanshVij Mar 14, 2025
512244d
fix: fixing potential vsock restore bug
ShivanshVij Mar 26, 2025
470483b
fix: synchronize internal queue indices with restored memory state
ShivanshVij Apr 6, 2025
b68fa01
fix: synchronize internal queue indices with restored memory state
ShivanshVij Apr 6, 2025
f60a149
Revert b68fa0123456d and 470483b77fd7c
lgfa29 Apr 8, 2025
c5daed9
revert: adding explicit queue sync
ShivanshVij Apr 8, 2025
0909d75
ci: set build metadata in version (#19)
lgfa29 Apr 9, 2025
6764ee4
ci: set build metadata in version (#20)
lgfa29 Apr 9, 2025
c34cbc7
Merge remote-tracking branch 'origin/main' into main-live-migration-pvm
pojntfx Apr 25, 2025
95b0e4e
Merge remote-tracking branch 'origin/main-live-migration-pvm' into ma…
pojntfx Apr 25, 2025
dad7b7a
Merge branch 'main' into main-live-migration
pojntfx May 1, 2025
f5708e2
Merge branch 'main' into main-live-migration-pvm
pojntfx May 1, 2025
83b41d0
Merge branch 'main' into main-live-migration
ShivanshVij May 4, 2025
0759320
Merge branch 'main' into main-live-migration-pvm
ShivanshVij May 4, 2025
d4290b9
Merge branch 'main' into main-live-migration
ShivanshVij May 14, 2025
709006a
Merge branch 'main' into main-live-migration-pvm
ShivanshVij May 14, 2025
40b2f3a
Merge branch 'main' into main-live-migration
pojntfx May 22, 2025
6c72dbe
Merge branch 'main' into main-live-migration-pvm
pojntfx May 22, 2025
3f2535b
Merge remote-tracking branch 'origin/main' into main-live-migration-pvm
pojntfx Jun 22, 2025
8e05c51
refactor: Clarify disabled CPU model check and re-enable queue checks
pojntfx Jun 22, 2025
2c8b9db
chore: Drop unnecessary formatting changes in queue
pojntfx Jun 22, 2025
2196c08
Merge branch 'main-live-migration-pvm' into main-live-migration
pojntfx Jun 22, 2025
32be42e
Merge branch 'main' into main-live-migration
ShivanshVij Jul 31, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 119 additions & 0 deletions .github/workflows/hydrun.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# Builds the statically linked release binaries for x86_64 and aarch64 with
# hydrun, publishes them as GitHub (pre-)releases and uploads them to S3.
name: hydrun CI

on:
  push:
  pull_request:
  schedule:
    - cron: "0 0 * * 0"

jobs:
  build-linux:
    runs-on: ${{ matrix.target.runner }}
    permissions:
      contents: read
    strategy:
      matrix:
        target:
          # Binaries
          - id: rust.x86_64
            src: .
            os: public.ecr.aws/firecracker/fcuvm:v77
            flags: ""
            cmd: ./Hydrunfile rust x86_64 loopholelabs
            dst: out/*
            runner: depot-ubuntu-22.04-32
          - id: rust.aarch64
            src: .
            os: public.ecr.aws/firecracker/fcuvm:v77
            flags: ""
            cmd: ./Hydrunfile rust aarch64 loopholelabs
            dst: out/*
            runner: depot-ubuntu-22.04-arm-32

    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Restore ccache
        uses: actions/cache/restore@v4
        with:
          path: |
            /tmp/ccache
          key: cache-ccache-${{ matrix.target.id }}
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Set up hydrun
        # -f makes curl fail on an HTTP error instead of saving the error page
        # and installing it as the hydrun binary.
        run: |
          curl -fL -o /tmp/hydrun "https://github.com/pojntfx/hydrun/releases/latest/download/hydrun.linux-$(uname -m)"
          sudo install /tmp/hydrun /usr/local/bin
      - name: Build with hydrun
        working-directory: ${{ matrix.target.src }}
        run: hydrun -o ${{ matrix.target.os }} ${{ matrix.target.flags }} "${{ matrix.target.cmd }}"
      - name: Fix permissions for output
        run: sudo chown -R "$USER" .
      - name: Save ccache
        # NOTE(review): the key is identical on every run; cache entries are
        # presumably not overwritten once a key exists - confirm this save
        # still refreshes the cache as intended.
        uses: actions/cache/save@v4
        with:
          path: |
            /tmp/ccache
          key: cache-ccache-${{ matrix.target.id }}
      - name: Upload output
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.target.id }}
          path: ${{ matrix.target.dst }}

  publish-linux:
    runs-on: ubuntu-latest
    permissions:
      id-token: write
      contents: write
    needs: build-linux

    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Download output
        uses: actions/download-artifact@v4
        with:
          path: /tmp/out
      - name: Publish pre-release to GitHub releases
        if: ${{ github.ref == 'refs/heads/main-live-migration-pvm' || github.ref == 'refs/heads/main-live-migration' || github.ref == 'refs/heads/firecracker-v1.8-live-migration-pvm' || github.ref == 'refs/heads/firecracker-v1.8-live-migration' }}
        uses: softprops/action-gh-release@v2
        with:
          tag_name: release-${{ github.ref_name }}
          prerelease: true
          files: |
            /tmp/out/*/*
      - name: Publish release to GitHub releases
        if: startsWith(github.ref, 'refs/tags/v')
        uses: softprops/action-gh-release@v2
        with:
          prerelease: false
          files: |
            /tmp/out/*/*
      - name: "Configure AWS credentials"
        # Only the S3 upload below needs credentials, so skip this step under
        # the same condition instead of assuming a role on pull-request refs.
        if: "!startsWith(github.ref, 'refs/pull/')"
        uses: "aws-actions/configure-aws-credentials@v4"
        with:
          aws-region: "${{ vars.AWS_REGION }}"
          role-to-assume: "${{ vars.AWS_IAM_ROLE }}"
          role-session-name: "firecracker-hydrun-${{ github.job }}-${{ github.run_id }}"
          role-duration-seconds: 900

      - name: Upload to S3
        if: "!startsWith(github.ref, 'refs/pull/')"
        env:
          # Passed through the environment rather than expanded into the
          # script text, so the value is never parsed as shell code.
          S3_BUCKET_URL: ${{ vars.S3_BUCKET_URL }}
        run: |
          # GITHUB_REF is read from the environment for the same reason: ref
          # names may contain shell metacharacters.
          if [[ "$GITHUB_REF" == refs/tags/* ]]; then
            UPLOAD_FOLDER="release/${GITHUB_REF#refs/tags/}"
          elif [[ "$GITHUB_REF" == refs/heads/* ]]; then
            UPLOAD_FOLDER="dev/${GITHUB_REF#refs/heads/}"
          else
            echo "Skipping S3 upload: unsupported ref type $GITHUB_REF"
            exit 0
          fi
          echo "Uploading artifacts to: ${S3_BUCKET_URL}firecracker/${UPLOAD_FOLDER}/"
          aws s3 cp /tmp/out "${S3_BUCKET_URL}firecracker/${UPLOAD_FOLDER}/" --recursive
          # Second copy keyed by commit SHA, independent of the branch or tag name.
          UPLOAD_FOLDER="dev/sha/${GITHUB_SHA}"
          echo "Uploading artifacts to: ${S3_BUCKET_URL}firecracker/${UPLOAD_FOLDER}/"
          aws s3 cp /tmp/out "${S3_BUCKET_URL}firecracker/${UPLOAD_FOLDER}/" --recursive
30 changes: 30 additions & 0 deletions Hydrunfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash
#
# hydrun build entry point.
#
# Usage: ./Hydrunfile rust <arch> <vendor>
#   <arch>    architecture prefix of the "<arch>-unknown-linux-musl" target
#             (e.g. x86_64, aarch64)
#   <vendor>  identifier embedded in the build metadata of the version string
#
# Builds the release binaries with a statically linked C runtime and stages
# every executable file from the cargo output directory as
# out/<name>.linux-<arch>.

set -e

# Rust
if [ "$1" = "rust" ]; then
    # Fail early with a clear message instead of building a bogus target
    # triple or version string from empty arguments.
    arch="${2:?usage: $0 rust <arch> <vendor>}"
    vendor="${3:?usage: $0 rust <arch> <vendor>}"

    # Configure Git
    git config --global --add safe.directory '*'

    # Set build metadata in version
    current_version=$(grep '^version =' src/firecracker/Cargo.toml | cut -d'"' -f 2)
    git_sha=$(git rev-parse HEAD | head -c 12)
    ./tools/bump-version.sh "$current_version+$vendor.$git_sha"

    # Build
    export RUSTFLAGS='-C target-feature=+crt-static'
    cargo build --package firecracker --package jailer --package seccompiler --package rebase-snap --package cpu-template-helper --target "$arch-unknown-linux-musl" --all-features --release

    # Stage binaries
    mkdir -p out

    dir="./build/cargo_target/$arch-unknown-linux-musl/release"
    # Iterate with a glob rather than parsing `ls`, so file names containing
    # whitespace or glob characters are handled as single entries.
    for path in "$dir"/*; do
        if [[ -x "$path" && ! -d "$path" ]]; then
            cp "$path" "./out/$(basename "$path").linux-$arch"
        fi
    done

    exit 0
fi
4 changes: 4 additions & 0 deletions resources/seccomp/aarch64-unknown-linux-musl.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@
{
"syscall": "fsync"
},
{
"syscall": "msync",
"comment": "Used for live migration to sync dirty pages"
},
{
"syscall": "close"
},
Expand Down
4 changes: 4 additions & 0 deletions resources/seccomp/x86_64-unknown-linux-musl.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@
{
"syscall": "fsync"
},
{
"syscall": "msync",
"comment": "Used for live migration to sync dirty pages"
},
{
"syscall": "close"
},
Expand Down
8 changes: 8 additions & 0 deletions src/firecracker/src/api_server/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,14 @@ impl ApiServer {
&METRICS.latencies_us.diff_create_snapshot,
"create diff snapshot",
)),
SnapshotType::Msync => Some((
&METRICS.latencies_us.diff_create_snapshot,
"memory synchronization snapshot",
)),
SnapshotType::MsyncAndState => Some((
&METRICS.latencies_us.diff_create_snapshot,
"memory synchronization and state snapshot",
)),
},
VmmAction::LoadSnapshot(_) => {
Some((&METRICS.latencies_us.load_snapshot, "load snapshot"))
Expand Down
1 change: 1 addition & 0 deletions src/firecracker/src/api_server/request/snapshot.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ fn parse_put_snapshot_load(body: &Body) -> Result<ParsedRequest, RequestError> {
|| snapshot_config.track_dirty_pages,
resume_vm: snapshot_config.resume_vm,
network_overrides: snapshot_config.network_overrides,
shared: snapshot_config.shared,
};

// Construct the `ParsedRequest` object.
Expand Down
7 changes: 7 additions & 0 deletions src/firecracker/swagger/firecracker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1218,6 +1218,8 @@ definitions:
enum:
- Full
- Diff
- Msync
- MsyncAndState
description:
Type of snapshot to create. It is optional and by default, a full
snapshot is created.
Expand Down Expand Up @@ -1280,6 +1282,11 @@ definitions:
description: Network host device names to override
items:
$ref: "#/definitions/NetworkOverride"
shared:
type: boolean
description: When set to true and the guest memory backend is a file,
changes to the memory are asynchronously written back to the
backend as the VM is running.


TokenBucket:
Expand Down
8 changes: 6 additions & 2 deletions src/vmm/src/arch/x86_64/vcpu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -629,8 +629,12 @@ impl KvmVcpu {
}

/// Scale the TSC frequency of this vCPU to the one provided as a parameter.
pub fn set_tsc_khz(&self, tsc_freq: u32) -> Result<(), SetTscError> {
self.fd.set_tsc_khz(tsc_freq).map_err(SetTscError)
pub fn set_tsc_khz(&self, _: u32) -> Result<(), SetTscError> {
// Disable TSC scaling when using PVM because it is unsupported on most virtualized platforms.
// Even on supported platforms like virtualized AMD CPUs, enabling TSC scaling can cause VM freezes
// after resuming from a snapshot.
// For more details, see https://github.com/virt-pvm/linux/issues/12#issue-2515360332
Ok(())
}

/// Use provided state to populate KVM internal state.
Expand Down
3 changes: 2 additions & 1 deletion src/vmm/src/cpu_config/x86_64/custom_cpu_template.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ impl GetCpuTemplate for Option<CpuTemplateType> {

let cpu_model = CpuModel::get_cpu_model();
if !template.get_supported_cpu_models().contains(&cpu_model) {
return Err(InvalidCpuModel);
// Disable the CPU compatibility check to allow using templates like T2A on more modern CPUs
// return Err(InvalidCpuModel);
}

match template {
Expand Down
6 changes: 6 additions & 0 deletions src/vmm/src/logger/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -609,13 +609,19 @@ pub struct PerformanceMetrics {
pub vmm_pause_vm: SharedStoreMetric,
/// Measures the microVM resuming duration, at the VMM level, in microseconds.
pub vmm_resume_vm: SharedStoreMetric,
/// Measures the snapshot memory synchronization time, at the VMM level, in microseconds.
pub msync_create_snapshot: SharedStoreMetric,
/// Measures the snapshot memory synchronization and state time, at the VMM level, in microseconds.
pub msync_and_state_create_snapshot: SharedStoreMetric,
}
impl PerformanceMetrics {
/// Const default construction.
pub const fn new() -> Self {
Self {
full_create_snapshot: SharedStoreMetric::new(),
diff_create_snapshot: SharedStoreMetric::new(),
msync_create_snapshot: SharedStoreMetric::new(),
msync_and_state_create_snapshot: SharedStoreMetric::new(),
load_snapshot: SharedStoreMetric::new(),
pause_vm: SharedStoreMetric::new(),
resume_vm: SharedStoreMetric::new(),
Expand Down
49 changes: 37 additions & 12 deletions src/vmm/src/persist.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ use crate::utils::u64_to_usize;
use crate::vmm_config::boot_source::BootSourceConfig;
use crate::vmm_config::instance_info::InstanceInfo;
use crate::vmm_config::machine_config::{HugePageConfig, MachineConfigError, MachineConfigUpdate};
use crate::vmm_config::snapshot::{CreateSnapshotParams, LoadSnapshotParams, MemBackendType};
use crate::vmm_config::snapshot::{
CreateSnapshotParams, LoadSnapshotParams, MemBackendType, SnapshotType,
};
use crate::vstate::kvm::KvmState;
use crate::vstate::memory;
use crate::vstate::memory::{GuestMemoryState, GuestRegionMmap, MemoryError};
Expand Down Expand Up @@ -137,6 +139,8 @@ pub enum CreateSnapshotError {
DirtyBitmap(#[from] VmError),
/// Cannot write memory file: {0}
Memory(#[from] MemoryError),
/// Cannot msync memory file: {0}
MemoryMsync(MemoryError),
/// Cannot perform {0} on the memory backing file: {1}
MemoryBackingFile(&'static str, io::Error),
/// Cannot save the microVM state: {0}
Expand All @@ -156,11 +160,16 @@ pub fn create_snapshot(
vm_info: &VmInfo,
params: &CreateSnapshotParams,
) -> Result<(), CreateSnapshotError> {
let microvm_state = vmm
.save_state(vm_info)
.map_err(CreateSnapshotError::MicrovmState)?;
match params.snapshot_type {
SnapshotType::Diff | SnapshotType::Full | SnapshotType::MsyncAndState => {
let microvm_state = vmm
.save_state(vm_info)
.map_err(CreateSnapshotError::MicrovmState)?;

snapshot_state_to_file(&microvm_state, &params.snapshot_path)?;
snapshot_state_to_file(&microvm_state, &params.snapshot_path)?;
}
SnapshotType::Msync => (),
}

vmm.vm
.snapshot_memory_to_file(&params.mem_file_path, params.snapshot_type)?;
Expand Down Expand Up @@ -384,8 +393,13 @@ pub fn restore_from_snapshot(
.into());
}
(
guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages)
.map_err(RestoreFromSnapshotGuestMemoryError::File)?,
guest_memory_from_file(
mem_backend_path,
mem_state,
track_dirty_pages,
params.shared,
)
.map_err(RestoreFromSnapshotGuestMemoryError::File)?,
None,
)
}
Expand Down Expand Up @@ -415,7 +429,7 @@ pub enum SnapshotStateFromFileError {
/// Failed to open snapshot file: {0}
Open(std::io::Error),
/// Failed to read snapshot file metadata: {0}
Meta(std::io::Error),
Meta(crate::snapshot::SnapshotError),
/// Failed to load snapshot state from file: {0}
Load(#[from] crate::snapshot::SnapshotError),
/// Unknown Network Device.
Expand All @@ -428,8 +442,9 @@ fn snapshot_state_from_file(
let snapshot = Snapshot::new(SNAPSHOT_VERSION);
let mut snapshot_reader =
File::open(snapshot_path).map_err(SnapshotStateFromFileError::Open)?;
let metadata = std::fs::metadata(snapshot_path).map_err(SnapshotStateFromFileError::Meta)?;
let snapshot_len = u64_to_usize(metadata.len());
let raw_snapshot_len: u64 =
Snapshot::deserialize(&mut snapshot_reader).map_err(SnapshotStateFromFileError::Meta)?;
let snapshot_len = u64_to_usize(raw_snapshot_len);
let state: MicrovmState = snapshot
.load_with_version_check(&mut snapshot_reader, snapshot_len)
.map_err(SnapshotStateFromFileError::Load)?;
Expand All @@ -451,9 +466,19 @@ fn guest_memory_from_file(
mem_file_path: &Path,
mem_state: &GuestMemoryState,
track_dirty_pages: bool,
shared: bool,
) -> Result<Vec<GuestRegionMmap>, GuestMemoryFromFileError> {
let mem_file = File::open(mem_file_path)?;
let guest_mem = memory::snapshot_file(mem_file, mem_state.regions(), track_dirty_pages)?;
let mem_file = if shared {
OpenOptions::new()
.read(true)
.write(true)
.open(mem_file_path)?
} else {
File::open(mem_file_path)?
};

let guest_mem =
memory::snapshot_file(mem_file, mem_state.regions(), track_dirty_pages, shared)?;
Ok(guest_mem)
}

Expand Down
20 changes: 20 additions & 0 deletions src/vmm/src/rpc_interface.rs
Original file line number Diff line number Diff line change
Expand Up @@ -782,6 +782,26 @@ impl RuntimeApiController {
elapsed_time_us
);
}
SnapshotType::Msync => {
let elapsed_time_us = update_metric_with_elapsed_time(
&METRICS.latencies_us.msync_create_snapshot,
create_start_us,
);
info!(
"'create memory synchronization snapshot' VMM action took {} us.",
elapsed_time_us
);
}
SnapshotType::MsyncAndState => {
let elapsed_time_us = update_metric_with_elapsed_time(
&METRICS.latencies_us.msync_and_state_create_snapshot,
create_start_us,
);
info!(
"'create memory synchronization and state snapshot' VMM action took {} us.",
elapsed_time_us
);
}
}
Ok(VmmData::Empty)
}
Expand Down
Loading
Loading