diff --git a/openhcl/openhcl_boot/src/arch/x86_64/memory.rs b/openhcl/openhcl_boot/src/arch/x86_64/memory.rs
index bcf72c41d5..3f9ee90ced 100644
--- a/openhcl/openhcl_boot/src/arch/x86_64/memory.rs
+++ b/openhcl/openhcl_boot/src/arch/x86_64/memory.rs
@@ -121,10 +121,19 @@ pub fn setup_vtl2_memory(shim_params: &ShimParams, partition_info: &PartitionInf
         }
     }
 
-    // For TDVMCALL based hypercalls, take the first 2 MB region from ram_buffer for
-    // hypercall IO pages. ram_buffer must not be used again beyond this point
-    // TODO: find an approach that does not require re-using the ram_buffer
     if shim_params.isolation_type == IsolationType::Tdx {
+        // Update the VP context stored in the page of the architectural
+        // reset vector, such that TDX APs start with the appropriate
+        // page table and execution controls.
+        crate::arch::tdx::tdx_prepare_ap_trampoline(
+            shim_params
+                .ap_page_tables
+                .expect("AP page tables must be provided for TDX mailbox boot")
+                .start(),
+        );
+
+        // For TDVMCALL based hypercalls, take the first 2 MB region from ram_buffer for
+        // hypercall IO pages. ram_buffer must not be used again beyond this point.
         let free_buffer = ram_buffer.as_mut_ptr() as u64;
         assert!(free_buffer % X64_LARGE_PAGE_SIZE == 0);
         // SAFETY: The bottom 2MB region of the ram_buffer is unused by the shim
diff --git a/openhcl/openhcl_boot/src/arch/x86_64/tdx.rs b/openhcl/openhcl_boot/src/arch/x86_64/tdx.rs
index df4e0247b6..ea32ea8fc4 100644
--- a/openhcl/openhcl_boot/src/arch/x86_64/tdx.rs
+++ b/openhcl/openhcl_boot/src/arch/x86_64/tdx.rs
@@ -182,7 +182,7 @@ pub fn get_tdx_tsc_reftime() -> Option<u64> {
 /// Update the TdxTrampolineContext, setting the necessary control registers for AP startup,
 /// and ensuring that LGDT will be skipped, so the GDT page does not need to be added to the
 /// e820 entries
-pub fn tdx_prepare_ap_trampoline() {
+pub fn tdx_prepare_ap_trampoline(cr3: u64) {
     let context_ptr: *mut TdxTrampolineContext = RESET_VECTOR_PAGE as *mut TdxTrampolineContext;
     // SAFETY: The TdxTrampolineContext is known to be stored at the architectural reset vector address
     let tdxcontext: &mut TdxTrampolineContext = unsafe { context_ptr.as_mut().unwrap() };
@@ -191,13 +191,11 @@ pub fn tdx_prepare_ap_trampoline() {
     tdxcontext.code_selector = 0;
     tdxcontext.task_selector = 0;
     tdxcontext.cr0 |= x86defs::X64_CR0_PG | x86defs::X64_CR0_PE | x86defs::X64_CR0_NE;
+    tdxcontext.cr3 = cr3;
     tdxcontext.cr4 |= x86defs::X64_CR4_PAE | x86defs::X64_CR4_MCE;
 }
 
 pub fn setup_vtl2_vp(partition_info: &PartitionInfo) {
-    // Update the TDX Trampoline Context for AP Startup
-    tdx_prepare_ap_trampoline();
-
     for cpu in 1..partition_info.cpus.len() {
         hvcall()
             .tdx_enable_vp_vtl2(cpu as u32)
diff --git a/openhcl/openhcl_boot/src/host_params/dt.rs b/openhcl/openhcl_boot/src/host_params/dt.rs
index f8caff857a..43e662ff14 100644
--- a/openhcl/openhcl_boot/src/host_params/dt.rs
+++ b/openhcl/openhcl_boot/src/host_params/dt.rs
@@ -472,13 +472,13 @@ impl PartitionInfo {
         ));
     }
 
-    // Only specify pagetables as a reserved region on TDX, as they are used
+    // Only specify AP page tables as a reserved region on TDX, as they are used
     // for AP startup via the mailbox protocol. On other platforms, the
     // memory is free to be reclaimed.
     if params.isolation_type == IsolationType::Tdx {
-        assert!(params.page_tables.is_some());
+        assert!(params.ap_page_tables.is_some());
         address_space_builder = address_space_builder
-            .with_page_tables(params.page_tables.expect("always present on tdx"));
+            .with_page_tables(params.ap_page_tables.expect("always present on tdx"));
     }
 
     address_space_builder
diff --git a/openhcl/openhcl_boot/src/host_params/shim_params.rs b/openhcl/openhcl_boot/src/host_params/shim_params.rs
index 3c360103dd..158d8904dc 100644
--- a/openhcl/openhcl_boot/src/host_params/shim_params.rs
+++ b/openhcl/openhcl_boot/src/host_params/shim_params.rs
@@ -101,8 +101,8 @@ pub struct ShimParams {
     /// Memory used by the shim.
     pub used: MemoryRange,
     pub bounce_buffer: Option<MemoryRange>,
-    /// Page tables region used by the shim.
-    pub page_tables: Option<MemoryRange>,
+    /// AP page tables region used by the shim.
+    pub ap_page_tables: Option<MemoryRange>,
 }
 
 impl ShimParams {
@@ -129,8 +129,8 @@ impl ShimParams {
             used_end,
             bounce_buffer_start,
             bounce_buffer_size,
-            page_tables_start,
-            page_tables_size,
+            ap_page_tables_start,
+            ap_page_tables_size,
         } = raw;
 
         let isolation_type = get_isolation_type(supported_isolation_type);
@@ -142,11 +142,11 @@ impl ShimParams {
             Some(MemoryRange::new(base..base + bounce_buffer_size))
         };
 
-        let page_tables = if page_tables_size == 0 {
+        let ap_page_tables = if ap_page_tables_size == 0 {
             None
         } else {
-            let base = shim_base_address.wrapping_add_signed(page_tables_start);
-            Some(MemoryRange::new(base..base + page_tables_size))
+            let base = shim_base_address.wrapping_add_signed(ap_page_tables_start);
+            Some(MemoryRange::new(base..base + ap_page_tables_size))
         };
 
         Self {
@@ -171,7 +171,7 @@ impl ShimParams {
                     ..shim_base_address.wrapping_add_signed(used_end),
             ),
             bounce_buffer,
-            page_tables,
+            ap_page_tables,
         }
     }
 
diff --git a/vm/loader/loader_defs/src/shim.rs b/vm/loader/loader_defs/src/shim.rs
index 4170bab3dc..a82de512f7 100644
--- a/vm/loader/loader_defs/src/shim.rs
+++ b/vm/loader/loader_defs/src/shim.rs
@@ -53,10 +53,10 @@ pub struct ShimParamsRaw {
     pub bounce_buffer_start: i64,
     /// The size of the bounce buffer range. This is 0 if unavailable.
     pub bounce_buffer_size: u64,
-    /// The offset to the page_tables start address. This is 0 if unavailable.
-    pub page_tables_start: i64,
-    /// The size of the openhcl_boot page tables. This is 0 if unavailable.
-    pub page_tables_size: u64,
+    /// The offset to the ap_page_tables start address. This is 0 if unavailable.
+    pub ap_page_tables_start: i64,
+    /// The size of the openhcl_boot AP page tables. This is 0 if unavailable.
+    pub ap_page_tables_size: u64,
 }
 
 open_enum! {
diff --git a/vm/loader/manifests/openhcl-x64-cvm-dev.json b/vm/loader/manifests/openhcl-x64-cvm-dev.json
index d5eef86cd0..8a9a59c52c 100644
--- a/vm/loader/manifests/openhcl-x64-cvm-dev.json
+++ b/vm/loader/manifests/openhcl-x64-cvm-dev.json
@@ -57,4 +57,4 @@
             }
         }
     ]
-}
\ No newline at end of file
+}
diff --git a/vm/loader/page_table/src/x64.rs b/vm/loader/page_table/src/x64.rs
index bbf13e7aaf..24aaafc4e4 100644
--- a/vm/loader/page_table/src/x64.rs
+++ b/vm/loader/page_table/src/x64.rs
@@ -130,9 +130,13 @@ impl PageTableEntry {
     const VALID_BITS: u64 = 0x000f_ffff_ffff_f000;
 
     /// Set an AMD64 PDE to either represent a leaf 2MB page or PDE.
-    /// This sets the PTE to preset, accessed, dirty, read write execute.
-    pub fn set_entry(&mut self, entry_type: PageTableEntryType) {
-        self.entry = X64_PTE_PRESENT | X64_PTE_ACCESSED | X64_PTE_READ_WRITE;
+    /// This sets the PTE to present and accessed, and to read/write unless `read_only` is set.
+    pub fn set_entry(&mut self, entry_type: PageTableEntryType, read_only: bool) {
+        if read_only {
+            self.entry = X64_PTE_PRESENT | X64_PTE_ACCESSED;
+        } else {
+            self.entry = X64_PTE_PRESENT | X64_PTE_ACCESSED | X64_PTE_READ_WRITE;
+        }
 
         match entry_type {
             PageTableEntryType::Leaf1GbPage(address) => {
@@ -282,6 +286,7 @@ pub struct PageTableBuilder {
     local_map: Option<(u64, u64)>,
     confidential_bit: Option<u32>,
     map_reset_vector: bool,
+    read_only: bool,
 }
 
 impl PteOps for PageTableBuilder {
@@ -307,6 +312,7 @@ impl PageTableBuilder {
             size: 0,
             local_map: None,
             confidential_bit: None,
+            read_only: false,
             map_reset_vector: false,
         }
     }
@@ -333,6 +339,12 @@ impl PageTableBuilder {
         self
     }
 
+    /// Map all pages as read-only.
+    pub fn with_read_only(mut self, read_only: bool) -> Self {
+        self.read_only = read_only;
+        self
+    }
+
     /// Build a set of X64 page tables identity mapping the given region. `size` must be less than 512GB.
     /// This creates up to 3+N page tables: 1 PML4E and up to 2 PDPTE tables, and N page tables counted at 1 per GB of size,
     /// for 2MB mappings.
@@ -504,6 +516,7 @@ pub fn build_page_tables_64(
     address_bias: u64,
     identity_map_size: IdentityMapSize,
     pml4e_link: Option<(u64, u64)>,
+    read_only: bool,
 ) -> Vec<u8> {
     // Allocate page tables. There are up to 6 total page tables:
     //  1 PML4E (Level 4) (omitted if the address bias is non-zero)
@@ -532,13 +545,13 @@ pub fn build_page_tables_64(
 
         // Set PML4E entry linking PML4E to PDPTE.
         let output_address = page_table_gpa + pdpte_table_index as u64 * X64_PAGE_SIZE;
-        pml4e_table[0].set_entry(PageTableEntryType::Pde(output_address));
+        pml4e_table[0].set_entry(PageTableEntryType::Pde(output_address), read_only);
 
         // Set PML4E entry to link the additional entry if specified.
         if let Some((link_target_gpa, linkage_gpa)) = pml4e_link {
             assert!((linkage_gpa & 0x7FFFFFFFFF) == 0);
             pml4e_table[linkage_gpa as usize >> 39]
-                .set_entry(PageTableEntryType::Pde(link_target_gpa));
+                .set_entry(PageTableEntryType::Pde(link_target_gpa), read_only);
         }
 
         pdpte_table
@@ -568,11 +581,14 @@ pub fn build_page_tables_64(
             let output_address = page_table_gpa + pde_table_index as u64 * X64_PAGE_SIZE;
             let pdpte_entry = &mut pdpte_table[pdpte_index as usize];
             assert!(!pdpte_entry.is_present());
-            pdpte_entry.set_entry(PageTableEntryType::Pde(output_address));
+            pdpte_entry.set_entry(PageTableEntryType::Pde(output_address), read_only);
 
             // Set all 2MB entries in this PDE table.
             for entry in pde_table.iter_mut() {
-                entry.set_entry(PageTableEntryType::Leaf2MbPage(current_va + address_bias));
+                entry.set_entry(
+                    PageTableEntryType::Leaf2MbPage(current_va + address_bias),
+                    read_only,
+                );
                 current_va += X64_LARGE_PAGE_SIZE;
             }
         }
diff --git a/vm/loader/src/linux.rs b/vm/loader/src/linux.rs
index 43b0879f43..89de787440 100644
--- a/vm/loader/src/linux.rs
+++ b/vm/loader/src/linux.rs
@@ -336,6 +336,7 @@ pub fn load_config(
         0,
         IdentityMapSize::Size4Gb,
         None,
+        false,
     );
     assert!(page_table.len() as u64 % HV_PAGE_SIZE == 0);
     importer
diff --git a/vm/loader/src/paravisor.rs b/vm/loader/src/paravisor.rs
index 9a3b861eba..467a25c1f0 100644
--- a/vm/loader/src/paravisor.rs
+++ b/vm/loader/src/paravisor.rs
@@ -359,32 +359,10 @@ where
         }
         _ => None,
     };
-
-    // HACK: On TDX, the kernel uses the ACPI AP Mailbox protocol to start APs.
-    // However, the kernel assumes that all kernel ram is identity mapped, as
-    // the kernel will jump to a startup routine in any arbitrary kernel ram
-    // range.
-    //
-    // For now, describe 3GB of memory identity mapped in the page table used by
-    // the mailbox assembly stub, so the kernel can start APs regardless of how
-    // large the initial memory size was. An upcoming change will instead have
-    // the bootshim modify the pagetable at runtime to guarantee all ranges
-    // reported in the E820 map to kernel as ram are mapped.
-    //
-    // FUTURE: A future kernel change could remove this requirement entirely by
-    // making the kernel spec compliant, and only require that the reset vector
-    // page is identity mapped.
-
-    let page_table_mapping_size = if isolation_type == IsolationType::Tdx {
-        3 * 1024 * 1024 * 1024
-    } else {
-        memory_size
-    };
-
     let page_table_base_page_count = 5;
     let page_table_dynamic_page_count = {
         // Double the count to allow for simpler reconstruction.
-        calculate_pde_table_count(memory_start_address, page_table_mapping_size) * 2
+        calculate_pde_table_count(memory_start_address, memory_size) * 2
             + local_map.map_or(0, |v| calculate_pde_table_count(v.0, v.1))
     };
     let page_table_isolation_page_count = match isolation_type {
@@ -405,7 +383,7 @@ where
     tracing::debug!(page_table_region_start, page_table_region_size);
 
     let mut page_table_builder = PageTableBuilder::new(page_table_region_start)
-        .with_mapped_region(memory_start_address, page_table_mapping_size);
+        .with_mapped_region(memory_start_address, memory_size);
 
     if let Some((local_map_start, size)) = local_map {
         page_table_builder = page_table_builder.with_local_map(local_map_start, size);
@@ -427,8 +405,6 @@ where
     let page_table_page_base = page_table_region_start / HV_PAGE_SIZE;
     assert!(page_table.len() as u64 <= page_table_region_size);
 
-    let offset = offset;
-
     if with_relocation {
         // Indicate relocation information. Don't include page table region.
         importer.relocation_region(
@@ -451,6 +427,55 @@ where
         )?;
     }
 
+    // TDX-isolated VMs require an AP page table to boot with the mailbox protocol.
+    //
+    // In the OpenHCL implementation of this protocol, we spin in the architectural reset
+    // vector until the kernel gives us a vector to jump to. The OpenHCL kernel can place
+    // this vector anywhere in the lower 4GB of GPA space, so we identity map the lower
+    // 4GB as R+X.
+    let ap_page_table_region_start = offset;
+    let (
+        ap_page_table,
+        ap_page_table_page_base,
+        ap_page_table_region_start,
+        ap_page_table_region_size,
+        ap_page_table_page_count,
+    ) = if isolation_type == IsolationType::Tdx {
+        let ap_page_table_size = 4 * 1024 * 1024 * 1024;
+
+        // TDX requires up to an extra 3 pages to map the reset vector as a
+        // 4K page.
+        let ap_page_table_page_count = calculate_pde_table_count(0, ap_page_table_size) + 3;
+
+        let ap_page_table_region_size = HV_PAGE_SIZE * ap_page_table_page_count;
+        offset += ap_page_table_region_size;
+
+        tracing::debug!(ap_page_table_region_start, ap_page_table_region_size);
+
+        let ap_page_table_builder = PageTableBuilder::new(ap_page_table_region_start)
+            .with_mapped_region(0, ap_page_table_size);
+
+        let ap_page_table = ap_page_table_builder
+            .with_read_only(true)
+            .with_reset_vector(true)
+            .build();
+
+        assert!(ap_page_table.len() as u64 % HV_PAGE_SIZE == 0);
+        let ap_page_table_page_base = ap_page_table_region_start / HV_PAGE_SIZE;
+        assert!(ap_page_table.len() as u64 <= ap_page_table_region_size);
+        (
+            Some(ap_page_table),
+            Some(ap_page_table_page_base),
+            Some(ap_page_table_region_start),
+            Some(ap_page_table_region_size),
+            Some(ap_page_table_page_count),
+        )
+    } else {
+        (None, None, None, None, None)
+    };
+
+    let offset = offset;
+
     // The memory used by the loader must be smaller than the memory available.
     if offset > memory_start_address + memory_size {
         return Err(Error::NotEnoughMemory(offset - memory_start_address));
     }
@@ -489,8 +514,8 @@ where
         used_end: calculate_shim_offset(offset),
         bounce_buffer_start: bounce_buffer.map_or(0, |r| calculate_shim_offset(r.start())),
         bounce_buffer_size: bounce_buffer.map_or(0, |r| r.len()),
-        page_tables_start: calculate_shim_offset(page_table_region_start),
-        page_tables_size: page_table_region_size,
+        ap_page_tables_start: ap_page_table_region_start.map_or(0, |t| calculate_shim_offset(t)),
+        ap_page_tables_size: ap_page_table_region_size.unwrap_or(0),
     };
 
     tracing::debug!(boot_params_base, "shim gpa");
@@ -513,6 +538,16 @@ where
         &page_table,
     )?;
 
+    if isolation_type == IsolationType::Tdx {
+        importer.import_pages(
+            ap_page_table_page_base.expect("AP page tables are required for TDX"),
+            ap_page_table_page_count.expect("AP page tables are required for TDX"),
+            "underhill-ap-page-tables",
+            BootPageAcceptance::Exclusive,
+            &ap_page_table.expect("AP page tables are required for TDX"),
+        )?;
+    }
+
     // Set selectors and control registers
     // Setup two selectors and segment registers.
     // ds, es, fs, gs, ss are linearSelector
@@ -1081,8 +1116,8 @@ where
         used_end: calculate_shim_offset(next_addr),
         bounce_buffer_start: 0,
         bounce_buffer_size: 0,
-        page_tables_start: 0,
-        page_tables_size: 0,
+        ap_page_tables_start: 0,
+        ap_page_tables_size: 0,
     };
 
     importer
diff --git a/vm/loader/src/uefi/mod.rs b/vm/loader/src/uefi/mod.rs
index 2f1bf4da00..ab4a584e67 100644
--- a/vm/loader/src/uefi/mod.rs
+++ b/vm/loader/src/uefi/mod.rs
@@ -426,6 +426,7 @@ pub mod x86_64 {
                 shared_gpa_boundary,
                 IdentityMapSize::Size4Gb,
                 None,
+                false,
             );
 
             let page_tables = build_page_tables_64(
@@ -433,12 +434,18 @@ pub mod x86_64 {
                 0,
                 IdentityMapSize::Size4Gb,
                 Some((shared_vis_page_table_gpa, shared_gpa_boundary)),
+                false,
             );
             (page_tables, Some(shared_vis_page_tables))
         } else {
-            let page_tables =
-                build_page_tables_64(PAGE_TABLE_GPA_BASE, 0, IdentityMapSize::Size4Gb, None);
+            let page_tables = build_page_tables_64(
+                PAGE_TABLE_GPA_BASE,
+                0,
+                IdentityMapSize::Size4Gb,
+                None,
+                false,
+            );
             (page_tables, None)
         };
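
A quick sanity check on the two pieces of arithmetic this patch relies on: the sketch below reproduces the PTE flag selection from set_entry and the page-count math behind ap_page_table_page_count. It is an illustrative, self-contained model, not code from the patch: the constants mirror the architectural x64 PTE bit positions (present = bit 0, read/write = bit 1, accessed = bit 5), base_pte_flags and pd_count are hypothetical helper names, and reading the "+ 3" as PML4 + PDPT + one 4K page table for the reset-vector page is an inference from the comments in the diff.

// Illustrative sketch only; not part of the patch.
const X64_PTE_PRESENT: u64 = 1 << 0;
const X64_PTE_READ_WRITE: u64 = 1 << 1;
const X64_PTE_ACCESSED: u64 = 1 << 5;

/// Base flags chosen by set_entry: always present and accessed, writable
/// only when the tables are not built read-only. No NX bit is set, so a
/// read-only build yields the R+X mapping the AP page table comment describes.
fn base_pte_flags(read_only: bool) -> u64 {
    if read_only {
        X64_PTE_PRESENT | X64_PTE_ACCESSED
    } else {
        X64_PTE_PRESENT | X64_PTE_ACCESSED | X64_PTE_READ_WRITE
    }
}

/// Page directories needed to identity map `size` bytes from GPA 0 with
/// 2MB leaves; each PD covers 1GB, so the 4GB AP mapping needs 4, plus
/// (per the diff) 3 more pages, plausibly the PML4, the PDPT, and the 4K
/// page table that remaps the reset-vector page.
fn pd_count(size: u64) -> u64 {
    size.div_ceil(1 << 30)
}

fn main() {
    assert_eq!(base_pte_flags(false), 0b10_0011);
    assert_eq!(base_pte_flags(true), 0b10_0001);
    assert_eq!(pd_count(4 * 1024 * 1024 * 1024), 4);
}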
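
For the boot-shim side of the handoff, here is a similarly hedged model of what tdx_prepare_ap_trampoline now does with the CR3 it receives. TrampolineContext is a reduced stand-in for the real TdxTrampolineContext at the reset vector page (which has more fields than shown), the CR0/CR4 constants are the architectural bit positions, and the sample CR3 value in main is arbitrary.

// Simplified model of the trampoline patching; field set reduced to what
// the diff touches.
#[derive(Default)]
struct TrampolineContext {
    code_selector: u16,
    task_selector: u16,
    cr0: u64,
    cr3: u64,
    cr4: u64,
}

const X64_CR0_PE: u64 = 1 << 0; // protected mode enable
const X64_CR0_NE: u64 = 1 << 5; // numeric error
const X64_CR0_PG: u64 = 1 << 31; // paging
const X64_CR4_PAE: u64 = 1 << 5; // physical address extension
const X64_CR4_MCE: u64 = 1 << 6; // machine check enable

/// Zero the selectors so the trampoline skips LGDT, enable protected mode
/// and paging, and point CR3 at the read-only AP page tables whose start
/// address the boot shim now passes in.
fn prepare_ap_trampoline(ctx: &mut TrampolineContext, cr3: u64) {
    ctx.code_selector = 0;
    ctx.task_selector = 0;
    ctx.cr0 |= X64_CR0_PG | X64_CR0_PE | X64_CR0_NE;
    ctx.cr3 = cr3;
    ctx.cr4 |= X64_CR4_PAE | X64_CR4_MCE;
}

fn main() {
    let mut ctx = TrampolineContext::default();
    prepare_ap_trampoline(&mut ctx, 0x20_0000); // hypothetical AP page table base
    assert_eq!(ctx.cr3, 0x20_0000);
    assert!(ctx.cr0 & X64_CR0_PG != 0 && ctx.cr4 & X64_CR4_PAE != 0);
}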