diff --git a/compiler/rustc_codegen_llvm/src/back/lto.rs b/compiler/rustc_codegen_llvm/src/back/lto.rs
index 655e1c9537376..84302009da999 100644
--- a/compiler/rustc_codegen_llvm/src/back/lto.rs
+++ b/compiler/rustc_codegen_llvm/src/back/lto.rs
@@ -654,6 +654,7 @@ pub(crate) fn run_pass_manager(
     // We then run the llvm_optimize function a second time, to optimize the code which we generated
     // in the enzyme differentiation pass.
     let enable_ad = config.autodiff.contains(&config::AutoDiff::Enable);
+    let enable_gpu = config.offload.contains(&config::Offload::Enable);
     let stage = if thin {
         write::AutodiffStage::PreAD
     } else {
@@ -668,6 +669,12 @@ pub(crate) fn run_pass_manager(
         write::llvm_optimize(cgcx, dcx, module, None, config, opt_level, opt_stage, stage)?;
     }
 
+    if enable_gpu && !thin {
+        let cx =
+            SimpleCx::new(module.module_llvm.llmod(), &module.module_llvm.llcx, cgcx.pointer_size);
+        crate::builder::gpu_offload::handle_gpu_code(cgcx, &cx);
+    }
+
     if cfg!(llvm_enzyme) && enable_ad && !thin {
         let cx =
             SimpleCx::new(module.module_llvm.llmod(), &module.module_llvm.llcx, cgcx.pointer_size);
diff --git a/compiler/rustc_codegen_llvm/src/builder.rs b/compiler/rustc_codegen_llvm/src/builder.rs
index 514923ad6f37f..0ade9edb0d2ea 100644
--- a/compiler/rustc_codegen_llvm/src/builder.rs
+++ b/compiler/rustc_codegen_llvm/src/builder.rs
@@ -3,6 +3,7 @@ use std::ops::Deref;
 use std::{iter, ptr};
 
 pub(crate) mod autodiff;
+pub(crate) mod gpu_offload;
 
 use libc::{c_char, c_uint, size_t};
 use rustc_abi as abi;
@@ -117,6 +118,74 @@ impl<'a, 'll, CX: Borrow<SCx<'ll>>> GenericBuilder<'a, 'll, CX> {
         }
         bx
     }
+
+    // The generic builder has less functionality and thus (unlike the other alloca) we can not
+    // easily jump to the beginning of the function to place our allocas there. We trust the user
+    // to manually do that. FIXME(offload): improve the genericCx and add more llvm wrappers to
+    // handle this.
+    pub(crate) fn direct_alloca(&mut self, ty: &'ll Type, align: Align, name: &str) -> &'ll Value {
+        let val = unsafe {
+            let alloca = llvm::LLVMBuildAlloca(self.llbuilder, ty, UNNAMED);
+            llvm::LLVMSetAlignment(alloca, align.bytes() as c_uint);
+            // Cast to default addrspace if necessary
+            llvm::LLVMBuildPointerCast(self.llbuilder, alloca, self.cx.type_ptr(), UNNAMED)
+        };
+        if name != "" {
+            let name = std::ffi::CString::new(name).unwrap();
+            llvm::set_value_name(val, &name.as_bytes());
+        }
+        val
+    }
+
+    pub(crate) fn inbounds_gep(
+        &mut self,
+        ty: &'ll Type,
+        ptr: &'ll Value,
+        indices: &[&'ll Value],
+    ) -> &'ll Value {
+        unsafe {
+            llvm::LLVMBuildGEPWithNoWrapFlags(
+                self.llbuilder,
+                ty,
+                ptr,
+                indices.as_ptr(),
+                indices.len() as c_uint,
+                UNNAMED,
+                GEPNoWrapFlags::InBounds,
+            )
+        }
+    }
+
+    pub(crate) fn store(&mut self, val: &'ll Value, ptr: &'ll Value, align: Align) -> &'ll Value {
+        debug!("Store {:?} -> {:?}", val, ptr);
+        assert_eq!(self.cx.type_kind(self.cx.val_ty(ptr)), TypeKind::Pointer);
+        unsafe {
+            let store = llvm::LLVMBuildStore(self.llbuilder, val, ptr);
+            llvm::LLVMSetAlignment(store, align.bytes() as c_uint);
+            store
+        }
+    }
+
+    pub(crate) fn load(&mut self, ty: &'ll Type, ptr: &'ll Value, align: Align) -> &'ll Value {
+        unsafe {
+            let load = llvm::LLVMBuildLoad2(self.llbuilder, ty, ptr, UNNAMED);
+            llvm::LLVMSetAlignment(load, align.bytes() as c_uint);
+            load
+        }
+    }
+
+    fn memset(&mut self, ptr: &'ll Value, fill_byte: &'ll Value, size: &'ll Value, align: Align) {
+        unsafe {
+            llvm::LLVMRustBuildMemSet(
+                self.llbuilder,
+                ptr,
+                align.bytes() as c_uint,
+                fill_byte,
+                size,
+                false,
+            );
+        }
+    }
 }
 
 /// Empty string, to be used where LLVM expects an instruction name, indicating
diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
new file mode 100644
index 0000000000000..1280ab1442a09
--- /dev/null
+++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
@@ -0,0 +1,439 @@
+use std::ffi::CString;
+
+use llvm::Linkage::*;
+use rustc_abi::Align;
+use rustc_codegen_ssa::back::write::CodegenContext;
+use rustc_codegen_ssa::traits::BaseTypeCodegenMethods;
+
+use crate::builder::SBuilder;
+use crate::common::AsCCharPtr;
+use crate::llvm::AttributePlace::Function;
+use crate::llvm::{self, Linkage, Type, Value};
+use crate::{LlvmCodegenBackend, SimpleCx, attributes};
+
+pub(crate) fn handle_gpu_code<'ll>(
+    _cgcx: &CodegenContext<LlvmCodegenBackend>,
+    cx: &'ll SimpleCx<'_>,
+) {
+    // The offload memory transfer type for each kernel
+    let mut o_types = vec![];
+    let mut kernels = vec![];
+    let offload_entry_ty = add_tgt_offload_entry(&cx);
+    for num in 0..9 {
+        let kernel = cx.get_function(&format!("kernel_{num}"));
+        if let Some(kernel) = kernel {
+            o_types.push(gen_define_handling(&cx, kernel, offload_entry_ty, num));
+            kernels.push(kernel);
+        }
+    }
+
+    gen_call_handling(&cx, &kernels, &o_types);
+}
+
+// What is our @1 here? A magic global, used in our data_{begin/update/end}_mapper:
+// @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+// @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8
+fn generate_at_one<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Value {
+    // @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+    let unknown_txt = ";unknown;unknown;0;0;;";
+    let c_entry_name = CString::new(unknown_txt).unwrap();
+    let c_val = c_entry_name.as_bytes_with_nul();
+    let initializer = crate::common::bytes_in_context(cx.llcx, c_val);
+    let at_zero = add_unnamed_global(&cx, &"", initializer, PrivateLinkage);
+    llvm::set_alignment(at_zero, Align::ONE);
+
+    // @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8
+    let struct_ident_ty = cx.type_named_struct("struct.ident_t");
+    let struct_elems = vec![
+        cx.get_const_i32(0),
+        cx.get_const_i32(2),
+        cx.get_const_i32(0),
+        cx.get_const_i32(22),
+        at_zero,
+    ];
+    let struct_elems_ty: Vec<_> = struct_elems.iter().map(|&x| cx.val_ty(x)).collect();
+    let initializer = crate::common::named_struct(struct_ident_ty, &struct_elems);
+    cx.set_struct_body(struct_ident_ty, &struct_elems_ty, false);
+    let at_one = add_unnamed_global(&cx, &"", initializer, PrivateLinkage);
+    llvm::set_alignment(at_one, Align::EIGHT);
+    at_one
+}
+
+pub(crate) fn add_tgt_offload_entry<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Type {
+    let offload_entry_ty = cx.type_named_struct("struct.__tgt_offload_entry");
+    let tptr = cx.type_ptr();
+    let ti64 = cx.type_i64();
+    let ti32 = cx.type_i32();
+    let ti16 = cx.type_i16();
+    // For each kernel to run on the gpu, we will later generate one entry of this type.
+    // copied from LLVM
+    // typedef struct {
+    //   uint64_t Reserved;
+    //   uint16_t Version;
+    //   uint16_t Kind;
+    //   uint32_t Flags; Flags associated with the entry (see Target Region Entry Flags)
+    //   void *Address; Address of global symbol within device image (function or global)
+    //   char *SymbolName;
+    //   uint64_t Size; Size of the entry info (0 if it is a function)
+    //   uint64_t Data;
+    //   void *AuxAddr;
+    // } __tgt_offload_entry;
+    let entry_elements = vec![ti64, ti16, ti16, ti32, tptr, tptr, ti64, ti64, tptr];
+    cx.set_struct_body(offload_entry_ty, &entry_elements, false);
+    offload_entry_ty
+}
+
+fn gen_tgt_kernel_global<'ll>(cx: &'ll SimpleCx<'_>) {
+    let kernel_arguments_ty = cx.type_named_struct("struct.__tgt_kernel_arguments");
+    let tptr = cx.type_ptr();
+    let ti64 = cx.type_i64();
+    let ti32 = cx.type_i32();
+    let tarr = cx.type_array(ti32, 3);
+
+    // Taken from the LLVM APITypes.h declaration:
+    //struct KernelArgsTy {
+    //  uint32_t Version = 0; // Version of this struct for ABI compatibility.
+    //  uint32_t NumArgs = 0; // Number of arguments in each input pointer.
+    //  void **ArgBasePtrs =
+    //      nullptr;                 // Base pointer of each argument (e.g. a struct).
+    //  void **ArgPtrs = nullptr;    // Pointer to the argument data.
+    //  int64_t *ArgSizes = nullptr; // Size of the argument data in bytes.
+    //  int64_t *ArgTypes = nullptr; // Type of the data (e.g. to / from).
+    //  void **ArgNames = nullptr;   // Name of the data for debugging, possibly null.
+    //  void **ArgMappers = nullptr; // User-defined mappers, possibly null.
+    //  uint64_t Tripcount =
+    //      0; // Tripcount for the teams / distribute loop, 0 otherwise.
+    //  struct {
+    //    uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
+    //    uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
+    //    uint64_t Unused : 62;
+    //  } Flags = {0, 0, 0};
+    //  // The number of teams (for x,y,z dimension).
+    //  uint32_t NumTeams[3] = {0, 0, 0};
+    //  // The number of threads (for x,y,z dimension).
+    //  uint32_t ThreadLimit[3] = {0, 0, 0};
+    //  uint32_t DynCGroupMem = 0; // Amount of dynamic cgroup memory requested.
+    //};
+    let kernel_elements =
+        vec![ti32, ti32, tptr, tptr, tptr, tptr, tptr, tptr, ti64, ti64, tarr, tarr, ti32];
+
+    cx.set_struct_body(kernel_arguments_ty, &kernel_elements, false);
+    // For now we don't handle kernels, so for now we just add a global dummy
+    // to make sure that the __tgt_offload_entry is defined and handled correctly.
+    cx.declare_global("my_struct_global2", kernel_arguments_ty);
+}
+
+fn gen_tgt_data_mappers<'ll>(
+    cx: &'ll SimpleCx<'_>,
+) -> (&'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Type) {
+    let tptr = cx.type_ptr();
+    let ti64 = cx.type_i64();
+    let ti32 = cx.type_i32();
+
+    let args = vec![tptr, ti64, ti32, tptr, tptr, tptr, tptr, tptr, tptr];
+    let mapper_fn_ty = cx.type_func(&args, cx.type_void());
+    let mapper_begin = "__tgt_target_data_begin_mapper";
+    let mapper_update = "__tgt_target_data_update_mapper";
+    let mapper_end = "__tgt_target_data_end_mapper";
+    let begin_mapper_decl = declare_offload_fn(&cx, mapper_begin, mapper_fn_ty);
+    let update_mapper_decl = declare_offload_fn(&cx, mapper_update, mapper_fn_ty);
+    let end_mapper_decl = declare_offload_fn(&cx, mapper_end, mapper_fn_ty);
+
+    let nounwind = llvm::AttributeKind::NoUnwind.create_attr(cx.llcx);
+    attributes::apply_to_llfn(begin_mapper_decl, Function, &[nounwind]);
+    attributes::apply_to_llfn(update_mapper_decl, Function, &[nounwind]);
+    attributes::apply_to_llfn(end_mapper_decl, Function, &[nounwind]);
+
+    (begin_mapper_decl, update_mapper_decl, end_mapper_decl, mapper_fn_ty)
+}
+
+fn add_priv_unnamed_arr<'ll>(cx: &SimpleCx<'ll>, name: &str, vals: &[u64]) -> &'ll llvm::Value {
+    let ti64 = cx.type_i64();
+    let mut size_val = Vec::with_capacity(vals.len());
+    for &val in vals {
+        size_val.push(cx.get_const_i64(val));
+    }
+    let initializer = cx.const_array(ti64, &size_val);
+    add_unnamed_global(cx, name, initializer, PrivateLinkage)
+}
+
+pub(crate) fn add_unnamed_global<'ll>(
+    cx: &SimpleCx<'ll>,
+    name: &str,
+    initializer: &'ll llvm::Value,
+    l: Linkage,
+) -> &'ll llvm::Value {
+    let llglobal = add_global(cx, name, initializer, l);
+    llvm::LLVMSetUnnamedAddress(llglobal, llvm::UnnamedAddr::Global);
+    llglobal
+}
+
+pub(crate) fn add_global<'ll>(
+    cx: &SimpleCx<'ll>,
+    name: &str,
+    initializer: &'ll llvm::Value,
+    l: Linkage,
+) -> &'ll llvm::Value {
+    let c_name = CString::new(name).unwrap();
+    let llglobal: &'ll llvm::Value = llvm::add_global(cx.llmod, cx.val_ty(initializer), &c_name);
+    llvm::set_global_constant(llglobal, true);
+    llvm::set_linkage(llglobal, l);
+    llvm::set_initializer(llglobal, initializer);
+    llglobal
+}
+
+fn gen_define_handling<'ll>(
+    cx: &'ll SimpleCx<'_>,
+    kernel: &'ll llvm::Value,
+    offload_entry_ty: &'ll llvm::Type,
+    num: i64,
+) -> &'ll llvm::Value {
+    let types = cx.func_params_types(cx.get_type_of_global(kernel));
+    // It seems like non-pointer values are automatically mapped. So here, we focus on pointer (or
+    // reference) types.
+    let num_ptr_types = types
+        .iter()
+        .map(|&x| matches!(cx.type_kind(x), rustc_codegen_ssa::common::TypeKind::Pointer))
+        .count();
+
+    // We do not know their size anymore at this level, so hardcode a placeholder.
+    // A follow-up pr will track these from the frontend, where we still have Rust types.
+    // Then, we will be able to figure out that e.g. `&[f32;256]` will result in 4*256 bytes.
+    // I decided that 1024 bytes is a great placeholder value for now.
+    add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{num}"), &vec![1024; num_ptr_types]);
+    // Here we figure out whether something needs to be copied to the gpu (=1), from the gpu (=2),
+    // or both to and from the gpu (=3). Other values shouldn't affect us for now.
+    // A non-mutable reference or pointer will be 1, an array that's not read, but fully overwritten
+    // will be 2. For now, everything is 3, until we have our frontend set up.
+    let o_types =
+        add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{num}"), &vec![3; num_ptr_types]);
+    // Next: For each function, generate these three entries. A weak constant,
+    // the llvm.rodata entry name, and  the omp_offloading_entries value
+
+    let name = format!(".kernel_{num}.region_id");
+    let initializer = cx.get_const_i8(0);
+    let region_id = add_unnamed_global(&cx, &name, initializer, WeakAnyLinkage);
+
+    let c_entry_name = CString::new(format!("kernel_{num}")).unwrap();
+    let c_val = c_entry_name.as_bytes_with_nul();
+    let offload_entry_name = format!(".offloading.entry_name.{num}");
+
+    let initializer = crate::common::bytes_in_context(cx.llcx, c_val);
+    let llglobal = add_unnamed_global(&cx, &offload_entry_name, initializer, InternalLinkage);
+    llvm::set_alignment(llglobal, Align::ONE);
+    llvm::set_section(llglobal, c".llvm.rodata.offloading");
+
+    // Not actively used yet, for calling real kernels
+    let name = format!(".offloading.entry.kernel_{num}");
+
+    // See the __tgt_offload_entry documentation above.
+    let reserved = cx.get_const_i64(0);
+    let version = cx.get_const_i16(1);
+    let kind = cx.get_const_i16(1);
+    let flags = cx.get_const_i32(0);
+    let size = cx.get_const_i64(0);
+    let data = cx.get_const_i64(0);
+    let aux_addr = cx.const_null(cx.type_ptr());
+    let elems = vec![reserved, version, kind, flags, region_id, llglobal, size, data, aux_addr];
+
+    let initializer = crate::common::named_struct(offload_entry_ty, &elems);
+    let c_name = CString::new(name).unwrap();
+    let llglobal = llvm::add_global(cx.llmod, offload_entry_ty, &c_name);
+    llvm::set_global_constant(llglobal, true);
+    llvm::set_linkage(llglobal, WeakAnyLinkage);
+    llvm::set_initializer(llglobal, initializer);
+    llvm::set_alignment(llglobal, Align::ONE);
+    let c_section_name = CString::new(".omp_offloading_entries").unwrap();
+    llvm::set_section(llglobal, &c_section_name);
+    o_types
+}
+
+fn declare_offload_fn<'ll>(
+    cx: &'ll SimpleCx<'_>,
+    name: &str,
+    ty: &'ll llvm::Type,
+) -> &'ll llvm::Value {
+    crate::declare::declare_simple_fn(
+        cx,
+        name,
+        llvm::CallConv::CCallConv,
+        llvm::UnnamedAddr::No,
+        llvm::Visibility::Default,
+        ty,
+    )
+}
+
+// For each kernel *call*, we now use some of our previous declared globals to move data to and from
+// the gpu. We don't have a proper frontend yet, so we assume that every call to a kernel function
+// from main is intended to run on the GPU. For now, we only handle the data transfer part of it.
+// If two consecutive kernels use the same memory, we still move it to the host and back to the gpu.
+// Since in our frontend users (by default) don't have to specify data transfer, this is something
+// we should optimize in the future! We also assume that everything should be copied back and forth,
+// but sometimes we can directly zero-allocate on the device and only move back, or if something is
+// immutable, we might only copy it to the device, but not back.
+//
+// Current steps:
+// 0. Alloca some variables for the following steps
+// 1. set insert point before kernel call.
+// 2. generate all the GEPS and stores, to be used in 3)
+// 3. generate __tgt_target_data_begin calls to move data to the GPU
+//
+// unchanged: keep kernel call. Later move the kernel to the GPU
+//
+// 4. set insert point after kernel call.
+// 5. generate all the GEPS and stores, to be used in 6)
+// 6. generate __tgt_target_data_end calls to move data from the GPU
+fn gen_call_handling<'ll>(
+    cx: &'ll SimpleCx<'_>,
+    _kernels: &[&'ll llvm::Value],
+    o_types: &[&'ll llvm::Value],
+) {
+    // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
+    let tptr = cx.type_ptr();
+    let ti32 = cx.type_i32();
+    let tgt_bin_desc_ty = vec![ti32, tptr, tptr, tptr];
+    let tgt_bin_desc = cx.type_named_struct("struct.__tgt_bin_desc");
+    cx.set_struct_body(tgt_bin_desc, &tgt_bin_desc_ty, false);
+
+    gen_tgt_kernel_global(&cx);
+    let (begin_mapper_decl, _, end_mapper_decl, fn_ty) = gen_tgt_data_mappers(&cx);
+
+    let main_fn = cx.get_function("main");
+    let Some(main_fn) = main_fn else { return };
+    let kernel_name = "kernel_1";
+    let call = unsafe {
+        llvm::LLVMRustGetFunctionCall(main_fn, kernel_name.as_c_char_ptr(), kernel_name.len())
+    };
+    let Some(kernel_call) = call else {
+        return;
+    };
+    let kernel_call_bb = unsafe { llvm::LLVMGetInstructionParent(kernel_call) };
+    let called = unsafe { llvm::LLVMGetCalledValue(kernel_call).unwrap() };
+    let mut builder = SBuilder::build(cx, kernel_call_bb);
+
+    let types = cx.func_params_types(cx.get_type_of_global(called));
+    let num_args = types.len() as u64;
+
+    // Step 0)
+    // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
+    // %6 = alloca %struct.__tgt_bin_desc, align 8
+    unsafe { llvm::LLVMRustPositionBuilderPastAllocas(builder.llbuilder, main_fn) };
+
+    let tgt_bin_desc_alloca = builder.direct_alloca(tgt_bin_desc, Align::EIGHT, "EmptyDesc");
+
+    let ty = cx.type_array(cx.type_ptr(), num_args);
+    // Baseptr are just the input pointer to the kernel, stored in a local alloca
+    let a1 = builder.direct_alloca(ty, Align::EIGHT, ".offload_baseptrs");
+    // Ptrs are the result of a gep into the baseptr, at least for our trivial types.
+    let a2 = builder.direct_alloca(ty, Align::EIGHT, ".offload_ptrs");
+    // These represent the sizes in bytes, e.g. the entry for `&[f64; 16]` will be 8*16.
+    let ty2 = cx.type_array(cx.type_i64(), num_args);
+    let a4 = builder.direct_alloca(ty2, Align::EIGHT, ".offload_sizes");
+    // Now we allocate once per function param, a copy to be passed to one of our maps.
+    let mut vals = vec![];
+    let mut geps = vec![];
+    let i32_0 = cx.get_const_i32(0);
+    for (index, in_ty) in types.iter().enumerate() {
+        // get function arg, store it into the alloca, and read it.
+        let p = llvm::get_param(called, index as u32);
+        let name = llvm::get_value_name(p);
+        let name = str::from_utf8(&name).unwrap();
+        let arg_name = format!("{name}.addr");
+        let alloca = builder.direct_alloca(in_ty, Align::EIGHT, &arg_name);
+
+        builder.store(p, alloca, Align::EIGHT);
+        let val = builder.load(in_ty, alloca, Align::EIGHT);
+        let gep = builder.inbounds_gep(cx.type_f32(), val, &[i32_0]);
+        vals.push(val);
+        geps.push(gep);
+    }
+
+    // Step 1)
+    unsafe { llvm::LLVMRustPositionBefore(builder.llbuilder, kernel_call) };
+    builder.memset(tgt_bin_desc_alloca, cx.get_const_i8(0), cx.get_const_i64(32), Align::EIGHT);
+
+    let mapper_fn_ty = cx.type_func(&[cx.type_ptr()], cx.type_void());
+    let register_lib_decl = declare_offload_fn(&cx, "__tgt_register_lib", mapper_fn_ty);
+    let unregister_lib_decl = declare_offload_fn(&cx, "__tgt_unregister_lib", mapper_fn_ty);
+    let init_ty = cx.type_func(&[], cx.type_void());
+    let init_rtls_decl = declare_offload_fn(cx, "__tgt_init_all_rtls", init_ty);
+
+    // call void @__tgt_register_lib(ptr noundef %6)
+    builder.call(mapper_fn_ty, register_lib_decl, &[tgt_bin_desc_alloca], None);
+    // call void @__tgt_init_all_rtls()
+    builder.call(init_ty, init_rtls_decl, &[], None);
+
+    for i in 0..num_args {
+        let idx = cx.get_const_i32(i);
+        let gep1 = builder.inbounds_gep(ty, a1, &[i32_0, idx]);
+        builder.store(vals[i as usize], gep1, Align::EIGHT);
+        let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]);
+        builder.store(geps[i as usize], gep2, Align::EIGHT);
+        let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
+        // As mentioned above, we don't use Rust type information yet. So for now we will just
+        // assume that we have 1024 bytes, 256 f32 values.
+        // FIXME(offload): write an offload frontend and handle arbitrary types.
+        builder.store(cx.get_const_i64(1024), gep3, Align::EIGHT);
+    }
+
+    // For now we have a very simplistic indexing scheme into our
+    // offload_{baseptrs,ptrs,sizes}. We will probably improve this along with our gpu frontend pr.
+    fn get_geps<'a, 'll>(
+        builder: &mut SBuilder<'a, 'll>,
+        cx: &'ll SimpleCx<'ll>,
+        ty: &'ll Type,
+        ty2: &'ll Type,
+        a1: &'ll Value,
+        a2: &'ll Value,
+        a4: &'ll Value,
+    ) -> (&'ll Value, &'ll Value, &'ll Value) {
+        let i32_0 = cx.get_const_i32(0);
+
+        let gep1 = builder.inbounds_gep(ty, a1, &[i32_0, i32_0]);
+        let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, i32_0]);
+        let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, i32_0]);
+        (gep1, gep2, gep3)
+    }
+
+    fn generate_mapper_call<'a, 'll>(
+        builder: &mut SBuilder<'a, 'll>,
+        cx: &'ll SimpleCx<'ll>,
+        geps: (&'ll Value, &'ll Value, &'ll Value),
+        o_type: &'ll Value,
+        fn_to_call: &'ll Value,
+        fn_ty: &'ll Type,
+        num_args: u64,
+        s_ident_t: &'ll Value,
+    ) {
+        let nullptr = cx.const_null(cx.type_ptr());
+        let i64_max = cx.get_const_i64(u64::MAX);
+        let num_args = cx.get_const_i32(num_args);
+        let args =
+            vec![s_ident_t, i64_max, num_args, geps.0, geps.1, geps.2, o_type, nullptr, nullptr];
+        builder.call(fn_ty, fn_to_call, &args, None);
+    }
+
+    // Step 2)
+    let s_ident_t = generate_at_one(&cx);
+    let o = o_types[0];
+    let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4);
+    generate_mapper_call(&mut builder, &cx, geps, o, begin_mapper_decl, fn_ty, num_args, s_ident_t);
+
+    // Step 3)
+    // Here we will add code for the actual kernel launches in a follow-up PR.
+    // FIXME(offload): launch kernels
+
+    // Step 4)
+    unsafe { llvm::LLVMRustPositionAfter(builder.llbuilder, kernel_call) };
+
+    let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4);
+    generate_mapper_call(&mut builder, &cx, geps, o, end_mapper_decl, fn_ty, num_args, s_ident_t);
+
+    builder.call(mapper_fn_ty, unregister_lib_decl, &[tgt_bin_desc_alloca], None);
+
+    // With this we generated the following begin and end mappers. We could easily generate the
+    // update mapper in an update.
+    // call void @__tgt_target_data_begin_mapper(ptr @1, i64 -1, i32 3, ptr %27, ptr %28, ptr %29, ptr @.offload_maptypes, ptr null, ptr null)
+    // call void @__tgt_target_data_update_mapper(ptr @1, i64 -1, i32 2, ptr %46, ptr %47, ptr %48, ptr @.offload_maptypes.1, ptr null, ptr null)
+    // call void @__tgt_target_data_end_mapper(ptr @1, i64 -1, i32 3, ptr %49, ptr %50, ptr %51, ptr @.offload_maptypes, ptr null, ptr null)
+}
diff --git a/compiler/rustc_codegen_llvm/src/common.rs b/compiler/rustc_codegen_llvm/src/common.rs
index f9ab96b578951..f29fefb66f0fe 100644
--- a/compiler/rustc_codegen_llvm/src/common.rs
+++ b/compiler/rustc_codegen_llvm/src/common.rs
@@ -118,6 +118,10 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
             r
         }
     }
+
+    pub(crate) fn const_null(&self, t: &'ll Type) -> &'ll Value {
+        unsafe { llvm::LLVMConstNull(t) }
+    }
 }
 
 impl<'ll, 'tcx> ConstCodegenMethods for CodegenCx<'ll, 'tcx> {
@@ -377,6 +381,11 @@ pub(crate) fn bytes_in_context<'ll>(llcx: &'ll llvm::Context, bytes: &[u8]) -> &
     }
 }
 
+pub(crate) fn named_struct<'ll>(ty: &'ll Type, elts: &[&'ll Value]) -> &'ll Value {
+    let len = c_uint::try_from(elts.len()).expect("LLVMConstStructInContext elements len overflow");
+    unsafe { llvm::LLVMConstNamedStruct(ty, elts.as_ptr(), len) }
+}
+
 fn struct_in_context<'ll>(
     llcx: &'ll llvm::Context,
     elts: &[&'ll Value],
diff --git a/compiler/rustc_codegen_llvm/src/context.rs b/compiler/rustc_codegen_llvm/src/context.rs
index 34bed2a1d2a3d..ee77774c68832 100644
--- a/compiler/rustc_codegen_llvm/src/context.rs
+++ b/compiler/rustc_codegen_llvm/src/context.rs
@@ -216,7 +216,7 @@ pub(crate) unsafe fn create_module<'ll>(
 
     // Ensure the data-layout values hardcoded remain the defaults.
     {
-        let tm = crate::back::write::create_informational_target_machine(tcx.sess, false);
+        let tm = crate::back::write::create_informational_target_machine(sess, false);
         unsafe {
             llvm::LLVMRustSetDataLayoutFromTargetMachine(llmod, tm.raw());
         }
@@ -685,6 +685,22 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
         unsafe { llvm::LLVMConstInt(ty, val, llvm::False) }
     }
 
+    pub(crate) fn get_const_i64(&self, n: u64) -> &'ll Value {
+        self.get_const_int(self.type_i64(), n)
+    }
+
+    pub(crate) fn get_const_i32(&self, n: u64) -> &'ll Value {
+        self.get_const_int(self.type_i32(), n)
+    }
+
+    pub(crate) fn get_const_i16(&self, n: u64) -> &'ll Value {
+        self.get_const_int(self.type_i16(), n)
+    }
+
+    pub(crate) fn get_const_i8(&self, n: u64) -> &'ll Value {
+        self.get_const_int(self.type_i8(), n)
+    }
+
     pub(crate) fn get_function(&self, name: &str) -> Option<&'ll Value> {
         let name = SmallCStr::new(name);
         unsafe { llvm::LLVMGetNamedFunction((**self).borrow().llmod, name.as_ptr()) }
diff --git a/compiler/rustc_codegen_llvm/src/declare.rs b/compiler/rustc_codegen_llvm/src/declare.rs
index eb75716d768bb..960a895a2031c 100644
--- a/compiler/rustc_codegen_llvm/src/declare.rs
+++ b/compiler/rustc_codegen_llvm/src/declare.rs
@@ -215,7 +215,9 @@ impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {
 
         llfn
     }
+}
 
+impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
     /// Declare a global with an intention to define it.
     ///
     /// Use this function when you intend to define a global. This function will
@@ -234,13 +236,13 @@ impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {
     ///
     /// Use this function when you intend to define a global without a name.
     pub(crate) fn define_private_global(&self, ty: &'ll Type) -> &'ll Value {
-        unsafe { llvm::LLVMRustInsertPrivateGlobal(self.llmod, ty) }
+        unsafe { llvm::LLVMRustInsertPrivateGlobal(self.llmod(), ty) }
     }
 
     /// Gets declared value by name.
     pub(crate) fn get_declared_value(&self, name: &str) -> Option<&'ll Value> {
         debug!("get_declared_value(name={:?})", name);
-        unsafe { llvm::LLVMRustGetNamedValue(self.llmod, name.as_c_char_ptr(), name.len()) }
+        unsafe { llvm::LLVMRustGetNamedValue(self.llmod(), name.as_c_char_ptr(), name.len()) }
     }
 
     /// Gets defined or externally defined (AvailableExternally linkage) value by
diff --git a/compiler/rustc_codegen_llvm/src/lib.rs b/compiler/rustc_codegen_llvm/src/lib.rs
index 6db4e122ad6e8..aaf21f9ada9a5 100644
--- a/compiler/rustc_codegen_llvm/src/lib.rs
+++ b/compiler/rustc_codegen_llvm/src/lib.rs
@@ -412,6 +412,20 @@ impl ModuleLlvm {
         }
     }
 
+    fn tm_from_cgcx(
+        cgcx: &CodegenContext<LlvmCodegenBackend>,
+        name: &str,
+        dcx: DiagCtxtHandle<'_>,
+    ) -> Result<OwnedTargetMachine, FatalError> {
+        let tm_factory_config = TargetMachineFactoryConfig::new(cgcx, name);
+        match (cgcx.tm_factory)(tm_factory_config) {
+            Ok(m) => Ok(m),
+            Err(e) => {
+                return Err(dcx.emit_almost_fatal(ParseTargetMachineConfig(e)));
+            }
+        }
+    }
+
     fn parse(
         cgcx: &CodegenContext<LlvmCodegenBackend>,
         name: &CStr,
@@ -421,13 +435,7 @@ impl ModuleLlvm {
         unsafe {
             let llcx = llvm::LLVMRustContextCreate(cgcx.fewer_names);
             let llmod_raw = back::lto::parse_module(llcx, name, buffer, dcx)?;
-            let tm_factory_config = TargetMachineFactoryConfig::new(cgcx, name.to_str().unwrap());
-            let tm = match (cgcx.tm_factory)(tm_factory_config) {
-                Ok(m) => m,
-                Err(e) => {
-                    return Err(dcx.emit_almost_fatal(ParseTargetMachineConfig(e)));
-                }
-            };
+            let tm = ModuleLlvm::tm_from_cgcx(cgcx, name.to_str().unwrap(), dcx)?;
 
             Ok(ModuleLlvm { llmod_raw, llcx, tm: ManuallyDrop::new(tm) })
         }
diff --git a/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs
index c696b8d8ff25f..56d756e52cce1 100644
--- a/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs
+++ b/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs
@@ -4,7 +4,7 @@ use libc::{c_char, c_uint};
 
 use super::MetadataKindId;
 use super::ffi::{AttributeKind, BasicBlock, Metadata, Module, Type, Value};
-use crate::llvm::Bool;
+use crate::llvm::{Bool, Builder};
 
 #[link(name = "llvm-wrapper", kind = "static")]
 unsafe extern "C" {
@@ -31,6 +31,14 @@ unsafe extern "C" {
         index: c_uint,
         kind: AttributeKind,
     );
+    pub(crate) fn LLVMRustPositionBefore<'a>(B: &'a Builder<'_>, I: &'a Value);
+    pub(crate) fn LLVMRustPositionAfter<'a>(B: &'a Builder<'_>, I: &'a Value);
+    pub(crate) fn LLVMRustGetFunctionCall(
+        F: &Value,
+        name: *const c_char,
+        NameLen: libc::size_t,
+    ) -> Option<&Value>;
+
 }
 
 unsafe extern "C" {
diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
index 80a0e5c5accc2..edfb29dd1be72 100644
--- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
+++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
@@ -1138,6 +1138,11 @@ unsafe extern "C" {
         Count: c_uint,
         Packed: Bool,
     ) -> &'a Value;
+    pub(crate) fn LLVMConstNamedStruct<'a>(
+        StructTy: &'a Type,
+        ConstantVals: *const &'a Value,
+        Count: c_uint,
+    ) -> &'a Value;
     pub(crate) fn LLVMConstVector(ScalarConstantVals: *const &Value, Size: c_uint) -> &Value;
 
     // Constant expressions
@@ -1217,6 +1222,8 @@ unsafe extern "C" {
     ) -> &'a BasicBlock;
 
     // Operations on instructions
+    pub(crate) fn LLVMGetInstructionParent(Inst: &Value) -> &BasicBlock;
+    pub(crate) fn LLVMGetCalledValue(CallInst: &Value) -> Option<&Value>;
     pub(crate) fn LLVMIsAInstruction(Val: &Value) -> Option<&Value>;
     pub(crate) fn LLVMGetFirstBasicBlock(Fn: &Value) -> &BasicBlock;
     pub(crate) fn LLVMGetOperand(Val: &Value, Index: c_uint) -> Option<&Value>;
@@ -2557,6 +2564,7 @@ unsafe extern "C" {
 
     pub(crate) fn LLVMRustSetDataLayoutFromTargetMachine<'a>(M: &'a Module, TM: &'a TargetMachine);
 
+    pub(crate) fn LLVMRustPositionBuilderPastAllocas<'a>(B: &Builder<'a>, Fn: &'a Value);
     pub(crate) fn LLVMRustPositionBuilderAtStart<'a>(B: &Builder<'a>, BB: &'a BasicBlock);
 
     pub(crate) fn LLVMRustSetModulePICLevel(M: &Module);
diff --git a/compiler/rustc_codegen_ssa/src/back/write.rs b/compiler/rustc_codegen_ssa/src/back/write.rs
index 50a7cba300b4b..24e0a4eb53331 100644
--- a/compiler/rustc_codegen_ssa/src/back/write.rs
+++ b/compiler/rustc_codegen_ssa/src/back/write.rs
@@ -120,6 +120,7 @@ pub struct ModuleConfig {
     pub emit_lifetime_markers: bool,
     pub llvm_plugins: Vec<String>,
     pub autodiff: Vec<config::AutoDiff>,
+    pub offload: Vec<config::Offload>,
 }
 
 impl ModuleConfig {
@@ -268,6 +269,7 @@ impl ModuleConfig {
             emit_lifetime_markers: sess.emit_lifetime_markers(),
             llvm_plugins: if_regular!(sess.opts.unstable_opts.llvm_plugins.clone(), vec![]),
             autodiff: if_regular!(sess.opts.unstable_opts.autodiff.clone(), vec![]),
+            offload: if_regular!(sess.opts.unstable_opts.offload.clone(), vec![]),
         }
     }
 
diff --git a/compiler/rustc_interface/src/tests.rs b/compiler/rustc_interface/src/tests.rs
index 360b5629e9d6e..8771bb4405049 100644
--- a/compiler/rustc_interface/src/tests.rs
+++ b/compiler/rustc_interface/src/tests.rs
@@ -13,10 +13,10 @@ use rustc_session::config::{
     CoverageOptions, DebugInfo, DumpMonoStatsFormat, ErrorOutputType, ExternEntry, ExternLocation,
     Externs, FmtDebug, FunctionReturn, InliningThreshold, Input, InstrumentCoverage,
     InstrumentXRay, LinkSelfContained, LinkerPluginLto, LocationDetail, LtoCli, MirIncludeSpans,
-    NextSolverConfig, OomStrategy, Options, OutFileName, OutputType, OutputTypes, PAuthKey, PacRet,
-    Passes, PatchableFunctionEntry, Polonius, ProcMacroExecutionStrategy, Strip, SwitchWithOptPath,
-    SymbolManglingVersion, WasiExecModel, build_configuration, build_session_options,
-    rustc_optgroups,
+    NextSolverConfig, Offload, OomStrategy, Options, OutFileName, OutputType, OutputTypes,
+    PAuthKey, PacRet, Passes, PatchableFunctionEntry, Polonius, ProcMacroExecutionStrategy, Strip,
+    SwitchWithOptPath, SymbolManglingVersion, WasiExecModel, build_configuration,
+    build_session_options, rustc_optgroups,
 };
 use rustc_session::lint::Level;
 use rustc_session::search_paths::SearchPath;
@@ -833,6 +833,7 @@ fn test_unstable_options_tracking_hash() {
     tracked!(no_profiler_runtime, true);
     tracked!(no_trait_vptr, true);
     tracked!(no_unique_section_names, true);
+    tracked!(offload, vec![Offload::Enable]);
     tracked!(on_broken_pipe, OnBrokenPipe::Kill);
     tracked!(oom, OomStrategy::Panic);
     tracked!(osx_rpath_install_name, true);
diff --git a/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp b/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp
index 90aa9188c8300..82568ed4ae177 100644
--- a/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp
+++ b/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp
@@ -1591,12 +1591,49 @@ extern "C" LLVMValueRef LLVMRustBuildMemSet(LLVMBuilderRef B, LLVMValueRef Dst,
                                       MaybeAlign(DstAlign), IsVolatile));
 }
 
+extern "C" void LLVMRustPositionBuilderPastAllocas(LLVMBuilderRef B,
+                                                   LLVMValueRef Fn) {
+  Function *F = unwrap<Function>(Fn);
+  unwrap(B)->SetInsertPointPastAllocas(F);
+}
 extern "C" void LLVMRustPositionBuilderAtStart(LLVMBuilderRef B,
                                                LLVMBasicBlockRef BB) {
   auto Point = unwrap(BB)->getFirstInsertionPt();
   unwrap(B)->SetInsertPoint(unwrap(BB), Point);
 }
 
+extern "C" void LLVMRustPositionBefore(LLVMBuilderRef B, LLVMValueRef Instr) {
+  if (auto I = dyn_cast<Instruction>(unwrap<Value>(Instr))) {
+    unwrap(B)->SetInsertPoint(I);
+  }
+}
+
+extern "C" void LLVMRustPositionAfter(LLVMBuilderRef B, LLVMValueRef Instr) {
+  if (auto I = dyn_cast<Instruction>(unwrap<Value>(Instr))) {
+    auto J = I->getNextNonDebugInstruction();
+    unwrap(B)->SetInsertPoint(J);
+  }
+}
+
+extern "C" LLVMValueRef
+LLVMRustGetFunctionCall(LLVMValueRef Fn, const char *Name, size_t NameLen) {
+  auto targetName = StringRef(Name, NameLen);
+  Function *F = unwrap<Function>(Fn);
+  for (auto &BB : *F) {
+    for (auto &I : BB) {
+      if (auto *callInst = llvm::dyn_cast<llvm::CallBase>(&I)) {
+        const llvm::Function *calledFunc = callInst->getCalledFunction();
+        if (calledFunc && calledFunc->getName() == targetName) {
+          // Found a call to the target function
+          return wrap(callInst);
+        }
+      }
+    }
+  }
+
+  return nullptr;
+}
+
 extern "C" bool LLVMRustConstIntGetZExtValue(LLVMValueRef CV, uint64_t *value) {
   auto C = unwrap<llvm::ConstantInt>(CV);
   if (C->getBitWidth() > 64)
diff --git a/compiler/rustc_middle/src/ty/inhabitedness/mod.rs b/compiler/rustc_middle/src/ty/inhabitedness/mod.rs
index 2a336cc21f492..7eb74b52b44c6 100644
--- a/compiler/rustc_middle/src/ty/inhabitedness/mod.rs
+++ b/compiler/rustc_middle/src/ty/inhabitedness/mod.rs
@@ -43,7 +43,6 @@
 //! This code should only compile in modules where the uninhabitedness of `Foo`
 //! is visible.
 
-use rustc_span::sym;
 use rustc_type_ir::TyKind::*;
 use tracing::instrument;
 
@@ -85,21 +84,6 @@ impl<'tcx> VariantDef {
         InhabitedPredicate::all(
             tcx,
             self.fields.iter().map(|field| {
-                // Unstable fields are always considered to be inhabited. In the future,
-                // this could be extended to be conditional on the field being unstable
-                // only within the module that's querying the inhabitedness, like:
-                //     `let pred = pred.or(InhabitedPredicate::IsUnstable(field.did));`
-                // but this is unnecessary for now, since it would only affect nightly-only
-                // code or code within the standard library itself.
-                // HACK: We filter out `rustc_private` fields since with the flag
-                // `-Zforce-unstable-if-unmarked` we consider all unmarked fields to be
-                // unstable when building the compiler.
-                if tcx
-                    .lookup_stability(field.did)
-                    .is_some_and(|stab| stab.is_unstable() && stab.feature != sym::rustc_private)
-                {
-                    return InhabitedPredicate::True;
-                }
                 let pred = tcx.type_of(field.did).instantiate_identity().inhabited_predicate(tcx);
                 if adt.is_enum() {
                     return pred;
diff --git a/compiler/rustc_mir_transform/src/elaborate_drops.rs b/compiler/rustc_mir_transform/src/elaborate_drops.rs
index b4fa2be1d00e7..58dff4514a04d 100644
--- a/compiler/rustc_mir_transform/src/elaborate_drops.rs
+++ b/compiler/rustc_mir_transform/src/elaborate_drops.rs
@@ -253,8 +253,8 @@ struct ElaborateDropsCtxt<'a, 'tcx> {
 }
 
 impl fmt::Debug for ElaborateDropsCtxt<'_, '_> {
-    fn fmt(&self, _f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        Ok(())
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("ElaborateDropsCtxt").finish_non_exhaustive()
     }
 }
 
diff --git a/compiler/rustc_mir_transform/src/shim.rs b/compiler/rustc_mir_transform/src/shim.rs
index cdbc74cdfa841..6c65b072bec08 100644
--- a/compiler/rustc_mir_transform/src/shim.rs
+++ b/compiler/rustc_mir_transform/src/shim.rs
@@ -434,8 +434,8 @@ pub(super) struct DropShimElaborator<'a, 'tcx> {
 }
 
 impl fmt::Debug for DropShimElaborator<'_, '_> {
-    fn fmt(&self, _f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
-        Ok(())
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
+        f.debug_struct("DropShimElaborator").finish_non_exhaustive()
     }
 }
 
diff --git a/compiler/rustc_pattern_analysis/src/rustc.rs b/compiler/rustc_pattern_analysis/src/rustc.rs
index ee72b676b38e1..0c1b0d622f229 100644
--- a/compiler/rustc_pattern_analysis/src/rustc.rs
+++ b/compiler/rustc_pattern_analysis/src/rustc.rs
@@ -16,7 +16,7 @@ use rustc_middle::ty::{
 };
 use rustc_middle::{bug, span_bug};
 use rustc_session::lint;
-use rustc_span::{DUMMY_SP, ErrorGuaranteed, Span, sym};
+use rustc_span::{DUMMY_SP, ErrorGuaranteed, Span};
 
 use crate::constructor::Constructor::*;
 use crate::constructor::{
@@ -238,10 +238,7 @@ impl<'p, 'tcx: 'p> RustcPatCtxt<'p, 'tcx> {
                         let is_visible =
                             adt.is_enum() || field.vis.is_accessible_from(cx.module, cx.tcx);
                         let is_uninhabited = cx.is_uninhabited(*ty);
-                        let is_unstable = cx.tcx.lookup_stability(field.did).is_some_and(|stab| {
-                            stab.is_unstable() && stab.feature != sym::rustc_private
-                        });
-                        let skip = is_uninhabited && (!is_visible || is_unstable);
+                        let skip = is_uninhabited && !is_visible;
                         (ty, PrivateUninhabitedField(skip))
                     });
                     cx.dropless_arena.alloc_from_iter(tys)
diff --git a/compiler/rustc_session/src/config.rs b/compiler/rustc_session/src/config.rs
index d6215e1de043a..7bea8685724ad 100644
--- a/compiler/rustc_session/src/config.rs
+++ b/compiler/rustc_session/src/config.rs
@@ -226,6 +226,13 @@ pub enum CoverageLevel {
     Mcdc,
 }
 
+// The different settings that the `-Z offload` flag can have.
+#[derive(Clone, Copy, PartialEq, Hash, Debug)]
+pub enum Offload {
+    /// Enable the llvm offload pipeline
+    Enable,
+}
+
 /// The different settings that the `-Z autodiff` flag can have.
 #[derive(Clone, PartialEq, Hash, Debug)]
 pub enum AutoDiff {
@@ -2706,6 +2713,15 @@ pub fn build_session_options(early_dcx: &mut EarlyDiagCtxt, matches: &getopts::M
         )
     }
 
+    if !nightly_options::is_unstable_enabled(matches)
+        && unstable_opts.offload.contains(&Offload::Enable)
+    {
+        early_dcx.early_fatal(
+            "`-Zoffload=Enable` also requires `-Zunstable-options` \
+                and a nightly compiler",
+        )
+    }
+
     let target_triple = parse_target_triple(early_dcx, matches);
 
     // Ensure `-Z unstable-options` is required when using the unstable `-C link-self-contained` and
@@ -3178,7 +3194,7 @@ pub(crate) mod dep_tracking {
         AutoDiff, BranchProtection, CFGuard, CFProtection, CollapseMacroDebuginfo, CoverageOptions,
         CrateType, DebugInfo, DebugInfoCompression, ErrorOutputType, FmtDebug, FunctionReturn,
         InliningThreshold, InstrumentCoverage, InstrumentXRay, LinkerPluginLto, LocationDetail,
-        LtoCli, MirStripDebugInfo, NextSolverConfig, OomStrategy, OptLevel, OutFileName,
+        LtoCli, MirStripDebugInfo, NextSolverConfig, Offload, OomStrategy, OptLevel, OutFileName,
         OutputType, OutputTypes, PatchableFunctionEntry, Polonius, RemapPathScopeComponents,
         ResolveDocLinks, SourceFileHashAlgorithm, SplitDwarfKind, SwitchWithOptPath,
         SymbolManglingVersion, WasiExecModel,
@@ -3225,6 +3241,7 @@ pub(crate) mod dep_tracking {
     impl_dep_tracking_hash_via_hash!(
         (),
         AutoDiff,
+        Offload,
         bool,
         usize,
         NonZero<usize>,
diff --git a/compiler/rustc_session/src/options.rs b/compiler/rustc_session/src/options.rs
index 2bdde2f887a30..b33e3815ea449 100644
--- a/compiler/rustc_session/src/options.rs
+++ b/compiler/rustc_session/src/options.rs
@@ -726,6 +726,7 @@ mod desc {
     pub(crate) const parse_list_with_polarity: &str =
         "a comma-separated list of strings, with elements beginning with + or -";
     pub(crate) const parse_autodiff: &str = "a comma separated list of settings: `Enable`, `PrintSteps`, `PrintTA`, `PrintTAFn`, `PrintAA`, `PrintPerf`, `PrintModBefore`, `PrintModAfter`, `PrintModFinal`, `PrintPasses`, `NoPostopt`, `LooseTypes`, `Inline`";
+    pub(crate) const parse_offload: &str = "a comma separated list of settings: `Enable`";
     pub(crate) const parse_comma_list: &str = "a comma-separated list of strings";
     pub(crate) const parse_opt_comma_list: &str = parse_comma_list;
     pub(crate) const parse_number: &str = "a number";
@@ -1357,6 +1358,27 @@ pub mod parse {
         }
     }
 
+    pub(crate) fn parse_offload(slot: &mut Vec<Offload>, v: Option<&str>) -> bool {
+        let Some(v) = v else {
+            *slot = vec![];
+            return true;
+        };
+        let mut v: Vec<&str> = v.split(",").collect();
+        v.sort_unstable();
+        for &val in v.iter() {
+            let variant = match val {
+                "Enable" => Offload::Enable,
+                _ => {
+                    // FIXME(ZuseZ4): print an error saying which value is not recognized
+                    return false;
+                }
+            };
+            slot.push(variant);
+        }
+
+        true
+    }
+
     pub(crate) fn parse_autodiff(slot: &mut Vec<AutoDiff>, v: Option<&str>) -> bool {
         let Some(v) = v else {
             *slot = vec![];
@@ -2401,6 +2423,11 @@ options! {
         "do not use unique names for text and data sections when -Z function-sections is used"),
     normalize_docs: bool = (false, parse_bool, [TRACKED],
         "normalize associated items in rustdoc when generating documentation"),
+    offload: Vec<crate::config::Offload> = (Vec::new(), parse_offload, [TRACKED],
+        "a list of offload flags to enable
+        Mandatory setting:
+        `=Enable`
+        Currently the only option available"),
     on_broken_pipe: OnBrokenPipe = (OnBrokenPipe::Default, parse_on_broken_pipe, [TRACKED],
         "behavior of std::io::ErrorKind::BrokenPipe (SIGPIPE)"),
     oom: OomStrategy = (OomStrategy::Abort, parse_oom_strategy, [TRACKED],
diff --git a/library/core/src/num/f32.rs b/library/core/src/num/f32.rs
index f8344da79ad40..6c7ba491971c5 100644
--- a/library/core/src/num/f32.rs
+++ b/library/core/src/num/f32.rs
@@ -1936,8 +1936,8 @@ pub mod math {
     /// let abs_difference_x = (f32::math::abs_sub(x, 1.0) - 2.0).abs();
     /// let abs_difference_y = (f32::math::abs_sub(y, 1.0) - 0.0).abs();
     ///
-    /// assert!(abs_difference_x <= f32::EPSILON);
-    /// assert!(abs_difference_y <= f32::EPSILON);
+    /// assert!(abs_difference_x <= 1e-6);
+    /// assert!(abs_difference_y <= 1e-6);
     /// ```
     ///
     /// _This standalone function is for testing only.
@@ -1982,7 +1982,7 @@ pub mod math {
     /// // x^(1/3) - 2 == 0
     /// let abs_difference = (f32::math::cbrt(x) - 2.0).abs();
     ///
-    /// assert!(abs_difference <= f32::EPSILON);
+    /// assert!(abs_difference <= 1e-6);
     /// ```
     ///
     /// _This standalone function is for testing only.
diff --git a/library/std/src/num/f32.rs b/library/std/src/num/f32.rs
index 2bff73add33d6..2c7752cc137f3 100644
--- a/library/std/src/num/f32.rs
+++ b/library/std/src/num/f32.rs
@@ -582,8 +582,8 @@ impl f32 {
     /// let abs_difference_x = (x.abs_sub(1.0) - 2.0).abs();
     /// let abs_difference_y = (y.abs_sub(1.0) - 0.0).abs();
     ///
-    /// assert!(abs_difference_x <= f32::EPSILON);
-    /// assert!(abs_difference_y <= f32::EPSILON);
+    /// assert!(abs_difference_x <= 1e-6);
+    /// assert!(abs_difference_y <= 1e-6);
     /// ```
     #[rustc_allow_incoherent_impl]
     #[must_use = "method returns a new number and does not mutate the original value"]
@@ -621,7 +621,7 @@ impl f32 {
     /// // x^(1/3) - 2 == 0
     /// let abs_difference = (x.cbrt() - 2.0).abs();
     ///
-    /// assert!(abs_difference <= f32::EPSILON);
+    /// assert!(abs_difference <= 1e-6);
     /// ```
     #[rustc_allow_incoherent_impl]
     #[must_use = "method returns a new number and does not mutate the original value"]
@@ -652,7 +652,7 @@ impl f32 {
     /// // sqrt(x^2 + y^2)
     /// let abs_difference = (x.hypot(y) - (x.powi(2) + y.powi(2)).sqrt()).abs();
     ///
-    /// assert!(abs_difference <= 1e-6);
+    /// assert!(abs_difference <= 1e-5);
     /// ```
     #[rustc_allow_incoherent_impl]
     #[must_use = "method returns a new number and does not mutate the original value"]
@@ -725,7 +725,7 @@ impl f32 {
     /// let x = std::f32::consts::FRAC_PI_4;
     /// let abs_difference = (x.tan() - 1.0).abs();
     ///
-    /// assert!(abs_difference <= f32::EPSILON);
+    /// assert!(abs_difference <= 1e-6);
     /// ```
     #[rustc_allow_incoherent_impl]
     #[must_use = "method returns a new number and does not mutate the original value"]
@@ -813,7 +813,7 @@ impl f32 {
     /// // atan(tan(1))
     /// let abs_difference = (f.tan().atan() - 1.0).abs();
     ///
-    /// assert!(abs_difference <= f32::EPSILON);
+    /// assert!(abs_difference <= 1e-6);
     /// ```
     #[doc(alias = "arctan")]
     #[rustc_allow_incoherent_impl]
@@ -854,8 +854,8 @@ impl f32 {
     /// let abs_difference_1 = (y1.atan2(x1) - (-std::f32::consts::FRAC_PI_4)).abs();
     /// let abs_difference_2 = (y2.atan2(x2) - (3.0 * std::f32::consts::FRAC_PI_4)).abs();
     ///
-    /// assert!(abs_difference_1 <= f32::EPSILON);
-    /// assert!(abs_difference_2 <= f32::EPSILON);
+    /// assert!(abs_difference_1 <= 1e-5);
+    /// assert!(abs_difference_2 <= 1e-5);
     /// ```
     #[rustc_allow_incoherent_impl]
     #[must_use = "method returns a new number and does not mutate the original value"]
@@ -982,7 +982,7 @@ impl f32 {
     /// let g = ((e * e) - 1.0) / (2.0 * e);
     /// let abs_difference = (f - g).abs();
     ///
-    /// assert!(abs_difference <= f32::EPSILON);
+    /// assert!(abs_difference <= 1e-6);
     /// ```
     #[rustc_allow_incoherent_impl]
     #[must_use = "method returns a new number and does not mutate the original value"]
@@ -1012,7 +1012,7 @@ impl f32 {
     /// let abs_difference = (f - g).abs();
     ///
     /// // Same result
-    /// assert!(abs_difference <= f32::EPSILON);
+    /// assert!(abs_difference <= 1e-6);
     /// ```
     #[rustc_allow_incoherent_impl]
     #[must_use = "method returns a new number and does not mutate the original value"]
@@ -1042,7 +1042,7 @@ impl f32 {
     /// let g = (1.0 - e.powi(-2)) / (1.0 + e.powi(-2));
     /// let abs_difference = (f - g).abs();
     ///
-    /// assert!(abs_difference <= f32::EPSILON);
+    /// assert!(abs_difference <= 1e-6);
     /// ```
     #[rustc_allow_incoherent_impl]
     #[must_use = "method returns a new number and does not mutate the original value"]
@@ -1095,7 +1095,7 @@ impl f32 {
     ///
     /// let abs_difference = (f - x).abs();
     ///
-    /// assert!(abs_difference <= 1e-6);
+    /// assert!(abs_difference <= 1e-5);
     /// ```
     #[doc(alias = "arccosh")]
     #[rustc_allow_incoherent_impl]
@@ -1125,7 +1125,7 @@ impl f32 {
     ///
     /// let abs_difference = (f - e).abs();
     ///
-    /// assert!(abs_difference <= 1e-5);
+    /// assert!(abs_difference <= 1e-4);
     /// ```
     #[doc(alias = "arctanh")]
     #[rustc_allow_incoherent_impl]
@@ -1153,7 +1153,7 @@ impl f32 {
     ///
     /// let abs_difference = (x.gamma() - 24.0).abs();
     ///
-    /// assert!(abs_difference <= f32::EPSILON);
+    /// assert!(abs_difference <= 1e-4);
     /// ```
     #[rustc_allow_incoherent_impl]
     #[must_use = "method returns a new number and does not mutate the original value"]
@@ -1248,7 +1248,7 @@ impl f32 {
     /// let one = x.erf() + x.erfc();
     /// let abs_difference = (one - 1.0).abs();
     ///
-    /// assert!(abs_difference <= f32::EPSILON);
+    /// assert!(abs_difference <= 1e-6);
     /// ```
     #[rustc_allow_incoherent_impl]
     #[must_use = "method returns a new number and does not mutate the original value"]
diff --git a/library/std/src/num/f64.rs b/library/std/src/num/f64.rs
index b71e319f4074f..76c45c5c2e806 100644
--- a/library/std/src/num/f64.rs
+++ b/library/std/src/num/f64.rs
@@ -1153,7 +1153,7 @@ impl f64 {
     ///
     /// let abs_difference = (x.gamma() - 24.0).abs();
     ///
-    /// assert!(abs_difference <= f64::EPSILON);
+    /// assert!(abs_difference <= 1e-13);
     /// ```
     #[rustc_allow_incoherent_impl]
     #[must_use = "method returns a new number and does not mutate the original value"]
@@ -1248,7 +1248,7 @@ impl f64 {
     /// let one = x.erf() + x.erfc();
     /// let abs_difference = (one - 1.0).abs();
     ///
-    /// assert!(abs_difference <= f64::EPSILON);
+    /// assert!(abs_difference <= 1e-13);
     /// ```
     #[rustc_allow_incoherent_impl]
     #[must_use = "method returns a new number and does not mutate the original value"]
diff --git a/library/std/tests/floats/f32.rs b/library/std/tests/floats/f32.rs
index 38c906c1d8771..bea9e8282a690 100644
--- a/library/std/tests/floats/f32.rs
+++ b/library/std/tests/floats/f32.rs
@@ -79,7 +79,7 @@ fn test_log() {
     let nan: f32 = f32::NAN;
     let inf: f32 = f32::INFINITY;
     let neg_inf: f32 = f32::NEG_INFINITY;
-    assert_approx_eq!(10.0f32.log(10.0), 1.0);
+    assert_approx_eq!(10.0f32.log(10.0), 1.0, APPROX_DELTA);
     assert_approx_eq!(2.3f32.log(3.5), 0.664858);
     assert_approx_eq!(1.0f32.exp().log(1.0f32.exp()), 1.0, APPROX_DELTA);
     assert!(1.0f32.log(1.0).is_nan());
@@ -140,10 +140,10 @@ fn test_asinh() {
     assert_approx_eq!(2.0f32.asinh(), 1.443635475178810342493276740273105f32);
     assert_approx_eq!((-2.0f32).asinh(), -1.443635475178810342493276740273105f32);
     // regression test for the catastrophic cancellation fixed in 72486
-    assert_approx_eq!((-3000.0f32).asinh(), -8.699514775987968673236893537700647f32);
+    assert_approx_eq!((-3000.0f32).asinh(), -8.699514775987968673236893537700647f32, APPROX_DELTA);
 
     // test for low accuracy from issue 104548
-    assert_approx_eq!(60.0f32, 60.0f32.sinh().asinh());
+    assert_approx_eq!(60.0f32, 60.0f32.sinh().asinh(), APPROX_DELTA);
     // mul needed for approximate comparison to be meaningful
     assert_approx_eq!(1.0f32, 1e-15f32.sinh().asinh() * 1e15f32);
 }
@@ -196,10 +196,10 @@ fn test_gamma() {
     assert_approx_eq!(1.0f32.gamma(), 1.0f32);
     assert_approx_eq!(2.0f32.gamma(), 1.0f32);
     assert_approx_eq!(3.0f32.gamma(), 2.0f32);
-    assert_approx_eq!(4.0f32.gamma(), 6.0f32);
-    assert_approx_eq!(5.0f32.gamma(), 24.0f32);
+    assert_approx_eq!(4.0f32.gamma(), 6.0f32, APPROX_DELTA);
+    assert_approx_eq!(5.0f32.gamma(), 24.0f32, APPROX_DELTA);
     assert_approx_eq!(0.5f32.gamma(), consts::PI.sqrt());
-    assert_approx_eq!((-0.5f32).gamma(), -2.0 * consts::PI.sqrt());
+    assert_approx_eq!((-0.5f32).gamma(), -2.0 * consts::PI.sqrt(), APPROX_DELTA);
     assert_eq!(0.0f32.gamma(), f32::INFINITY);
     assert_eq!((-0.0f32).gamma(), f32::NEG_INFINITY);
     assert!((-1.0f32).gamma().is_nan());
@@ -218,7 +218,7 @@ fn test_ln_gamma() {
     assert_eq!(2.0f32.ln_gamma().1, 1);
     assert_approx_eq!(3.0f32.ln_gamma().0, 2.0f32.ln());
     assert_eq!(3.0f32.ln_gamma().1, 1);
-    assert_approx_eq!((-0.5f32).ln_gamma().0, (2.0 * consts::PI.sqrt()).ln());
+    assert_approx_eq!((-0.5f32).ln_gamma().0, (2.0 * consts::PI.sqrt()).ln(), APPROX_DELTA);
     assert_eq!((-0.5f32).ln_gamma().1, -1);
 }
 
diff --git a/src/doc/unstable-book/src/compiler-flags/offload.md b/src/doc/unstable-book/src/compiler-flags/offload.md
new file mode 100644
index 0000000000000..4266e8c11a285
--- /dev/null
+++ b/src/doc/unstable-book/src/compiler-flags/offload.md
@@ -0,0 +1,8 @@
+# `offload`
+
+The tracking issue for this feature is: [#131513](https://github.com/rust-lang/rust/issues/131513).
+
+------------------------
+
+This feature will later allow you to run functions on GPUs. It is work in progress.
+Set the `-Zoffload=Enable` compiler flag to experiment with it.
diff --git a/src/tools/enzyme b/src/tools/enzyme
index b5098d515d5e1..2cccfba93c165 160000
--- a/src/tools/enzyme
+++ b/src/tools/enzyme
@@ -1 +1 @@
-Subproject commit b5098d515d5e1bd0f5470553bc0d18da9794ca8b
+Subproject commit 2cccfba93c1650f26f1cf8be8aa875a7c1d23fb3
diff --git a/src/tools/miri/src/intrinsics/mod.rs b/src/tools/miri/src/intrinsics/mod.rs
index 4efa7dd4dcf81..1ffba2726a2a9 100644
--- a/src/tools/miri/src/intrinsics/mod.rs
+++ b/src/tools/miri/src/intrinsics/mod.rs
@@ -3,20 +3,16 @@
 mod atomic;
 mod simd;
 
-use std::ops::Neg;
-
 use rand::Rng;
 use rustc_abi::Size;
-use rustc_apfloat::ieee::{IeeeFloat, Semantics};
 use rustc_apfloat::{self, Float, Round};
 use rustc_middle::mir;
-use rustc_middle::ty::{self, FloatTy, ScalarInt};
+use rustc_middle::ty::{self, FloatTy};
 use rustc_span::{Symbol, sym};
 
 use self::atomic::EvalContextExt as _;
 use self::helpers::{ToHost, ToSoft, check_intrinsic_arg_count};
 use self::simd::EvalContextExt as _;
-use crate::math::{IeeeExt, apply_random_float_error_ulp};
 use crate::*;
 
 impl<'tcx> EvalContextExt<'tcx> for crate::MiriInterpCx<'tcx> {}
@@ -191,7 +187,7 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
                 let [f] = check_intrinsic_arg_count(args)?;
                 let f = this.read_scalar(f)?.to_f32()?;
 
-                let res = fixed_float_value(this, intrinsic_name, &[f]).unwrap_or_else(|| {
+                let res = math::fixed_float_value(this, intrinsic_name, &[f]).unwrap_or_else(|| {
                     // Using host floats (but it's fine, these operations do not have
                     // guaranteed precision).
                     let host = f.to_host();
@@ -209,7 +205,7 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
 
                     // Apply a relative error of 4ULP to introduce some non-determinism
                     // simulating imprecise implementations and optimizations.
-                    let res = apply_random_float_error_ulp(
+                    let res = math::apply_random_float_error_ulp(
                         this,
                         res,
                         2, // log2(4)
@@ -217,7 +213,7 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
 
                     // Clamp the result to the guaranteed range of this function according to the C standard,
                     // if any.
-                    clamp_float_value(intrinsic_name, res)
+                    math::clamp_float_value(intrinsic_name, res)
                 });
                 let res = this.adjust_nan(res, &[f]);
                 this.write_scalar(res, dest)?;
@@ -235,7 +231,7 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
                 let [f] = check_intrinsic_arg_count(args)?;
                 let f = this.read_scalar(f)?.to_f64()?;
 
-                let res = fixed_float_value(this, intrinsic_name, &[f]).unwrap_or_else(|| {
+                let res = math::fixed_float_value(this, intrinsic_name, &[f]).unwrap_or_else(|| {
                     // Using host floats (but it's fine, these operations do not have
                     // guaranteed precision).
                     let host = f.to_host();
@@ -253,7 +249,7 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
 
                     // Apply a relative error of 4ULP to introduce some non-determinism
                     // simulating imprecise implementations and optimizations.
-                    let res = apply_random_float_error_ulp(
+                    let res = math::apply_random_float_error_ulp(
                         this,
                         res,
                         2, // log2(4)
@@ -261,7 +257,7 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
 
                     // Clamp the result to the guaranteed range of this function according to the C standard,
                     // if any.
-                    clamp_float_value(intrinsic_name, res)
+                    math::clamp_float_value(intrinsic_name, res)
                 });
                 let res = this.adjust_nan(res, &[f]);
                 this.write_scalar(res, dest)?;
@@ -312,16 +308,17 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
                 let f1 = this.read_scalar(f1)?.to_f32()?;
                 let f2 = this.read_scalar(f2)?.to_f32()?;
 
-                let res = fixed_float_value(this, intrinsic_name, &[f1, f2]).unwrap_or_else(|| {
-                    // Using host floats (but it's fine, this operation does not have guaranteed precision).
-                    let res = f1.to_host().powf(f2.to_host()).to_soft();
+                let res =
+                    math::fixed_float_value(this, intrinsic_name, &[f1, f2]).unwrap_or_else(|| {
+                        // Using host floats (but it's fine, this operation does not have guaranteed precision).
+                        let res = f1.to_host().powf(f2.to_host()).to_soft();
 
-                    // Apply a relative error of 4ULP to introduce some non-determinism
-                    // simulating imprecise implementations and optimizations.
-                    apply_random_float_error_ulp(
-                        this, res, 2, // log2(4)
-                    )
-                });
+                        // Apply a relative error of 4ULP to introduce some non-determinism
+                        // simulating imprecise implementations and optimizations.
+                        math::apply_random_float_error_ulp(
+                            this, res, 2, // log2(4)
+                        )
+                    });
                 let res = this.adjust_nan(res, &[f1, f2]);
                 this.write_scalar(res, dest)?;
             }
@@ -330,16 +327,17 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
                 let f1 = this.read_scalar(f1)?.to_f64()?;
                 let f2 = this.read_scalar(f2)?.to_f64()?;
 
-                let res = fixed_float_value(this, intrinsic_name, &[f1, f2]).unwrap_or_else(|| {
-                    // Using host floats (but it's fine, this operation does not have guaranteed precision).
-                    let res = f1.to_host().powf(f2.to_host()).to_soft();
+                let res =
+                    math::fixed_float_value(this, intrinsic_name, &[f1, f2]).unwrap_or_else(|| {
+                        // Using host floats (but it's fine, this operation does not have guaranteed precision).
+                        let res = f1.to_host().powf(f2.to_host()).to_soft();
 
-                    // Apply a relative error of 4ULP to introduce some non-determinism
-                    // simulating imprecise implementations and optimizations.
-                    apply_random_float_error_ulp(
-                        this, res, 2, // log2(4)
-                    )
-                });
+                        // Apply a relative error of 4ULP to introduce some non-determinism
+                        // simulating imprecise implementations and optimizations.
+                        math::apply_random_float_error_ulp(
+                            this, res, 2, // log2(4)
+                        )
+                    });
                 let res = this.adjust_nan(res, &[f1, f2]);
                 this.write_scalar(res, dest)?;
             }
@@ -349,13 +347,13 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
                 let f = this.read_scalar(f)?.to_f32()?;
                 let i = this.read_scalar(i)?.to_i32()?;
 
-                let res = fixed_powi_float_value(this, f, i).unwrap_or_else(|| {
+                let res = math::fixed_powi_value(this, f, i).unwrap_or_else(|| {
                     // Using host floats (but it's fine, this operation does not have guaranteed precision).
                     let res = f.to_host().powi(i).to_soft();
 
                     // Apply a relative error of 4ULP to introduce some non-determinism
                     // simulating imprecise implementations and optimizations.
-                    apply_random_float_error_ulp(
+                    math::apply_random_float_error_ulp(
                         this, res, 2, // log2(4)
                     )
                 });
@@ -367,13 +365,13 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
                 let f = this.read_scalar(f)?.to_f64()?;
                 let i = this.read_scalar(i)?.to_i32()?;
 
-                let res = fixed_powi_float_value(this, f, i).unwrap_or_else(|| {
+                let res = math::fixed_powi_value(this, f, i).unwrap_or_else(|| {
                     // Using host floats (but it's fine, this operation does not have guaranteed precision).
                     let res = f.to_host().powi(i).to_soft();
 
                     // Apply a relative error of 4ULP to introduce some non-determinism
                     // simulating imprecise implementations and optimizations.
-                    apply_random_float_error_ulp(
+                    math::apply_random_float_error_ulp(
                         this, res, 2, // log2(4)
                     )
                 });
@@ -430,7 +428,7 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
                 }
                 // Apply a relative error of 4ULP to simulate non-deterministic precision loss
                 // due to optimizations.
-                let res = apply_random_float_error_to_imm(this, res, 2 /* log2(4) */)?;
+                let res = math::apply_random_float_error_to_imm(this, res, 2 /* log2(4) */)?;
                 this.write_immediate(*res, dest)?;
             }
 
@@ -467,133 +465,3 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
         interp_ok(EmulateItemResult::NeedsReturn)
     }
 }
-
-/// Applies a random ULP floating point error to `val` and returns the new value.
-/// So if you want an X ULP error, `ulp_exponent` should be log2(X).
-///
-/// Will fail if `val` is not a floating point number.
-fn apply_random_float_error_to_imm<'tcx>(
-    ecx: &mut MiriInterpCx<'tcx>,
-    val: ImmTy<'tcx>,
-    ulp_exponent: u32,
-) -> InterpResult<'tcx, ImmTy<'tcx>> {
-    let scalar = val.to_scalar_int()?;
-    let res: ScalarInt = match val.layout.ty.kind() {
-        ty::Float(FloatTy::F16) =>
-            apply_random_float_error_ulp(ecx, scalar.to_f16(), ulp_exponent).into(),
-        ty::Float(FloatTy::F32) =>
-            apply_random_float_error_ulp(ecx, scalar.to_f32(), ulp_exponent).into(),
-        ty::Float(FloatTy::F64) =>
-            apply_random_float_error_ulp(ecx, scalar.to_f64(), ulp_exponent).into(),
-        ty::Float(FloatTy::F128) =>
-            apply_random_float_error_ulp(ecx, scalar.to_f128(), ulp_exponent).into(),
-        _ => bug!("intrinsic called with non-float input type"),
-    };
-
-    interp_ok(ImmTy::from_scalar_int(res, val.layout))
-}
-
-/// For the intrinsics:
-/// - sinf32, sinf64
-/// - cosf32, cosf64
-/// - expf32, expf64, exp2f32, exp2f64
-/// - logf32, logf64, log2f32, log2f64, log10f32, log10f64
-/// - powf32, powf64
-///
-/// # Return
-///
-/// Returns `Some(output)` if the `intrinsic` results in a defined fixed `output` specified in the C standard
-/// (specifically, C23 annex F.10)  when given `args` as arguments. Outputs that are unaffected by a relative error
-/// (such as INF and zero) are not handled here, they are assumed to be handled by the underlying
-/// implementation. Returns `None` if no specific value is guaranteed.
-///
-/// # Note
-///
-/// For `powf*` operations of the form:
-///
-/// - `(SNaN)^(±0)`
-/// - `1^(SNaN)`
-///
-/// The result is implementation-defined:
-/// - musl returns for both `1.0`
-/// - glibc returns for both `NaN`
-///
-/// This discrepancy exists because SNaN handling is not consistently defined across platforms,
-/// and the C standard leaves behavior for SNaNs unspecified.
-///
-/// Miri chooses to adhere to both implementations and returns either one of them non-deterministically.
-fn fixed_float_value<S: Semantics>(
-    ecx: &mut MiriInterpCx<'_>,
-    intrinsic_name: &str,
-    args: &[IeeeFloat<S>],
-) -> Option<IeeeFloat<S>> {
-    let one = IeeeFloat::<S>::one();
-    Some(match (intrinsic_name, args) {
-        // cos(+- 0) = 1
-        ("cosf32" | "cosf64", [input]) if input.is_zero() => one,
-
-        // e^0 = 1
-        ("expf32" | "expf64" | "exp2f32" | "exp2f64", [input]) if input.is_zero() => one,
-
-        // (-1)^(±INF) = 1
-        ("powf32" | "powf64", [base, exp]) if *base == -one && exp.is_infinite() => one,
-
-        // 1^y = 1 for any y, even a NaN
-        ("powf32" | "powf64", [base, exp]) if *base == one => {
-            let rng = ecx.machine.rng.get_mut();
-            // SNaN exponents get special treatment: they might return 1, or a NaN.
-            let return_nan = exp.is_signaling() && ecx.machine.float_nondet && rng.random();
-            // Handle both the musl and glibc cases non-deterministically.
-            if return_nan { ecx.generate_nan(args) } else { one }
-        }
-
-        // x^(±0) = 1 for any x, even a NaN
-        ("powf32" | "powf64", [base, exp]) if exp.is_zero() => {
-            let rng = ecx.machine.rng.get_mut();
-            // SNaN bases get special treatment: they might return 1, or a NaN.
-            let return_nan = base.is_signaling() && ecx.machine.float_nondet && rng.random();
-            // Handle both the musl and glibc cases non-deterministically.
-            if return_nan { ecx.generate_nan(args) } else { one }
-        }
-
-        // There are a lot of cases for fixed outputs according to the C Standard, but these are
-        // mainly INF or zero which are not affected by the applied error.
-        _ => return None,
-    })
-}
-
-/// Returns `Some(output)` if `powi` (called `pown` in C) results in a fixed value specified in the
-/// C standard (specifically, C23 annex F.10.4.6) when doing `base^exp`. Otherwise, returns `None`.
-fn fixed_powi_float_value<S: Semantics>(
-    ecx: &mut MiriInterpCx<'_>,
-    base: IeeeFloat<S>,
-    exp: i32,
-) -> Option<IeeeFloat<S>> {
-    Some(match exp {
-        0 => {
-            let one = IeeeFloat::<S>::one();
-            let rng = ecx.machine.rng.get_mut();
-            let return_nan = ecx.machine.float_nondet && rng.random() && base.is_signaling();
-            // For SNaN treatment, we are consistent with `powf`above.
-            // (We wouldn't have two, unlike powf all implementations seem to agree for powi,
-            // but for now we are maximally conservative.)
-            if return_nan { ecx.generate_nan(&[base]) } else { one }
-        }
-
-        _ => return None,
-    })
-}
-
-/// Given an floating-point operation and a floating-point value, clamps the result to the output
-/// range of the given operation.
-fn clamp_float_value<S: Semantics>(intrinsic_name: &str, val: IeeeFloat<S>) -> IeeeFloat<S> {
-    match intrinsic_name {
-        // sin and cos: [-1, 1]
-        "sinf32" | "cosf32" | "sinf64" | "cosf64" =>
-            val.clamp(IeeeFloat::<S>::one().neg(), IeeeFloat::<S>::one()),
-        // exp: [0, +INF]
-        "expf32" | "exp2f32" | "expf64" | "exp2f64" =>
-            IeeeFloat::<S>::maximum(val, IeeeFloat::<S>::ZERO),
-        _ => val,
-    }
-}
diff --git a/src/tools/miri/src/lib.rs b/src/tools/miri/src/lib.rs
index a591d21071db6..c9086922a51fe 100644
--- a/src/tools/miri/src/lib.rs
+++ b/src/tools/miri/src/lib.rs
@@ -16,6 +16,8 @@
 #![feature(derive_coerce_pointee)]
 #![feature(arbitrary_self_types)]
 #![feature(iter_advance_by)]
+#![feature(f16)]
+#![feature(f128)]
 // Configure clippy and other lints
 #![allow(
     clippy::collapsible_else_if,
diff --git a/src/tools/miri/src/math.rs b/src/tools/miri/src/math.rs
index cf16a5676d680..6a86e4276f253 100644
--- a/src/tools/miri/src/math.rs
+++ b/src/tools/miri/src/math.rs
@@ -1,6 +1,9 @@
+use std::ops::Neg;
+use std::{f16, f32, f64, f128};
+
 use rand::Rng as _;
 use rustc_apfloat::Float as _;
-use rustc_apfloat::ieee::IeeeFloat;
+use rustc_apfloat::ieee::{DoubleS, HalfS, IeeeFloat, QuadS, Semantics, SingleS};
 use rustc_middle::ty::{self, FloatTy, ScalarInt};
 
 use crate::*;
@@ -50,29 +53,249 @@ pub(crate) fn apply_random_float_error_ulp<F: rustc_apfloat::Float>(
     apply_random_float_error(ecx, val, err_scale)
 }
 
-/// Applies a random 16ULP floating point error to `val` and returns the new value.
+/// Applies a random ULP floating point error to `val` and returns the new value.
+/// So if you want an X ULP error, `ulp_exponent` should be log2(X).
+///
 /// Will fail if `val` is not a floating point number.
 pub(crate) fn apply_random_float_error_to_imm<'tcx>(
     ecx: &mut MiriInterpCx<'tcx>,
     val: ImmTy<'tcx>,
     ulp_exponent: u32,
 ) -> InterpResult<'tcx, ImmTy<'tcx>> {
+    let this = ecx.eval_context_mut();
     let scalar = val.to_scalar_int()?;
     let res: ScalarInt = match val.layout.ty.kind() {
         ty::Float(FloatTy::F16) =>
-            apply_random_float_error_ulp(ecx, scalar.to_f16(), ulp_exponent).into(),
+            apply_random_float_error_ulp(this, scalar.to_f16(), ulp_exponent).into(),
         ty::Float(FloatTy::F32) =>
-            apply_random_float_error_ulp(ecx, scalar.to_f32(), ulp_exponent).into(),
+            apply_random_float_error_ulp(this, scalar.to_f32(), ulp_exponent).into(),
         ty::Float(FloatTy::F64) =>
-            apply_random_float_error_ulp(ecx, scalar.to_f64(), ulp_exponent).into(),
+            apply_random_float_error_ulp(this, scalar.to_f64(), ulp_exponent).into(),
         ty::Float(FloatTy::F128) =>
-            apply_random_float_error_ulp(ecx, scalar.to_f128(), ulp_exponent).into(),
+            apply_random_float_error_ulp(this, scalar.to_f128(), ulp_exponent).into(),
         _ => bug!("intrinsic called with non-float input type"),
     };
 
     interp_ok(ImmTy::from_scalar_int(res, val.layout))
 }
 
+/// Given a floating-point operation and a floating-point value, clamps the result to the output
+/// range of the given operation according to the C standard, if any.
+pub(crate) fn clamp_float_value<S: Semantics>(
+    intrinsic_name: &str,
+    val: IeeeFloat<S>,
+) -> IeeeFloat<S>
+where
+    IeeeFloat<S>: IeeeExt,
+{
+    let zero = IeeeFloat::<S>::ZERO;
+    let one = IeeeFloat::<S>::one();
+    let two = IeeeFloat::<S>::two();
+    let pi = IeeeFloat::<S>::pi();
+    let pi_over_2 = (pi / two).value;
+
+    match intrinsic_name {
+        // sin, cos, tanh: [-1, 1]
+        #[rustfmt::skip]
+        | "sinf32"
+        | "sinf64"
+        | "cosf32"
+        | "cosf64"
+        | "tanhf"
+        | "tanh"
+         => val.clamp(one.neg(), one),
+
+        // exp: [0, +INF)
+        "expf32" | "exp2f32" | "expf64" | "exp2f64" => val.maximum(zero),
+
+        // cosh: [1, +INF)
+        "coshf" | "cosh" => val.maximum(one),
+
+        // acos: [0, π]
+        "acosf" | "acos" => val.clamp(zero, pi),
+
+        // asin: [-π, +π]
+        "asinf" | "asin" => val.clamp(pi.neg(), pi),
+
+        // atan: (-π/2, +π/2)
+        "atanf" | "atan" => val.clamp(pi_over_2.neg(), pi_over_2),
+
+        // erfc: (-1, 1)
+        "erff" | "erf" => val.clamp(one.neg(), one),
+
+        // erfc: (0, 2)
+        "erfcf" | "erfc" => val.clamp(zero, two),
+
+        // atan2(y, x): arctan(y/x) in [−π, +π]
+        "atan2f" | "atan2" => val.clamp(pi.neg(), pi),
+
+        _ => val,
+    }
+}
+
+/// For the intrinsics:
+/// - sinf32, sinf64, sinhf, sinh
+/// - cosf32, cosf64, coshf, cosh
+/// - tanhf, tanh, atanf, atan, atan2f, atan2
+/// - expf32, expf64, exp2f32, exp2f64
+/// - logf32, logf64, log2f32, log2f64, log10f32, log10f64
+/// - powf32, powf64
+/// - erff, erf, erfcf, erfc
+/// - hypotf, hypot
+///
+/// # Return
+///
+/// Returns `Some(output)` if the `intrinsic` results in a defined fixed `output` specified in the C standard
+/// (specifically, C23 annex F.10)  when given `args` as arguments. Outputs that are unaffected by a relative error
+/// (such as INF and zero) are not handled here, they are assumed to be handled by the underlying
+/// implementation. Returns `None` if no specific value is guaranteed.
+///
+/// # Note
+///
+/// For `powf*` operations of the form:
+///
+/// - `(SNaN)^(±0)`
+/// - `1^(SNaN)`
+///
+/// The result is implementation-defined:
+/// - musl returns for both `1.0`
+/// - glibc returns for both `NaN`
+///
+/// This discrepancy exists because SNaN handling is not consistently defined across platforms,
+/// and the C standard leaves behavior for SNaNs unspecified.
+///
+/// Miri chooses to adhere to both implementations and returns either one of them non-deterministically.
+pub(crate) fn fixed_float_value<S: Semantics>(
+    ecx: &mut MiriInterpCx<'_>,
+    intrinsic_name: &str,
+    args: &[IeeeFloat<S>],
+) -> Option<IeeeFloat<S>>
+where
+    IeeeFloat<S>: IeeeExt,
+{
+    let this = ecx.eval_context_mut();
+    let one = IeeeFloat::<S>::one();
+    let two = IeeeFloat::<S>::two();
+    let three = IeeeFloat::<S>::three();
+    let pi = IeeeFloat::<S>::pi();
+    let pi_over_2 = (pi / two).value;
+    let pi_over_4 = (pi_over_2 / two).value;
+
+    Some(match (intrinsic_name, args) {
+        // cos(±0) and cosh(±0)= 1
+        ("cosf32" | "cosf64" | "coshf" | "cosh", [input]) if input.is_zero() => one,
+
+        // e^0 = 1
+        ("expf32" | "expf64" | "exp2f32" | "exp2f64", [input]) if input.is_zero() => one,
+
+        // tanh(±INF) = ±1
+        ("tanhf" | "tanh", [input]) if input.is_infinite() => one.copy_sign(*input),
+
+        // atan(±INF) = ±π/2
+        ("atanf" | "atan", [input]) if input.is_infinite() => pi_over_2.copy_sign(*input),
+
+        // erf(±INF) = ±1
+        ("erff" | "erf", [input]) if input.is_infinite() => one.copy_sign(*input),
+
+        // erfc(-INF) = 2
+        ("erfcf" | "erfc", [input]) if input.is_neg_infinity() => (one + one).value,
+
+        // hypot(x, ±0) = abs(x), if x is not a NaN.
+        ("_hypotf" | "hypotf" | "_hypot" | "hypot", [x, y]) if !x.is_nan() && y.is_zero() =>
+            x.abs(),
+
+        // atan2(±0,−0) = ±π.
+        // atan2(±0, y) = ±π for y < 0.
+        // Must check for non NaN because `y.is_negative()` also applies to NaN.
+        ("atan2f" | "atan2", [x, y]) if (x.is_zero() && (y.is_negative() && !y.is_nan())) =>
+            pi.copy_sign(*x),
+
+        // atan2(±x,−∞) = ±π for finite x > 0.
+        ("atan2f" | "atan2", [x, y])
+            if (!x.is_zero() && !x.is_infinite()) && y.is_neg_infinity() =>
+            pi.copy_sign(*x),
+
+        // atan2(x, ±0) = −π/2 for x < 0.
+        // atan2(x, ±0) =  π/2 for x > 0.
+        ("atan2f" | "atan2", [x, y]) if !x.is_zero() && y.is_zero() => pi_over_2.copy_sign(*x),
+
+        //atan2(±∞, −∞) = ±3π/4
+        ("atan2f" | "atan2", [x, y]) if x.is_infinite() && y.is_neg_infinity() =>
+            (pi_over_4 * three).value.copy_sign(*x),
+
+        //atan2(±∞, +∞) = ±π/4
+        ("atan2f" | "atan2", [x, y]) if x.is_infinite() && y.is_pos_infinity() =>
+            pi_over_4.copy_sign(*x),
+
+        // atan2(±∞, y) returns ±π/2 for finite y.
+        ("atan2f" | "atan2", [x, y]) if x.is_infinite() && (!y.is_infinite() && !y.is_nan()) =>
+            pi_over_2.copy_sign(*x),
+
+        // (-1)^(±INF) = 1
+        ("powf32" | "powf64", [base, exp]) if *base == -one && exp.is_infinite() => one,
+
+        // 1^y = 1 for any y, even a NaN
+        ("powf32" | "powf64", [base, exp]) if *base == one => {
+            let rng = this.machine.rng.get_mut();
+            // SNaN exponents get special treatment: they might return 1, or a NaN.
+            let return_nan = exp.is_signaling() && this.machine.float_nondet && rng.random();
+            // Handle both the musl and glibc cases non-deterministically.
+            if return_nan { this.generate_nan(args) } else { one }
+        }
+
+        // x^(±0) = 1 for any x, even a NaN
+        ("powf32" | "powf64", [base, exp]) if exp.is_zero() => {
+            let rng = this.machine.rng.get_mut();
+            // SNaN bases get special treatment: they might return 1, or a NaN.
+            let return_nan = base.is_signaling() && this.machine.float_nondet && rng.random();
+            // Handle both the musl and glibc cases non-deterministically.
+            if return_nan { this.generate_nan(args) } else { one }
+        }
+
+        // There are a lot of cases for fixed outputs according to the C Standard, but these are
+        // mainly INF or zero which are not affected by the applied error.
+        _ => return None,
+    })
+}
+
+/// Returns `Some(output)` if `powi` (called `pown` in C) results in a fixed value specified in the
+/// C standard (specifically, C23 annex F.10.4.6) when doing `base^exp`. Otherwise, returns `None`.
+pub(crate) fn fixed_powi_value<S: Semantics>(
+    ecx: &mut MiriInterpCx<'_>,
+    base: IeeeFloat<S>,
+    exp: i32,
+) -> Option<IeeeFloat<S>>
+where
+    IeeeFloat<S>: IeeeExt,
+{
+    match exp {
+        0 => {
+            let one = IeeeFloat::<S>::one();
+            let rng = ecx.machine.rng.get_mut();
+            let return_nan = ecx.machine.float_nondet && rng.random() && base.is_signaling();
+            // For SNaN treatment, we are consistent with `powf`above.
+            // (We wouldn't have two, unlike powf all implementations seem to agree for powi,
+            // but for now we are maximally conservative.)
+            Some(if return_nan { ecx.generate_nan(&[base]) } else { one })
+        }
+
+        _ => return None,
+    }
+}
+
+/// Returns `Some(output)` if `scalbn` (called `ldexp` in C) results in a fixed value specified in the
+/// C standard (specifically, C23 annex F.10.3.19) when doing `arg*(2^exp)`. Otherwise, returns `None`.
+pub(crate) fn fixed_scalbn_value<S: Semantics>(
+    arg: IeeeFloat<S>,
+    exp: i32,
+) -> Option<IeeeFloat<S>> {
+    match exp {
+        // ldexp(x, 0) = x.
+        0 => Some(arg),
+        _ => return None,
+    }
+}
+
 pub(crate) fn sqrt<S: rustc_apfloat::ieee::Semantics>(x: IeeeFloat<S>) -> IeeeFloat<S> {
     match x.category() {
         // preserve zero sign
@@ -155,19 +378,49 @@ pub(crate) fn sqrt<S: rustc_apfloat::ieee::Semantics>(x: IeeeFloat<S>) -> IeeeFl
     }
 }
 
-/// Extend functionality of rustc_apfloat softfloats
+/// Extend functionality of `rustc_apfloat` softfloats for IEEE float types.
 pub trait IeeeExt: rustc_apfloat::Float {
+    // Some values we use:
+
     #[inline]
     fn one() -> Self {
         Self::from_u128(1).value
     }
 
+    #[inline]
+    fn two() -> Self {
+        Self::from_u128(2).value
+    }
+
+    #[inline]
+    fn three() -> Self {
+        Self::from_u128(3).value
+    }
+
+    fn pi() -> Self;
+
     #[inline]
     fn clamp(self, min: Self, max: Self) -> Self {
         self.maximum(min).minimum(max)
     }
 }
-impl<S: rustc_apfloat::ieee::Semantics> IeeeExt for IeeeFloat<S> {}
+
+macro_rules! impl_ieee_pi {
+    ($float_ty:ident, $semantic:ty) => {
+        impl IeeeExt for IeeeFloat<$semantic> {
+            #[inline]
+            fn pi() -> Self {
+                // We take the value from the standard library as the most reasonable source for an exact π here.
+                Self::from_bits($float_ty::consts::PI.to_bits() as _)
+            }
+        }
+    };
+}
+
+impl_ieee_pi!(f16, HalfS);
+impl_ieee_pi!(f32, SingleS);
+impl_ieee_pi!(f64, DoubleS);
+impl_ieee_pi!(f128, QuadS);
 
 #[cfg(test)]
 mod tests {
diff --git a/src/tools/miri/src/shims/foreign_items.rs b/src/tools/miri/src/shims/foreign_items.rs
index 9ddba8c2b48d8..907a858a15f6c 100644
--- a/src/tools/miri/src/shims/foreign_items.rs
+++ b/src/tools/miri/src/shims/foreign_items.rs
@@ -17,6 +17,7 @@ use rustc_target::callconv::FnAbi;
 use self::helpers::{ToHost, ToSoft};
 use super::alloc::EvalContextExt as _;
 use super::backtrace::EvalContextExt as _;
+use crate::helpers::EvalContextExt as _;
 use crate::*;
 
 /// Type of dynamic symbols (for `dlsym` et al)
@@ -766,33 +767,38 @@ trait EvalContextExtPriv<'tcx>: crate::MiriInterpCxExt<'tcx> {
             => {
                 let [f] = this.check_shim(abi, CanonAbi::C , link_name, args)?;
                 let f = this.read_scalar(f)?.to_f32()?;
-                // Using host floats (but it's fine, these operations do not have guaranteed precision).
-                let f_host = f.to_host();
-                let res = match link_name.as_str() {
-                    "cbrtf" => f_host.cbrt(),
-                    "coshf" => f_host.cosh(),
-                    "sinhf" => f_host.sinh(),
-                    "tanf" => f_host.tan(),
-                    "tanhf" => f_host.tanh(),
-                    "acosf" => f_host.acos(),
-                    "asinf" => f_host.asin(),
-                    "atanf" => f_host.atan(),
-                    "log1pf" => f_host.ln_1p(),
-                    "expm1f" => f_host.exp_m1(),
-                    "tgammaf" => f_host.gamma(),
-                    "erff" => f_host.erf(),
-                    "erfcf" => f_host.erfc(),
-                    _ => bug!(),
-                };
-                let res = res.to_soft();
-                // Apply a relative error of 16ULP to introduce some non-determinism
-                // simulating imprecise implementations and optimizations.
-                // FIXME: temporarily disabled as it breaks std tests.
-                // let res = math::apply_random_float_error_ulp(
-                //     this,
-                //     res,
-                //     4, // log2(16)
-                // );
+
+                let res = math::fixed_float_value(this, link_name.as_str(), &[f]).unwrap_or_else(|| {
+                    // Using host floats (but it's fine, these operations do not have
+                    // guaranteed precision).
+                    let f_host = f.to_host();
+                    let res = match link_name.as_str() {
+                        "cbrtf" => f_host.cbrt(),
+                        "coshf" => f_host.cosh(),
+                        "sinhf" => f_host.sinh(),
+                        "tanf" => f_host.tan(),
+                        "tanhf" => f_host.tanh(),
+                        "acosf" => f_host.acos(),
+                        "asinf" => f_host.asin(),
+                        "atanf" => f_host.atan(),
+                        "log1pf" => f_host.ln_1p(),
+                        "expm1f" => f_host.exp_m1(),
+                        "tgammaf" => f_host.gamma(),
+                        "erff" => f_host.erf(),
+                        "erfcf" => f_host.erfc(),
+                        _ => bug!(),
+                    };
+                    let res = res.to_soft();
+                    // Apply a relative error of 4ULP to introduce some non-determinism
+                    // simulating imprecise implementations and optimizations.
+                    let res = math::apply_random_float_error_ulp(
+                        this, res, 2, // log2(4)
+                    );
+
+                    // Clamp the result to the guaranteed range of this function according to the C standard,
+                    // if any.
+                    math::clamp_float_value(link_name.as_str(), res)
+                });
                 let res = this.adjust_nan(res, &[f]);
                 this.write_scalar(res, dest)?;
             }
@@ -805,24 +811,28 @@ trait EvalContextExtPriv<'tcx>: crate::MiriInterpCxExt<'tcx> {
                 let [f1, f2] = this.check_shim(abi, CanonAbi::C , link_name, args)?;
                 let f1 = this.read_scalar(f1)?.to_f32()?;
                 let f2 = this.read_scalar(f2)?.to_f32()?;
-                // underscore case for windows, here and below
-                // (see https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/floating-point-primitives?view=vs-2019)
-                // Using host floats (but it's fine, these operations do not have guaranteed precision).
-                let res = match link_name.as_str() {
-                    "_hypotf" | "hypotf" => f1.to_host().hypot(f2.to_host()).to_soft(),
-                    "atan2f" => f1.to_host().atan2(f2.to_host()).to_soft(),
-                    #[allow(deprecated)]
-                    "fdimf" => f1.to_host().abs_sub(f2.to_host()).to_soft(),
-                    _ => bug!(),
-                };
-                // Apply a relative error of 16ULP to introduce some non-determinism
-                // simulating imprecise implementations and optimizations.
-                // FIXME: temporarily disabled as it breaks std tests.
-                // let res = math::apply_random_float_error_ulp(
-                //     this,
-                //     res,
-                //     4, // log2(16)
-                // );
+
+                let res = math::fixed_float_value(this, link_name.as_str(), &[f1, f2]).unwrap_or_else(|| {
+                    let res = match link_name.as_str() {
+                        // underscore case for windows, here and below
+                        // (see https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/floating-point-primitives?view=vs-2019)
+                        // Using host floats (but it's fine, these operations do not have guaranteed precision).
+                        "_hypotf" | "hypotf" => f1.to_host().hypot(f2.to_host()).to_soft(),
+                        "atan2f" => f1.to_host().atan2(f2.to_host()).to_soft(),
+                        #[allow(deprecated)]
+                        "fdimf" => f1.to_host().abs_sub(f2.to_host()).to_soft(),
+                        _ => bug!(),
+                    };
+                    // Apply a relative error of 4ULP to introduce some non-determinism
+                    // simulating imprecise implementations and optimizations.
+                    let res = math::apply_random_float_error_ulp(
+                        this, res, 2, // log2(4)
+                    );
+
+                    // Clamp the result to the guaranteed range of this function according to the C standard,
+                    // if any.
+                    math::clamp_float_value(link_name.as_str(), res)
+                });
                 let res = this.adjust_nan(res, &[f1, f2]);
                 this.write_scalar(res, dest)?;
             }
@@ -843,33 +853,38 @@ trait EvalContextExtPriv<'tcx>: crate::MiriInterpCxExt<'tcx> {
             => {
                 let [f] = this.check_shim(abi, CanonAbi::C , link_name, args)?;
                 let f = this.read_scalar(f)?.to_f64()?;
-                // Using host floats (but it's fine, these operations do not have guaranteed precision).
-                let f_host = f.to_host();
-                let res = match link_name.as_str() {
-                    "cbrt" => f_host.cbrt(),
-                    "cosh" => f_host.cosh(),
-                    "sinh" => f_host.sinh(),
-                    "tan" => f_host.tan(),
-                    "tanh" => f_host.tanh(),
-                    "acos" => f_host.acos(),
-                    "asin" => f_host.asin(),
-                    "atan" => f_host.atan(),
-                    "log1p" => f_host.ln_1p(),
-                    "expm1" => f_host.exp_m1(),
-                    "tgamma" => f_host.gamma(),
-                    "erf" => f_host.erf(),
-                    "erfc" => f_host.erfc(),
-                    _ => bug!(),
-                };
-                let res = res.to_soft();
-                // Apply a relative error of 16ULP to introduce some non-determinism
-                // simulating imprecise implementations and optimizations.
-                // FIXME: temporarily disabled as it breaks std tests.
-                // let res = math::apply_random_float_error_ulp(
-                //     this,
-                //     res.to_soft(),
-                //     4, // log2(16)
-                // );
+
+                let res = math::fixed_float_value(this, link_name.as_str(), &[f]).unwrap_or_else(|| {
+                    // Using host floats (but it's fine, these operations do not have
+                    // guaranteed precision).
+                    let f_host = f.to_host();
+                    let res = match link_name.as_str() {
+                        "cbrt" => f_host.cbrt(),
+                        "cosh" => f_host.cosh(),
+                        "sinh" => f_host.sinh(),
+                        "tan" => f_host.tan(),
+                        "tanh" => f_host.tanh(),
+                        "acos" => f_host.acos(),
+                        "asin" => f_host.asin(),
+                        "atan" => f_host.atan(),
+                        "log1p" => f_host.ln_1p(),
+                        "expm1" => f_host.exp_m1(),
+                        "tgamma" => f_host.gamma(),
+                        "erf" => f_host.erf(),
+                        "erfc" => f_host.erfc(),
+                        _ => bug!(),
+                    };
+                    let res = res.to_soft();
+                    // Apply a relative error of 4ULP to introduce some non-determinism
+                    // simulating imprecise implementations and optimizations.
+                    let res = math::apply_random_float_error_ulp(
+                        this, res, 2, // log2(4)
+                    );
+
+                    // Clamp the result to the guaranteed range of this function according to the C standard,
+                    // if any.
+                    math::clamp_float_value(link_name.as_str(), res)
+                });
                 let res = this.adjust_nan(res, &[f]);
                 this.write_scalar(res, dest)?;
             }
@@ -882,24 +897,28 @@ trait EvalContextExtPriv<'tcx>: crate::MiriInterpCxExt<'tcx> {
                 let [f1, f2] = this.check_shim(abi, CanonAbi::C , link_name, args)?;
                 let f1 = this.read_scalar(f1)?.to_f64()?;
                 let f2 = this.read_scalar(f2)?.to_f64()?;
-                // underscore case for windows, here and below
-                // (see https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/floating-point-primitives?view=vs-2019)
-                // Using host floats (but it's fine, these operations do not have guaranteed precision).
-                let res = match link_name.as_str() {
-                    "_hypot" | "hypot" => f1.to_host().hypot(f2.to_host()).to_soft(),
-                    "atan2" => f1.to_host().atan2(f2.to_host()).to_soft(),
-                    #[allow(deprecated)]
-                    "fdim" => f1.to_host().abs_sub(f2.to_host()).to_soft(),
-                    _ => bug!(),
-                };
-                // Apply a relative error of 16ULP to introduce some non-determinism
-                // simulating imprecise implementations and optimizations.
-                // FIXME: temporarily disabled as it breaks std tests.
-                // let res = math::apply_random_float_error_ulp(
-                //     this,
-                //     res,
-                //     4, // log2(16)
-                // );
+
+                let res = math::fixed_float_value(this, link_name.as_str(), &[f1, f2]).unwrap_or_else(|| {
+                    let res = match link_name.as_str() {
+                        // underscore case for windows, here and below
+                        // (see https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/floating-point-primitives?view=vs-2019)
+                        // Using host floats (but it's fine, these operations do not have guaranteed precision).
+                        "_hypot" | "hypot" => f1.to_host().hypot(f2.to_host()).to_soft(),
+                        "atan2" => f1.to_host().atan2(f2.to_host()).to_soft(),
+                        #[allow(deprecated)]
+                        "fdim" => f1.to_host().abs_sub(f2.to_host()).to_soft(),
+                        _ => bug!(),
+                    };
+                    // Apply a relative error of 4ULP to introduce some non-determinism
+                    // simulating imprecise implementations and optimizations.
+                    let res = math::apply_random_float_error_ulp(
+                        this, res, 2, // log2(4)
+                    );
+
+                    // Clamp the result to the guaranteed range of this function according to the C standard,
+                    // if any.
+                    math::clamp_float_value(link_name.as_str(), res)
+                });
                 let res = this.adjust_nan(res, &[f1, f2]);
                 this.write_scalar(res, dest)?;
             }
@@ -913,7 +932,13 @@ trait EvalContextExtPriv<'tcx>: crate::MiriInterpCxExt<'tcx> {
                 let x = this.read_scalar(x)?.to_f64()?;
                 let exp = this.read_scalar(exp)?.to_i32()?;
 
-                let res = x.scalbn(exp);
+                let res =
+                    math::fixed_scalbn_value(x, exp).unwrap_or_else(|| {
+                        let res = x.scalbn(exp);
+                        // Apply a relative error of 4ULP to introduce some non-determinism
+                        // simulating imprecise implementations and optimizations.
+                        math::apply_random_float_error_ulp(this, res, 2 /* log2(4) */)
+                    });
                 let res = this.adjust_nan(res, &[x]);
                 this.write_scalar(res, dest)?;
             }
@@ -925,11 +950,14 @@ trait EvalContextExtPriv<'tcx>: crate::MiriInterpCxExt<'tcx> {
                 // Using host floats (but it's fine, these operations do not have guaranteed precision).
                 let (res, sign) = x.to_host().ln_gamma();
                 this.write_int(sign, &signp)?;
+
                 let res = res.to_soft();
-                // Apply a relative error of 16ULP to introduce some non-determinism
+                // Apply a relative error of 4ULP to introduce some non-determinism
                 // simulating imprecise implementations and optimizations.
-                // FIXME: temporarily disabled as it breaks std tests.
-                // let res = math::apply_random_float_error_ulp(this, res, 4 /* log2(16) */);
+                let res = math::apply_random_float_error_ulp(this, res, 2 /* log2(4) */);
+                // Clamp the result to the guaranteed range of this function according to the C standard,
+                // if any.
+                let res = math::clamp_float_value(link_name.as_str(), res);
                 let res = this.adjust_nan(res, &[x]);
                 this.write_scalar(res, dest)?;
             }
@@ -941,11 +969,14 @@ trait EvalContextExtPriv<'tcx>: crate::MiriInterpCxExt<'tcx> {
                 // Using host floats (but it's fine, these operations do not have guaranteed precision).
                 let (res, sign) = x.to_host().ln_gamma();
                 this.write_int(sign, &signp)?;
+
                 let res = res.to_soft();
-                // Apply a relative error of 16ULP to introduce some non-determinism
+                // Apply a relative error of 4ULP to introduce some non-determinism
                 // simulating imprecise implementations and optimizations.
-                // FIXME: temporarily disabled as it breaks std tests.
-                // let res = math::apply_random_float_error_ulp(this, res, 4 /* log2(16) */);
+                let res = math::apply_random_float_error_ulp(this, res, 2 /* log2(4) */);
+                // Clamp the result to the guaranteed range of this function according to the C standard,
+                // if any.
+                let res = math::clamp_float_value(link_name.as_str(), res);
                 let res = this.adjust_nan(res, &[x]);
                 this.write_scalar(res, dest)?;
             }
diff --git a/src/tools/miri/tests/pass/float.rs b/src/tools/miri/tests/pass/float.rs
index fe7316c6680df..5c9e11dcce0ff 100644
--- a/src/tools/miri/tests/pass/float.rs
+++ b/src/tools/miri/tests/pass/float.rs
@@ -1088,6 +1088,8 @@ pub fn libm() {
 
     assert_approx_eq!(1f32.exp_m1(), f32::consts::E - 1.0);
     assert_approx_eq!(1f64.exp_m1(), f64::consts::E - 1.0);
+    assert_approx_eq!(f32::NEG_INFINITY.exp_m1(), -1.0);
+    assert_approx_eq!(f64::NEG_INFINITY.exp_m1(), -1.0);
 
     assert_approx_eq!(10f32.exp2(), 1024f32);
     assert_approx_eq!(50f64.exp2(), 1125899906842624f64);
@@ -1120,9 +1122,10 @@ pub fn libm() {
     assert_approx_eq!(3.0f32.hypot(4.0f32), 5.0f32);
     assert_approx_eq!(3.0f64.hypot(4.0f64), 5.0f64);
 
-    assert_eq!(ldexp(0.65f64, 3i32), 5.2f64);
+    assert_approx_eq!(ldexp(0.65f64, 3i32), 5.2f64);
     assert_eq!(ldexp(1.42, 0xFFFF), f64::INFINITY);
     assert_eq!(ldexp(1.42, -0xFFFF), 0f64);
+    assert_eq!(ldexp(42.0, 0), 42.0);
 
     // Trigonometric functions.
 
@@ -1131,8 +1134,14 @@ pub fn libm() {
     assert_approx_eq!((f64::consts::PI / 2f64).sin(), 1f64);
     assert_approx_eq!(f32::consts::FRAC_PI_6.sin(), 0.5);
     assert_approx_eq!(f64::consts::FRAC_PI_6.sin(), 0.5);
-    assert_approx_eq!(f32::consts::FRAC_PI_4.sin().asin(), f32::consts::FRAC_PI_4);
-    assert_approx_eq!(f64::consts::FRAC_PI_4.sin().asin(), f64::consts::FRAC_PI_4);
+    // Increase error tolerance from 12ULP to 16ULP because of the extra operation.
+    assert_approx_eq!(f32::consts::FRAC_PI_4.sin().asin(), f32::consts::FRAC_PI_4, 16);
+    assert_approx_eq!(f64::consts::FRAC_PI_4.sin().asin(), f64::consts::FRAC_PI_4, 16);
+    assert_biteq(0.0f32.asin(), 0.0f32, "asin(+0) = +0");
+    assert_biteq((-0.0f32).asin(), -0.0, "asin(-0) = -0");
+    assert_biteq(0.0f64.asin(), 0.0, "asin(+0) = +0");
+    assert_biteq((-0.0f64).asin(), -0.0, "asin(-0) = -0");
+
 
     assert_approx_eq!(1.0f32.sinh(), 1.1752012f32);
     assert_approx_eq!(1.0f64.sinh(), 1.1752011936438014f64);
@@ -1159,11 +1168,18 @@ pub fn libm() {
     assert_approx_eq!((f64::consts::PI * 2f64).cos(), 1f64);
     assert_approx_eq!(f32::consts::FRAC_PI_3.cos(), 0.5);
     assert_approx_eq!(f64::consts::FRAC_PI_3.cos(), 0.5);
-    assert_approx_eq!(f32::consts::FRAC_PI_4.cos().acos(), f32::consts::FRAC_PI_4);
-    assert_approx_eq!(f64::consts::FRAC_PI_4.cos().acos(), f64::consts::FRAC_PI_4);
+    // Increase error tolerance from 12ULP to 16ULP because of the extra operation.
+    assert_approx_eq!(f32::consts::FRAC_PI_4.cos().acos(), f32::consts::FRAC_PI_4, 16);
+    assert_approx_eq!(f64::consts::FRAC_PI_4.cos().acos(), f64::consts::FRAC_PI_4, 16);
+    assert_biteq(1.0f32.acos(), 0.0, "acos(1) = 0");
+    assert_biteq(1.0f64.acos(), 0.0, "acos(1) = 0");
 
     assert_approx_eq!(1.0f32.cosh(), 1.54308f32);
     assert_approx_eq!(1.0f64.cosh(), 1.5430806348152437f64);
+    assert_eq!(0.0f32.cosh(), 1.0);
+    assert_eq!(0.0f64.cosh(), 1.0);
+    assert_eq!((-0.0f32).cosh(), 1.0);
+    assert_eq!((-0.0f64).cosh(), 1.0);
     assert_approx_eq!(2.0f32.acosh(), 1.31695789692481670862504634730796844f32);
     assert_approx_eq!(3.0f64.acosh(), 1.76274717403908605046521864995958461f64);
 
@@ -1173,6 +1189,47 @@ pub fn libm() {
     assert_approx_eq!(1.0_f64, 1.0_f64.tan().atan());
     assert_approx_eq!(1.0f32.atan2(2.0f32), 0.46364761f32);
     assert_approx_eq!(1.0f32.atan2(2.0f32), 0.46364761f32);
+    // C standard defines a bunch of fixed outputs for atan2
+    macro_rules! fixed_atan2_cases{
+        ($float_type:ident) => {{
+            use std::$float_type::consts::{PI, FRAC_PI_2, FRAC_PI_4};
+            use $float_type::{INFINITY, NEG_INFINITY};
+
+            // atan2(±0,−0) = ±π.
+            assert_eq!($float_type::atan2(0.0, -0.0), PI, "atan2(0,−0) = π");
+            assert_eq!($float_type::atan2(-0.0, -0.0), -PI, "atan2(-0,−0) = -π");
+
+            // atan2(±0, y) = ±π for y < 0.
+            assert_eq!($float_type::atan2(0.0, -1.0), PI, "atan2(0, y) = π for y < 0.");
+            assert_eq!($float_type::atan2(-0.0, -1.0), -PI, "atan2(-0, y) = -π for y < 0.");
+
+            // atan2(x, ±0) = −π/2 for x < 0.
+            assert_eq!($float_type::atan2(-1.0, 0.0), -FRAC_PI_2, "atan2(x, 0) = −π/2 for x < 0");
+            assert_eq!($float_type::atan2(-1.0, -0.0), -FRAC_PI_2, "atan2(x, -0) = −π/2 for x < 0");
+
+            // atan2(x, ±0) =  π/2 for x > 0.
+            assert_eq!($float_type::atan2(1.0, 0.0), FRAC_PI_2, "atan2(x, 0) =  π/2 for x > 0.");
+            assert_eq!($float_type::atan2(1.0, -0.0), FRAC_PI_2, "atan2(x, -0) =  π/2 for x > 0.");
+
+            // atan2(±x,−∞) = ±π for finite x > 0.
+            assert_eq!($float_type::atan2(1.0, NEG_INFINITY), PI, "atan2(x, −∞) = π for finite x > 0");
+            assert_eq!($float_type::atan2(-1.0, NEG_INFINITY), -PI, "atan2(-x, −∞) = -π for finite x > 0");
+
+            // atan2(±∞, y) returns ±π/2 for finite y.
+            assert_eq!($float_type::atan2(INFINITY, 1.0), FRAC_PI_2, "atan2(+∞, y) returns π/2 for finite y");
+            assert_eq!($float_type::atan2(NEG_INFINITY, 1.0), -FRAC_PI_2, "atan2(-∞, y) returns -π/2 for finite y");
+
+            // atan2(±∞, −∞) = ±3π/4
+            assert_eq!($float_type::atan2(INFINITY, NEG_INFINITY), 3.0 * FRAC_PI_4, "atan2(+∞, −∞) = 3π/4");
+            assert_eq!($float_type::atan2(NEG_INFINITY, NEG_INFINITY), -3.0 * FRAC_PI_4, "atan2(-∞, −∞) = -3π/4");
+
+            // atan2(±∞, +∞) = ±π/4
+            assert_eq!($float_type::atan2(INFINITY, INFINITY), FRAC_PI_4, "atan2(+∞, +∞) = π/4");
+            assert_eq!($float_type::atan2(NEG_INFINITY, INFINITY), -FRAC_PI_4, "atan2(-∞, +∞) = -π/4");
+        }}
+    }
+    fixed_atan2_cases!(f32);
+    fixed_atan2_cases!(f64);
 
     assert_approx_eq!(
         1.0f32.tanh(),
@@ -1182,6 +1239,11 @@ pub fn libm() {
         1.0f64.tanh(),
         (1.0 - f64::consts::E.powi(-2)) / (1.0 + f64::consts::E.powi(-2))
     );
+    assert_eq!(f32::INFINITY.tanh(), 1.0);
+    assert_eq!(f32::NEG_INFINITY.tanh(), -1.0);
+    assert_eq!(f64::INFINITY.tanh(), 1.0);
+    assert_eq!(f64::NEG_INFINITY.tanh(), -1.0);
+
     assert_approx_eq!(0.5f32.atanh(), 0.54930614433405484569762261846126285f32);
     assert_approx_eq!(0.5f64.atanh(), 0.54930614433405484569762261846126285f64);
 
@@ -1202,8 +1264,14 @@ pub fn libm() {
 
     assert_approx_eq!(1.0f32.erf(), 0.84270079294971486934122063508260926f32);
     assert_approx_eq!(1.0f64.erf(), 0.84270079294971486934122063508260926f64);
+    assert_eq!(f32::INFINITY.erf(), 1.0);
+    assert_eq!(f64::INFINITY.erf(), 1.0);
     assert_approx_eq!(1.0f32.erfc(), 0.15729920705028513065877936491739074f32);
     assert_approx_eq!(1.0f64.erfc(), 0.15729920705028513065877936491739074f64);
+    assert_eq!(f32::NEG_INFINITY.erfc(), 2.0);
+    assert_eq!(f64::NEG_INFINITY.erfc(), 2.0);
+    assert_eq!(f32::INFINITY.erfc(), 0.0);
+    assert_eq!(f64::INFINITY.erfc(), 0.0);
 }
 
 fn test_fast() {
@@ -1413,7 +1481,6 @@ fn test_non_determinism() {
     }
     pub fn test_operations_f32(a: f32, b: f32) {
         test_operations_f!(a, b);
-        // FIXME: some are temporarily disabled as it breaks std tests.
         ensure_nondet(|| a.powf(b));
         ensure_nondet(|| a.powi(2));
         ensure_nondet(|| a.log(b));
@@ -1422,35 +1489,34 @@ fn test_non_determinism() {
         ensure_nondet(|| f32::consts::E.ln());
         ensure_nondet(|| 10f32.log10());
         ensure_nondet(|| 8f32.log2());
-        // ensure_nondet(|| 1f32.ln_1p());
-        // ensure_nondet(|| 27.0f32.cbrt());
-        // ensure_nondet(|| 3.0f32.hypot(4.0f32));
+        ensure_nondet(|| 1f32.ln_1p());
+        ensure_nondet(|| 27.0f32.cbrt());
+        ensure_nondet(|| 3.0f32.hypot(4.0f32));
         ensure_nondet(|| 1f32.sin());
         ensure_nondet(|| 1f32.cos());
         // On i686-pc-windows-msvc , these functions are implemented by calling the `f64` version,
         // which means the little rounding errors Miri introduces are discarded by the cast down to
         // `f32`. Just skip the test for them.
-        // if !cfg!(all(target_os = "windows", target_env = "msvc", target_arch = "x86")) {
-        //     ensure_nondet(|| 1.0f32.tan());
-        //     ensure_nondet(|| 1.0f32.asin());
-        //     ensure_nondet(|| 5.0f32.acos());
-        //     ensure_nondet(|| 1.0f32.atan());
-        //     ensure_nondet(|| 1.0f32.atan2(2.0f32));
-        //     ensure_nondet(|| 1.0f32.sinh());
-        //     ensure_nondet(|| 1.0f32.cosh());
-        //     ensure_nondet(|| 1.0f32.tanh());
-        // }
-        // ensure_nondet(|| 1.0f32.asinh());
-        // ensure_nondet(|| 2.0f32.acosh());
-        // ensure_nondet(|| 0.5f32.atanh());
-        // ensure_nondet(|| 5.0f32.gamma());
-        // ensure_nondet(|| 5.0f32.ln_gamma());
-        // ensure_nondet(|| 5.0f32.erf());
-        // ensure_nondet(|| 5.0f32.erfc());
+        if !cfg!(all(target_os = "windows", target_env = "msvc", target_arch = "x86")) {
+            ensure_nondet(|| 1.0f32.tan());
+            ensure_nondet(|| 1.0f32.asin());
+            ensure_nondet(|| 5.0f32.acos());
+            ensure_nondet(|| 1.0f32.atan());
+            ensure_nondet(|| 1.0f32.atan2(2.0f32));
+            ensure_nondet(|| 1.0f32.sinh());
+            ensure_nondet(|| 1.0f32.cosh());
+            ensure_nondet(|| 1.0f32.tanh());
+        }
+        ensure_nondet(|| 1.0f32.asinh());
+        ensure_nondet(|| 2.0f32.acosh());
+        ensure_nondet(|| 0.5f32.atanh());
+        ensure_nondet(|| 5.0f32.gamma());
+        ensure_nondet(|| 5.0f32.ln_gamma());
+        ensure_nondet(|| 5.0f32.erf());
+        ensure_nondet(|| 5.0f32.erfc());
     }
     pub fn test_operations_f64(a: f64, b: f64) {
         test_operations_f!(a, b);
-        // FIXME: some are temporarily disabled as it breaks std tests.
         ensure_nondet(|| a.powf(b));
         ensure_nondet(|| a.powi(2));
         ensure_nondet(|| a.log(b));
@@ -1459,26 +1525,26 @@ fn test_non_determinism() {
         ensure_nondet(|| 3f64.ln());
         ensure_nondet(|| f64::consts::E.log10());
         ensure_nondet(|| f64::consts::E.log2());
-        // ensure_nondet(|| 1f64.ln_1p());
-        // ensure_nondet(|| 27.0f64.cbrt());
-        // ensure_nondet(|| 3.0f64.hypot(4.0f64));
+        ensure_nondet(|| 1f64.ln_1p());
+        ensure_nondet(|| 27.0f64.cbrt());
+        ensure_nondet(|| 3.0f64.hypot(4.0f64));
         ensure_nondet(|| 1f64.sin());
         ensure_nondet(|| 1f64.cos());
-        // ensure_nondet(|| 1.0f64.tan());
-        // ensure_nondet(|| 1.0f64.asin());
-        // ensure_nondet(|| 5.0f64.acos());
-        // ensure_nondet(|| 1.0f64.atan());
-        // ensure_nondet(|| 1.0f64.atan2(2.0f64));
-        // ensure_nondet(|| 1.0f64.sinh());
-        // ensure_nondet(|| 1.0f64.cosh());
-        // ensure_nondet(|| 1.0f64.tanh());
-        // ensure_nondet(|| 1.0f64.asinh());
-        // ensure_nondet(|| 3.0f64.acosh());
-        // ensure_nondet(|| 0.5f64.atanh());
-        // ensure_nondet(|| 5.0f64.gamma());
-        // ensure_nondet(|| 5.0f64.ln_gamma());
-        // ensure_nondet(|| 5.0f64.erf());
-        // ensure_nondet(|| 5.0f64.erfc());
+        ensure_nondet(|| 1.0f64.tan());
+        ensure_nondet(|| 1.0f64.asin());
+        ensure_nondet(|| 5.0f64.acos());
+        ensure_nondet(|| 1.0f64.atan());
+        ensure_nondet(|| 1.0f64.atan2(2.0f64));
+        ensure_nondet(|| 1.0f64.sinh());
+        ensure_nondet(|| 1.0f64.cosh());
+        ensure_nondet(|| 1.0f64.tanh());
+        ensure_nondet(|| 1.0f64.asinh());
+        ensure_nondet(|| 3.0f64.acosh());
+        ensure_nondet(|| 0.5f64.atanh());
+        ensure_nondet(|| 5.0f64.gamma());
+        ensure_nondet(|| 5.0f64.ln_gamma());
+        ensure_nondet(|| 5.0f64.erf());
+        ensure_nondet(|| 5.0f64.erfc());
     }
     pub fn test_operations_f128(a: f128, b: f128) {
         test_operations_f!(a, b);
diff --git a/tests/codegen/gpu_offload/gpu_host.rs b/tests/codegen/gpu_offload/gpu_host.rs
new file mode 100644
index 0000000000000..513e27426bc0e
--- /dev/null
+++ b/tests/codegen/gpu_offload/gpu_host.rs
@@ -0,0 +1,80 @@
+//@ compile-flags: -Zoffload=Enable -Zunstable-options -C opt-level=3  -Clto=fat
+//@ no-prefer-dynamic
+//@ needs-enzyme
+
+// This test is verifying that we generate __tgt_target_data_*_mapper before and after a call to the
+// kernel_1. Better documentation to what each global or variable means is available in the gpu
+// offlaod code, or the LLVM offload documentation. This code does not launch any GPU kernels yet,
+// and will be rewritten once a proper offload frontend has landed.
+//
+// We currently only handle memory transfer for specific calls to functions named `kernel_{num}`,
+// when inside of a function called main. This, too, is a temporary workaround for not having a
+// frontend.
+
+#![no_main]
+
+#[unsafe(no_mangle)]
+fn main() {
+    let mut x = [3.0; 256];
+    kernel_1(&mut x);
+    core::hint::black_box(&x);
+}
+
+// CHECK: %struct.__tgt_offload_entry = type { i64, i16, i16, i32, ptr, ptr, i64, i64, ptr }
+// CHECK: %struct.__tgt_kernel_arguments = type { i32, i32, ptr, ptr, ptr, ptr, ptr, ptr, i64, i64, [3 x i32], [3 x i32], i32 }
+// CHECK: %struct.ident_t = type { i32, i32, i32, i32, ptr }
+// CHECK: %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
+
+// CHECK: @.offload_sizes.1 = private unnamed_addr constant [1 x i64] [i64 1024]
+// CHECK: @.offload_maptypes.1 = private unnamed_addr constant [1 x i64] [i64 3]
+// CHECK: @.kernel_1.region_id = weak unnamed_addr constant i8 0
+// CHECK: @.offloading.entry_name.1 = internal unnamed_addr constant [9 x i8] c"kernel_1\00", section ".llvm.rodata.offloading", align 1
+// CHECK: @.offloading.entry.kernel_1 = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @.kernel_1.region_id, ptr @.offloading.entry_name.1, i64 0, i64 0, ptr null }, section ".omp_offloading_entries", align 1
+// CHECK: @my_struct_global2 = external global %struct.__tgt_kernel_arguments
+// CHECK: @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+// CHECK: @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8
+
+// CHECK:  Function Attrs:
+// CHECK-NEXT: define{{( dso_local)?}} void @main()
+// CHECK-NEXT: start:
+// CHECK-NEXT:   %0 = alloca [8 x i8], align 8
+// CHECK-NEXT:   %x = alloca [1024 x i8], align 16
+// CHECK-NEXT:   %EmptyDesc = alloca %struct.__tgt_bin_desc, align 8
+// CHECK-NEXT:   %.offload_baseptrs = alloca [1 x ptr], align 8
+// CHECK-NEXT:   %.offload_ptrs = alloca [1 x ptr], align 8
+// CHECK-NEXT:   %.offload_sizes = alloca [1 x i64], align 8
+// CHECK-NEXT:   %x.addr = alloca ptr, align 8
+// CHECK-NEXT:   store ptr %x, ptr %x.addr, align 8
+// CHECK-NEXT:   %1 = load ptr, ptr %x.addr, align 8
+// CHECK-NEXT:   %2 = getelementptr inbounds float, ptr %1, i32 0
+// CHECK:        call void @llvm.memset.p0.i64(ptr align 8 %EmptyDesc, i8 0, i64 32, i1 false)
+// CHECK-NEXT:   call void @__tgt_register_lib(ptr %EmptyDesc)
+// CHECK-NEXT:   call void @__tgt_init_all_rtls()
+// CHECK-NEXT:   %3 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
+// CHECK-NEXT:   store ptr %1, ptr %3, align 8
+// CHECK-NEXT:   %4 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0
+// CHECK-NEXT:   store ptr %2, ptr %4, align 8
+// CHECK-NEXT:   %5 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0
+// CHECK-NEXT:   store i64 1024, ptr %5, align 8
+// CHECK-NEXT:   %6 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
+// CHECK-NEXT:   %7 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0
+// CHECK-NEXT:   %8 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0
+// CHECK-NEXT:   call void @__tgt_target_data_begin_mapper(ptr @1, i64 -1, i32 1, ptr %6, ptr %7, ptr %8, ptr @.offload_maptypes.1, ptr null, ptr null)
+// CHECK-NEXT:   call void @kernel_1(ptr noalias noundef nonnull align 4 dereferenceable(1024) %x)
+// CHECK-NEXT:   %9 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
+// CHECK-NEXT:   %10 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0
+// CHECK-NEXT:   %11 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0
+// CHECK-NEXT:   call void @__tgt_target_data_end_mapper(ptr @1, i64 -1, i32 1, ptr %9, ptr %10, ptr %11, ptr @.offload_maptypes.1, ptr null, ptr null)
+// CHECK-NEXT:   call void @__tgt_unregister_lib(ptr %EmptyDesc)
+// CHECK:        store ptr %x, ptr %0, align 8
+// CHECK-NEXT:   call void asm sideeffect "", "r,~{memory}"(ptr nonnull %0)
+// CHECK:        ret void
+// CHECK-NEXT: }
+
+#[unsafe(no_mangle)]
+#[inline(never)]
+pub fn kernel_1(x: &mut [f32; 256]) {
+    for i in 0..256 {
+        x[i] = 21.0;
+    }
+}
diff --git a/tests/ui/print-request/supported-crate-types.rs b/tests/ui/print-request/supported-crate-types.rs
index c8b4c0c1a4164..50185a231eeb0 100644
--- a/tests/ui/print-request/supported-crate-types.rs
+++ b/tests/ui/print-request/supported-crate-types.rs
@@ -8,6 +8,11 @@
 
 //@ check-pass
 
+// FIXME: musl targets are currently statically linked, but running on a musl host
+// requires dynamic linkage, which in turn changes the supported crate types for
+// x86_64-unknown-linux-musl.
+//@ ignore-musl
+
 //@ revisions: wasm musl linux
 
 //@[wasm] compile-flags: --target=wasm32-unknown-unknown --print=supported-crate-types -Zunstable-options
diff --git a/tests/ui/resolve/underscore-bindings-disambiguators.rs b/tests/ui/resolve/underscore-bindings-disambiguators.rs
new file mode 100644
index 0000000000000..8c89b39f8590c
--- /dev/null
+++ b/tests/ui/resolve/underscore-bindings-disambiguators.rs
@@ -0,0 +1,27 @@
+// Regression test for issue #144168 where some `_` bindings were incorrectly only allowed once per
+// module, failing with "error[E0428]: the name `_` is defined multiple times".
+
+// This weird/complex setup is reduced from `zerocopy-0.8.25` where the issue was encountered.
+
+#![crate_type = "lib"]
+
+macro_rules! impl_for_transmute_from {
+    () => {
+        const _: () = {};
+    };
+}
+
+mod impls {
+    use super::*;
+    impl_for_transmute_from!();
+    impl_for_transmute_from!();
+    const _: () = todo!(); //~ ERROR: evaluation panicked
+    const _: () = todo!(); //~ ERROR: evaluation panicked
+    const _: () = todo!(); //~ ERROR: evaluation panicked
+    const _: () = todo!(); //~ ERROR: evaluation panicked
+    const _: () = todo!(); //~ ERROR: evaluation panicked
+}
+use X as Y; //~ ERROR: unresolved import
+use Z as W; //~ ERROR: unresolved import
+
+const _: () = todo!(); //~ ERROR: evaluation panicked
diff --git a/tests/ui/resolve/underscore-bindings-disambiguators.stderr b/tests/ui/resolve/underscore-bindings-disambiguators.stderr
new file mode 100644
index 0000000000000..0c8081a0c5647
--- /dev/null
+++ b/tests/ui/resolve/underscore-bindings-disambiguators.stderr
@@ -0,0 +1,70 @@
+error[E0432]: unresolved import `X`
+  --> $DIR/underscore-bindings-disambiguators.rs:24:5
+   |
+LL | use X as Y;
+   |     -^^^^^
+   |     |
+   |     no `X` in the root
+   |     help: a similar name exists in the module: `_`
+
+error[E0432]: unresolved import `Z`
+  --> $DIR/underscore-bindings-disambiguators.rs:25:5
+   |
+LL | use Z as W;
+   |     -^^^^^
+   |     |
+   |     no `Z` in the root
+   |     help: a similar name exists in the module: `_`
+
+error[E0080]: evaluation panicked: not yet implemented
+  --> $DIR/underscore-bindings-disambiguators.rs:18:19
+   |
+LL |     const _: () = todo!();
+   |                   ^^^^^^^ evaluation of `impls::_` failed here
+   |
+   = note: this error originates in the macro `todo` (in Nightly builds, run with -Z macro-backtrace for more info)
+
+error[E0080]: evaluation panicked: not yet implemented
+  --> $DIR/underscore-bindings-disambiguators.rs:19:19
+   |
+LL |     const _: () = todo!();
+   |                   ^^^^^^^ evaluation of `impls::_` failed here
+   |
+   = note: this error originates in the macro `todo` (in Nightly builds, run with -Z macro-backtrace for more info)
+
+error[E0080]: evaluation panicked: not yet implemented
+  --> $DIR/underscore-bindings-disambiguators.rs:20:19
+   |
+LL |     const _: () = todo!();
+   |                   ^^^^^^^ evaluation of `impls::_` failed here
+   |
+   = note: this error originates in the macro `todo` (in Nightly builds, run with -Z macro-backtrace for more info)
+
+error[E0080]: evaluation panicked: not yet implemented
+  --> $DIR/underscore-bindings-disambiguators.rs:21:19
+   |
+LL |     const _: () = todo!();
+   |                   ^^^^^^^ evaluation of `impls::_` failed here
+   |
+   = note: this error originates in the macro `todo` (in Nightly builds, run with -Z macro-backtrace for more info)
+
+error[E0080]: evaluation panicked: not yet implemented
+  --> $DIR/underscore-bindings-disambiguators.rs:22:19
+   |
+LL |     const _: () = todo!();
+   |                   ^^^^^^^ evaluation of `impls::_` failed here
+   |
+   = note: this error originates in the macro `todo` (in Nightly builds, run with -Z macro-backtrace for more info)
+
+error[E0080]: evaluation panicked: not yet implemented
+  --> $DIR/underscore-bindings-disambiguators.rs:27:15
+   |
+LL | const _: () = todo!();
+   |               ^^^^^^^ evaluation of `_` failed here
+   |
+   = note: this error originates in the macro `todo` (in Nightly builds, run with -Z macro-backtrace for more info)
+
+error: aborting due to 8 previous errors
+
+Some errors have detailed explanations: E0080, E0432.
+For more information about an error, try `rustc --explain E0080`.
diff --git a/tests/ui/uninhabited/auxiliary/staged-api.rs b/tests/ui/uninhabited/auxiliary/staged-api.rs
index 342ecf020ea5e..925bb1e0c65a0 100644
--- a/tests/ui/uninhabited/auxiliary/staged-api.rs
+++ b/tests/ui/uninhabited/auxiliary/staged-api.rs
@@ -6,3 +6,9 @@ pub struct Foo<T> {
     #[unstable(feature = "unstable", issue = "none")]
     pub field: T,
 }
+
+#[unstable(feature = "my_coro_state", issue = "none")]
+pub enum MyCoroutineState<Y, R> {
+    Yielded(Y),
+    Complete(R),
+}
diff --git a/tests/ui/uninhabited/uninhabited-unstable-field.current.stderr b/tests/ui/uninhabited/uninhabited-unstable-field.current.stderr
deleted file mode 100644
index 9e0feb4c473f8..0000000000000
--- a/tests/ui/uninhabited/uninhabited-unstable-field.current.stderr
+++ /dev/null
@@ -1,22 +0,0 @@
-error[E0004]: non-exhaustive patterns: type `Foo<Void>` is non-empty
-  --> $DIR/uninhabited-unstable-field.rs:13:11
-   |
-LL |     match x {}
-   |           ^
-   |
-note: `Foo<Void>` defined here
-  --> $DIR/auxiliary/staged-api.rs:5:1
-   |
-LL | pub struct Foo<T> {
-   | ^^^^^^^^^^^^^^^^^
-   = note: the matched value is of type `Foo<Void>`
-help: ensure that all possible cases are being handled by adding a match arm with a wildcard pattern as shown
-   |
-LL ~     match x {
-LL +         _ => todo!(),
-LL ~     }
-   |
-
-error: aborting due to 1 previous error
-
-For more information about this error, try `rustc --explain E0004`.
diff --git a/tests/ui/uninhabited/uninhabited-unstable-field.exhaustive.stderr b/tests/ui/uninhabited/uninhabited-unstable-field.exhaustive.stderr
deleted file mode 100644
index 9e0feb4c473f8..0000000000000
--- a/tests/ui/uninhabited/uninhabited-unstable-field.exhaustive.stderr
+++ /dev/null
@@ -1,22 +0,0 @@
-error[E0004]: non-exhaustive patterns: type `Foo<Void>` is non-empty
-  --> $DIR/uninhabited-unstable-field.rs:13:11
-   |
-LL |     match x {}
-   |           ^
-   |
-note: `Foo<Void>` defined here
-  --> $DIR/auxiliary/staged-api.rs:5:1
-   |
-LL | pub struct Foo<T> {
-   | ^^^^^^^^^^^^^^^^^
-   = note: the matched value is of type `Foo<Void>`
-help: ensure that all possible cases are being handled by adding a match arm with a wildcard pattern as shown
-   |
-LL ~     match x {
-LL +         _ => todo!(),
-LL ~     }
-   |
-
-error: aborting due to 1 previous error
-
-For more information about this error, try `rustc --explain E0004`.
diff --git a/tests/ui/uninhabited/uninhabited-unstable-field.rs b/tests/ui/uninhabited/uninhabited-unstable-field.rs
index 9b507c518ab64..321b864aa27de 100644
--- a/tests/ui/uninhabited/uninhabited-unstable-field.rs
+++ b/tests/ui/uninhabited/uninhabited-unstable-field.rs
@@ -1,29 +1,45 @@
 //@ aux-build: staged-api.rs
-//@ revisions: current exhaustive
-
+//! The field of `Pin` used to be public, which would cause `Pin<Void>` to be uninhabited. To remedy
+//! this, we temporarily made it so unstable fields are always considered inhabited. This has now
+//! been reverted, and this file ensures that we don't special-case unstable fields wrt
+//! inhabitedness anymore.
 #![feature(exhaustive_patterns)]
+#![feature(never_type)]
+#![feature(my_coro_state)] // Custom feature from `staged-api.rs`
+#![deny(unreachable_patterns)]
 
 extern crate staged_api;
 
-use staged_api::Foo;
+use staged_api::{Foo, MyCoroutineState};
 
 enum Void {}
 
 fn demo(x: Foo<Void>) {
     match x {}
-    //~^ ERROR non-exhaustive patterns
 }
 
-// Ensure that the pattern is not considered unreachable.
+// Ensure that the pattern is considered unreachable.
 fn demo2(x: Foo<Void>) {
     match x {
-        Foo { .. } => {}
+        Foo { .. } => {} //~ ERROR unreachable
     }
 }
 
 // Same as above, but for wildcard.
 fn demo3(x: Foo<Void>) {
-    match x { _ => {} }
+    match x {
+        _ => {} //~ ERROR unreachable
+    }
+}
+
+fn unstable_enum(x: MyCoroutineState<i32, !>) {
+    match x {
+        MyCoroutineState::Yielded(_) => {}
+    }
+    match x {
+        MyCoroutineState::Yielded(_) => {}
+        MyCoroutineState::Complete(_) => {} //~ ERROR unreachable
+    }
 }
 
 fn main() {}
diff --git a/tests/ui/uninhabited/uninhabited-unstable-field.stderr b/tests/ui/uninhabited/uninhabited-unstable-field.stderr
new file mode 100644
index 0000000000000..a0c9f9366a6b5
--- /dev/null
+++ b/tests/ui/uninhabited/uninhabited-unstable-field.stderr
@@ -0,0 +1,40 @@
+error: unreachable pattern
+  --> $DIR/uninhabited-unstable-field.rs:24:9
+   |
+LL |         Foo { .. } => {}
+   |         ^^^^^^^^^^------
+   |         |
+   |         matches no values because `Foo<Void>` is uninhabited
+   |         help: remove the match arm
+   |
+   = note: to learn more about uninhabited types, see https://doc.rust-lang.org/nomicon/exotic-sizes.html#empty-types
+note: the lint level is defined here
+  --> $DIR/uninhabited-unstable-field.rs:9:9
+   |
+LL | #![deny(unreachable_patterns)]
+   |         ^^^^^^^^^^^^^^^^^^^^
+
+error: unreachable pattern
+  --> $DIR/uninhabited-unstable-field.rs:31:9
+   |
+LL |         _ => {}
+   |         ^------
+   |         |
+   |         matches no values because `Foo<Void>` is uninhabited
+   |         help: remove the match arm
+   |
+   = note: to learn more about uninhabited types, see https://doc.rust-lang.org/nomicon/exotic-sizes.html#empty-types
+
+error: unreachable pattern
+  --> $DIR/uninhabited-unstable-field.rs:41:9
+   |
+LL |         MyCoroutineState::Complete(_) => {}
+   |         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^------
+   |         |
+   |         matches no values because `!` is uninhabited
+   |         help: remove the match arm
+   |
+   = note: to learn more about uninhabited types, see https://doc.rust-lang.org/nomicon/exotic-sizes.html#empty-types
+
+error: aborting due to 3 previous errors
+