Revert "Revert "Use nbdd0121 suggestion for reducing the perf impact""

Urgau · Urgau · commit 0c1451cd57c0 · 2022-07-05T11:44:16.000+02:00
This reverts commit e136c3a9348200c261b9b3c1c50a2f6f6a68b4bd.
diff --git a/compiler/rustc_middle/src/ty/layout.rs b/compiler/rustc_middle/src/ty/layout.rs
@@ -14,7 +14,7 @@ use rustc_session::{config::OptLevel, DataTypeKind, FieldInfo, SizeKind, Variant
 use rustc_span::symbol::Symbol;
 use rustc_span::{Span, DUMMY_SP};
 use rustc_target::abi::call::{
-    ArgAbi, ArgAttribute, ArgAttributes, ArgExtension, Conv, FnAbi, PassMode, /* Reg, RegKind, */
+    ArgAbi, ArgAttribute, ArgAttributes, ArgExtension, Conv, FnAbi, PassMode, Reg, RegKind,
 };
 use rustc_target::abi::*;
 use rustc_target::spec::{abi::Abi as SpecAbi, HasTargetSpec, PanicStrategy, Target};
@@ -3340,16 +3340,16 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
         Ok(self.tcx.arena.alloc(fn_abi))
     }
 
-    // /// Small heuristic for determining if layout has any float primitive
-    // fn has_all_float(&self, layout: &'_ TyAndLayout<'tcx>) -> bool {
-    //     match layout.abi {
-    //         Abi::Uninhabited | Abi::Vector { .. } => false,
-    //         Abi::Scalar(scalar) => matches!(scalar.primitive(), Primitive::F32 | Primitive::F64),
-    //         Abi::ScalarPair(..) | Abi::Aggregate { .. } => {
-    //             (0..layout.fields.count()).all(|i| self.has_all_float(&layout.field(self, i)))
-    //         }
-    //     }
-    // }
+    /// Small heuristic for determining if layout is made up solely of float primitives (`f32`/`f64`)
+    fn has_all_float(&self, layout: &'_ TyAndLayout<'tcx>) -> bool {
+        match layout.abi {
+            Abi::Uninhabited | Abi::Vector { .. } => false,
+            Abi::Scalar(scalar) => matches!(scalar.primitive(), Primitive::F32 | Primitive::F64),
+            Abi::ScalarPair(..) | Abi::Aggregate { .. } => {
+                (0..layout.fields.count()).all(|i| self.has_all_float(&layout.field(self, i)))
+            }
+        }
+    }
 
     fn fn_abi_adjust_for_abi(
         &self,
@@ -3375,29 +3375,27 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
                         // Pass and return structures up to 2 pointers in size by value,
                         // matching `ScalarPair`. LLVM will usually pass these in 2 registers
                         // which is more efficient than by-ref.
-                        let max_by_val_size = Pointer.size(self) * 2;
+                        let ptr_size = Pointer.size(self);
+                        let max_by_val_size = ptr_size * 2;
                         let size = arg.layout.size;
 
                         if arg.layout.is_unsized() || size > max_by_val_size {
                             arg.make_indirect();
-                        // } else if self.has_all_float(&arg.layout) {
-                        //     // We don't want to aggregate floats as an aggregates of Integer
-                        //     // because this will hurt the generated assembly (#93490)
-                        //     //
-                        //     // As an optimization we want to pass homogeneous aggregate of floats
-                        //     // greater than pointer size as indirect
-                        //     if size > Pointer.size(self) {
-                        //         arg.make_indirect();
-                        //     }
-                        // } else {
-                        //     // We want to pass small aggregates as immediates, but using
-                        //     // a LLVM aggregate type for this leads to bad optimizations,
-                        //     // so we pick an appropriately sized integer type instead.
-                        //     //
-                        //     // NOTE: This is sub-optimal because in the case of (f32, f32, u32, u32)
-                        //     // we could do ([f32; 2], u64) which is better but this is the best we
-                        //     // can do right now.
-                        //     arg.cast_to(Reg { kind: RegKind::Integer, size });
+                        } else if size > ptr_size && self.has_all_float(&arg.layout) {
+                            // We don't want to aggregate floats as an aggregate of Integer
+                            // because this will hurt the generated assembly (#93490), but as an
+                            // optimization we want to pass homogeneous aggregates of floats
+                            // greater than pointer size as indirect.
+                            arg.make_indirect();
+                        } else {
+                            // We want to pass small aggregates as immediates, but using
+                            // a LLVM aggregate type for this leads to bad optimizations,
+                            // so we pick an appropriately sized integer type instead.
+                            //
+                            // NOTE: This is sub-optimal because in the case of (f32, f32, u32, u32)
+                            // we could do ([f32; 2], u64) which is better but this is the best we
+                            // can do right now.
+                            arg.cast_to(Reg { kind: RegKind::Integer, size });
                         }
                     }
 
diff --git a/src/test/assembly/x86-64-homogenous-floats.rs b/src/test/assembly/x86-64-homogenous-floats.rs
@@ -15,12 +15,15 @@ pub fn sum_f32(a: f32, b: f32) -> f32 {
     a + b
 }
 
-// CHECK-LABEL: sum_f32x2:
-// CHECK:      addss xmm{{[0-9]}}, xmm{{[0-9]}}
-// CHECK-NEXT: addss xmm{{[0-9]}}, xmm{{[0-9]}}
+// CHECK-LABEL: sum_f64x2:
+// CHECK:      mov     rax, [[PTR_IN:.*]]
+// CHECK-NEXT: movupd  [[XMMA:xmm[0-9]]], xmmword ptr [rsi]
+// CHECK-NEXT: movupd  [[XMMB:xmm[0-9]]], xmmword ptr [rdx]
+// CHECK-NEXT: addpd   [[XMMB]], [[XMMA]]
+// CHECK-NEXT: movupd  xmmword ptr {{\[}}[[PTR_IN]]{{\]}}, [[XMMB]]
 // CHECK-NEXT: ret
 #[no_mangle]
-pub fn sum_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
+pub fn sum_f64x2(a: [f64; 2], b: [f64; 2]) -> [f64; 2] {
     [
         a[0] + b[0],
         a[1] + b[1],
diff --git a/src/test/codegen/homogeneous-floats.rs b/src/test/codegen/homogeneous-floats.rs
@@ -13,7 +13,7 @@ pub struct Foo {
     bar4: f32,
 }
 
-// CHECK: define [2 x float] @array_f32x2([2 x float] %0, [2 x float] %1)
+// CHECK: define i64 @array_f32x2(i64 %0, i64 %1)
 #[no_mangle]
 pub fn array_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
     todo!()

Original file line number	Diff line number	Diff line change
`@@ -13,7 +13,7 @@ pub struct Foo {`
`13`	`13`	`bar4: f32,`
`14`	`14`	`}`
`15`	`15`
`16`		`-// CHECK: define [2 x float] @array_f32x2([2 x float] %0, [2 x float] %1)`
	`16`	`+// CHECK: define i64 @array_f32x2(i64 %0, i64 %1)`
`17`	`17`	`#[no_mangle]`
`18`	`18`	`pub fn array_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {`
`19`	`19`	`todo!()`