see below, this comment irrelevant now
unlike X and W registers, float registers cluster smaller sizes into larger sizes instead of inheriting the largest register's index+stride: https://developer.arm.com/documentation/dht0002/a/Introducing-NEON/NEON-architecture-overview/NEON-registers
The current impl seems to treat them the same as X and W, ie it assumes that S1 is located at Q1, S2 is at Q2, etc.
from source/lib/hook/nx64/inline_impl.hpp, see ISSUE lines:
union FloatRegister {
Vec128 V128;
f128 Q;
Vec64 V64;
f64 D;
f32 S; // ISSUE: FloatRegister should hold 4 of S?
f16 H;
/* TODO: 8-bit floats? That's a thing? */
// f8 B;
};
struct FloatRegisters {
FloatRegister m_Fr[32];
};
namespace impl {
/* This type is only unioned with GpRegisters, so this is valid. */
struct GpRegisterAccessorImpl {
GpRegisters& Get() {
return *reinterpret_cast<GpRegisters*>(this);
}
};
struct FloatRegisterAccessorImpl {
FloatRegisters& Get() {
return *reinterpret_cast<FloatRegisters*>(this);
}
};
#define ACCESSOR(inherit, name, type, length, arr, member) \
struct name##RegisterAccessor : public inherit##Impl { \
type& operator[](int index) { \
EXL_ASSERT(0 <= index && index < length, "Register index must not be out of bounds!"); \
return Get().arr[index].member; \
// ^ ISSUE: S should use arr[index >> 1], then something like member[(index & 1) ^ 1]? \
} \
}
ACCESSOR(GpRegisterAccessor, Gp64, u64, 31, m_Gp, X);
ACCESSOR(GpRegisterAccessor, Gp32, u32, 31, m_Gp, W);
ACCESSOR(FloatRegisterAccessor, Vector128, Vec128, 32, m_Fr, V128);
ACCESSOR(FloatRegisterAccessor, Vector64, Vec64, 32, m_Fr, V64);
ACCESSOR(FloatRegisterAccessor, Float128, f128, 32, m_Fr, Q);
ACCESSOR(FloatRegisterAccessor, Float64, f64, 32, m_Fr, D);
ACCESSOR(FloatRegisterAccessor, Float32, f32, 32, m_Fr, S);
ACCESSOR(FloatRegisterAccessor, Float16, f16, 32, m_Fr, H);
/* TODO: 8-bit floats? That's a thing? */
//ACCESSOR(FloatRegisterAccessor, Float8, f8, 32, m_Fr, B);
#undef ACCESSOR
}
thanks dt for explaining this and providing the docs, i didn't really do anything, also for a prior solution (somewhat untested):
https://github.com/dt-12345/MSFix/blob/main/source/lib/hook/nx64/inline_impl.hpp#L62-L72
/*
index has to be adjusted because of how the registesr are layed out
|----------Q0----------|
|----D1----||----D0----|
|-S1-||-S0-||-S3-||-S2-|
*/
struct FpRegisterAccessor32 : public FpRegisterAccessorImpl {
float& operator[](int index) {
return Get().m_Fp[index >> 1].S[(index & 1) ^ 1];
}
};
see below, this comment irrelevant now
unlike X and W registers, float registers cluster smaller sizes into larger sizes instead of inheriting the largest register's index+stride: https://developer.arm.com/documentation/dht0002/a/Introducing-NEON/NEON-architecture-overview/NEON-registersThe current impl seems to treat them the same as X and W, ie it assumes that S1 is located at Q1, S2 is at Q2, etc.from source/lib/hook/nx64/inline_impl.hpp, see ISSUE lines:
thanks dt for explaining this and providing the docs, i didn't really do anything, also for a prior solution (somewhat untested):
https://github.com/dt-12345/MSFix/blob/main/source/lib/hook/nx64/inline_impl.hpp#L62-L72