diff --git a/src/counter_analysis_toolkit/Makefile b/src/counter_analysis_toolkit/Makefile index 31f1a4405..09a16fa1c 100644 --- a/src/counter_analysis_toolkit/Makefile +++ b/src/counter_analysis_toolkit/Makefile @@ -54,10 +54,10 @@ ifeq ($(ARCH),POWER) VEC_ALL=$(VEC) -DPOWER endif ifeq ($(ARCH),ARM) - FLOP+=-march=armv8.2-a+fp16 -DARM + FLOP+=-march=armv8.2-a+fp16+sve -DARM VECSRC=vec_fma_hp.o vec_fma_sp.o vec_fma_dp.o vec_nonfma_hp.o vec_nonfma_sp.o vec_nonfma_dp.o - VEC=-march=armv8.2-a+fp16 -O0 -DARM - VEC_FMA=-march=armv8.2-a+fp16 -O0 -DARM + VEC=-march=armv8.2-a+fp16+sve -O0 -DARM + VEC_FMA=-march=armv8.2-a+fp16+sve -O0 -DARM VEC_ALL=$(VEC) -O0 -DARM endif diff --git a/src/counter_analysis_toolkit/cat_arch.h b/src/counter_analysis_toolkit/cat_arch.h index db6b62e1e..6cb5f6691 100644 --- a/src/counter_analysis_toolkit/cat_arch.h +++ b/src/counter_analysis_toolkit/cat_arch.h @@ -3,29 +3,29 @@ typedef unsigned long long uint64; #if defined(X86) -void test_hp_x86_128B_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); -void test_sp_x86_128B_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); -void test_dp_x86_128B_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); +void test_hp_x86_128B_VEC( int instr_per_loop, int EventSet, FILE *fp ); +void test_sp_x86_128B_VEC( int instr_per_loop, int EventSet, FILE *fp ); +void test_dp_x86_128B_VEC( int instr_per_loop, int EventSet, FILE *fp ); -void test_hp_x86_256B_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); -void test_sp_x86_256B_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); -void test_dp_x86_256B_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); +void test_hp_x86_256B_VEC( int instr_per_loop, int EventSet, FILE *fp ); +void test_sp_x86_256B_VEC( int instr_per_loop, int EventSet, FILE *fp ); +void test_dp_x86_256B_VEC( int instr_per_loop, int EventSet, FILE *fp ); -void test_hp_x86_512B_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); -void test_sp_x86_512B_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); -void test_dp_x86_512B_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); +void test_hp_x86_512B_VEC( int instr_per_loop, int EventSet, FILE *fp ); +void test_sp_x86_512B_VEC( int instr_per_loop, int EventSet, FILE *fp ); +void test_dp_x86_512B_VEC( int instr_per_loop, int EventSet, FILE *fp ); -void test_hp_x86_128B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); -void test_sp_x86_128B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); -void test_dp_x86_128B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); +void test_hp_x86_128B_VEC_FMA( int instr_per_loop, int EventSet, FILE *fp ); +void test_sp_x86_128B_VEC_FMA( int instr_per_loop, int EventSet, FILE *fp ); +void test_dp_x86_128B_VEC_FMA( int instr_per_loop, int EventSet, FILE *fp ); -void test_hp_x86_256B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); -void test_sp_x86_256B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); -void test_dp_x86_256B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); +void test_hp_x86_256B_VEC_FMA( int instr_per_loop, int EventSet, FILE *fp ); +void test_sp_x86_256B_VEC_FMA( int instr_per_loop, int EventSet, FILE *fp ); +void test_dp_x86_256B_VEC_FMA( int instr_per_loop, int EventSet, FILE *fp ); -void test_hp_x86_512B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); -void test_sp_x86_512B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); -void test_dp_x86_512B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); +void test_hp_x86_512B_VEC_FMA( int instr_per_loop, int EventSet, FILE *fp ); +void test_sp_x86_512B_VEC_FMA( int instr_per_loop, int EventSet, FILE *fp ); +void test_dp_x86_512B_VEC_FMA( int instr_per_loop, int EventSet, FILE *fp ); #include @@ -86,18 +86,46 @@ typedef __m256d DP_VEC_TYPE; #endif #elif defined(ARM) -void test_hp_arm_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); -void test_sp_arm_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); -void test_dp_arm_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); -void test_hp_arm_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); -void test_sp_arm_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); -void test_dp_arm_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); +void test_hp_arm_VEC( int instr_per_loop, int EventSet, FILE *fp ); +void test_sp_arm_VEC( int instr_per_loop, int EventSet, FILE *fp ); +void test_dp_arm_VEC( int instr_per_loop, int EventSet, FILE *fp ); +void test_hp_arm_VEC_FMA( int instr_per_loop, int EventSet, FILE *fp ); +void test_sp_arm_VEC_FMA( int instr_per_loop, int EventSet, FILE *fp ); +void test_dp_arm_VEC_FMA( int instr_per_loop, int EventSet, FILE *fp ); #include +#include typedef __fp16 half; +typedef __fp16 HP_SCALAR_TYPE; typedef float SP_SCALAR_TYPE; typedef double DP_SCALAR_TYPE; + +#define CAT_DEV_SVE +#if defined(CAT_DEV_SVE) +typedef svfloat16_t HP_VEC_TYPE; +typedef svfloat32_t SP_VEC_TYPE; +typedef svfloat64_t DP_VEC_TYPE; + +#define SET_VEC_PH(_I_) (HP_VEC_TYPE)svdup_n_f16( _I_ ) +#define SET_VEC_PS(_I_) (SP_VEC_TYPE)svdup_n_f32( _I_ ); +#define SET_VEC_PD(_I_) (DP_VEC_TYPE)svdup_n_f64( _I_ ); + +#define ADD_VEC_PH(_I_,_J_) (HP_VEC_TYPE)svadd_f16_m( pg, _I_ , _J_ ); +#define ADD_VEC_PS(_I_,_J_) (SP_VEC_TYPE)svadd_f32_m( pg, _I_ , _J_ ); +#define ADD_VEC_PD(_I_,_J_) (DP_VEC_TYPE)svadd_f64_m( pg, _I_ , _J_ ); + +#define MUL_VEC_PH(_I_,_J_) (HP_VEC_TYPE)svmul_f16_m( pg, _I_ , _J_ ); +#define MUL_VEC_PS(_I_,_J_) (SP_VEC_TYPE)svmul_f32_m( pg, _I_ , _J_ ); +#define MUL_VEC_PD(_I_,_J_) (DP_VEC_TYPE)svmul_f64_m( pg, _I_ , _J_ ); + +#define FMA_VEC_PH(_I_,_J_,_K_) (HP_VEC_TYPE)svmad_f16_m( pg, _I_ , _J_ , _K_ ); +#define FMA_VEC_PS(_I_,_J_,_K_) (SP_VEC_TYPE)svmad_f32_m( pg, _I_ , _J_ , _K_ ); +#define FMA_VEC_PD(_I_,_J_,_K_) (DP_VEC_TYPE)svmad_f64_m( pg, _I_ , _J_ , _K_ ); +#endif + +//#define CAT_DEV_NEON +#if defined(CAT_DEV_NEON) typedef float16x8_t HP_VEC_TYPE; typedef float32x4_t SP_VEC_TYPE; typedef float64x2_t DP_VEC_TYPE; @@ -117,11 +145,12 @@ typedef float64x2_t DP_VEC_TYPE; #define FMA_VEC_PH(_I_,_J_,_K_) (HP_VEC_TYPE)vfmaq_f16( _K_ , _J_ , _I_ ); #define FMA_VEC_PS(_I_,_J_,_K_) (SP_VEC_TYPE)vfmaq_f32( _K_ , _J_ , _I_ ); #define FMA_VEC_PD(_I_,_J_,_K_) (DP_VEC_TYPE)vfmaq_f64( _K_ , _J_ , _I_ ); +#endif /* CAT_DEV_NEON */ /* There is no scalar FMA intrinsic available on this architecture. */ #define SET_VEC_SH(_I_) _I_ ; -#define ADD_VEC_SH(_I_,_J_) vaddh_f16( _I_ , _J_ ); -#define MUL_VEC_SH(_I_,_J_) vmulh_f16( _I_ , _J_ ); +#define ADD_VEC_SH(_I_,_J_) _I_ + _J_ ; +#define MUL_VEC_SH(_I_,_J_) _I_ * _J_ ; #define SQRT_VEC_SH(_I_) vsqrth_f16( _I_ ); #define FMA_VEC_SH(_out_,_I_,_J_,_K_) _out_ = _I_ * _J_ + _K_; @@ -136,12 +165,12 @@ typedef float64x2_t DP_VEC_TYPE; #define FMA_VEC_SD(_out_,_I_,_J_,_K_) _out_ = _I_ * _J_ + _K_; #elif defined(POWER) -void test_hp_power_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); -void test_sp_power_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); -void test_dp_power_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); -void test_hp_power_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); -void test_sp_power_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); -void test_dp_power_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); +void test_hp_power_VEC( int instr_per_loop, int EventSet, FILE *fp ); +void test_sp_power_VEC( int instr_per_loop, int EventSet, FILE *fp ); +void test_dp_power_VEC( int instr_per_loop, int EventSet, FILE *fp ); +void test_hp_power_VEC_FMA( int instr_per_loop, int EventSet, FILE *fp ); +void test_sp_power_VEC_FMA( int instr_per_loop, int EventSet, FILE *fp ); +void test_dp_power_VEC_FMA( int instr_per_loop, int EventSet, FILE *fp ); #include diff --git a/src/counter_analysis_toolkit/vec.c b/src/counter_analysis_toolkit/vec.c index e33f148b1..26875b570 100644 --- a/src/counter_analysis_toolkit/vec.c +++ b/src/counter_analysis_toolkit/vec.c @@ -48,151 +48,151 @@ void vec_driver(char* papi_event_name, hw_desc_t *hw_desc, char* outdir) // HP Non-FMA instruction trials. fprintf(ofp_papi, "# HP Non-FMA Scalar\n"); - test_hp_scalar_VEC_24( ITER, EventSet, ofp_papi ); - test_hp_scalar_VEC_48( ITER, EventSet, ofp_papi ); - test_hp_scalar_VEC_96( ITER, EventSet, ofp_papi ); + test_hp_scalar_VEC_24( EventSet, ofp_papi ); + test_hp_scalar_VEC_48( EventSet, ofp_papi ); + test_hp_scalar_VEC_96( EventSet, ofp_papi ); fprintf(ofp_papi, "# HP Non-FMA Vector AVX128\n"); - test_hp_x86_128B_VEC( 24, ITER, EventSet, ofp_papi ); - test_hp_x86_128B_VEC( 48, ITER, EventSet, ofp_papi ); - test_hp_x86_128B_VEC( 96, ITER, EventSet, ofp_papi ); + test_hp_x86_128B_VEC( 24, EventSet, ofp_papi ); + test_hp_x86_128B_VEC( 48, EventSet, ofp_papi ); + test_hp_x86_128B_VEC( 96, EventSet, ofp_papi ); #if defined(AVX256_AVAIL) fprintf(ofp_papi, "# HP Non-FMA Vector AVX256\n"); - test_hp_x86_256B_VEC( 24, ITER, EventSet, ofp_papi ); - test_hp_x86_256B_VEC( 48, ITER, EventSet, ofp_papi ); - test_hp_x86_256B_VEC( 96, ITER, EventSet, ofp_papi ); + test_hp_x86_256B_VEC( 24, EventSet, ofp_papi ); + test_hp_x86_256B_VEC( 48, EventSet, ofp_papi ); + test_hp_x86_256B_VEC( 96, EventSet, ofp_papi ); #if defined(AVX512_AVAIL) fprintf(ofp_papi, "# HP Non-FMA Vector AVX512\n"); - test_hp_x86_512B_VEC( 24, ITER, EventSet, ofp_papi ); - test_hp_x86_512B_VEC( 48, ITER, EventSet, ofp_papi ); - test_hp_x86_512B_VEC( 96, ITER, EventSet, ofp_papi ); + test_hp_x86_512B_VEC( 24, EventSet, ofp_papi ); + test_hp_x86_512B_VEC( 48, EventSet, ofp_papi ); + test_hp_x86_512B_VEC( 96, EventSet, ofp_papi ); #endif #endif // SP Non-FMA instruction trials. fprintf(ofp_papi, "# SP Non-FMA Scalar\n"); - test_sp_scalar_VEC_24( ITER, EventSet, ofp_papi ); - test_sp_scalar_VEC_48( ITER, EventSet, ofp_papi ); - test_sp_scalar_VEC_96( ITER, EventSet, ofp_papi ); + test_sp_scalar_VEC_24( EventSet, ofp_papi ); + test_sp_scalar_VEC_48( EventSet, ofp_papi ); + test_sp_scalar_VEC_96( EventSet, ofp_papi ); fprintf(ofp_papi, "# SP Non-FMA Vector AVX128\n"); - test_sp_x86_128B_VEC( 24, ITER, EventSet, ofp_papi ); - test_sp_x86_128B_VEC( 48, ITER, EventSet, ofp_papi ); - test_sp_x86_128B_VEC( 96, ITER, EventSet, ofp_papi ); + test_sp_x86_128B_VEC( 24, EventSet, ofp_papi ); + test_sp_x86_128B_VEC( 48, EventSet, ofp_papi ); + test_sp_x86_128B_VEC( 96, EventSet, ofp_papi ); #if defined(AVX256_AVAIL) fprintf(ofp_papi, "# SP Non-FMA Vector AVX256\n"); - test_sp_x86_256B_VEC( 24, ITER, EventSet, ofp_papi ); - test_sp_x86_256B_VEC( 48, ITER, EventSet, ofp_papi ); - test_sp_x86_256B_VEC( 96, ITER, EventSet, ofp_papi ); + test_sp_x86_256B_VEC( 24, EventSet, ofp_papi ); + test_sp_x86_256B_VEC( 48, EventSet, ofp_papi ); + test_sp_x86_256B_VEC( 96, EventSet, ofp_papi ); #if defined(AVX512_AVAIL) fprintf(ofp_papi, "# SP Non-FMA Vector AVX512\n"); - test_sp_x86_512B_VEC( 24, ITER, EventSet, ofp_papi ); - test_sp_x86_512B_VEC( 48, ITER, EventSet, ofp_papi ); - test_sp_x86_512B_VEC( 96, ITER, EventSet, ofp_papi ); + test_sp_x86_512B_VEC( 24, EventSet, ofp_papi ); + test_sp_x86_512B_VEC( 48, EventSet, ofp_papi ); + test_sp_x86_512B_VEC( 96, EventSet, ofp_papi ); #endif #endif // DP Non-FMA instruction trials. fprintf(ofp_papi, "# DP Non-FMA Scalar\n"); - test_dp_scalar_VEC_24( ITER, EventSet, ofp_papi ); - test_dp_scalar_VEC_48( ITER, EventSet, ofp_papi ); - test_dp_scalar_VEC_96( ITER, EventSet, ofp_papi ); + test_dp_scalar_VEC_24( EventSet, ofp_papi ); + test_dp_scalar_VEC_48( EventSet, ofp_papi ); + test_dp_scalar_VEC_96( EventSet, ofp_papi ); fprintf(ofp_papi, "# DP Non-FMA Vector AVX128\n"); - test_dp_x86_128B_VEC( 24, ITER, EventSet, ofp_papi ); - test_dp_x86_128B_VEC( 48, ITER, EventSet, ofp_papi ); - test_dp_x86_128B_VEC( 96, ITER, EventSet, ofp_papi ); + test_dp_x86_128B_VEC( 24, EventSet, ofp_papi ); + test_dp_x86_128B_VEC( 48, EventSet, ofp_papi ); + test_dp_x86_128B_VEC( 96, EventSet, ofp_papi ); #if defined(AVX256_AVAIL) fprintf(ofp_papi, "# DP Non-FMA Vector AVX256\n"); - test_dp_x86_256B_VEC( 24, ITER, EventSet, ofp_papi ); - test_dp_x86_256B_VEC( 48, ITER, EventSet, ofp_papi ); - test_dp_x86_256B_VEC( 96, ITER, EventSet, ofp_papi ); + test_dp_x86_256B_VEC( 24, EventSet, ofp_papi ); + test_dp_x86_256B_VEC( 48, EventSet, ofp_papi ); + test_dp_x86_256B_VEC( 96, EventSet, ofp_papi ); #if defined(AVX512_AVAIL) fprintf(ofp_papi, "# DP Non-FMA Vector AVX512\n"); - test_dp_x86_512B_VEC( 24, ITER, EventSet, ofp_papi ); - test_dp_x86_512B_VEC( 48, ITER, EventSet, ofp_papi ); - test_dp_x86_512B_VEC( 96, ITER, EventSet, ofp_papi ); + test_dp_x86_512B_VEC( 24, EventSet, ofp_papi ); + test_dp_x86_512B_VEC( 48, EventSet, ofp_papi ); + test_dp_x86_512B_VEC( 96, EventSet, ofp_papi ); #endif #endif // HP FMA instruction trials. fprintf(ofp_papi, "# HP FMA Scalar\n"); - test_hp_scalar_VEC_FMA_12( ITER, EventSet, ofp_papi ); - test_hp_scalar_VEC_FMA_24( ITER, EventSet, ofp_papi ); - test_hp_scalar_VEC_FMA_48( ITER, EventSet, ofp_papi ); + test_hp_scalar_VEC_FMA_12( EventSet, ofp_papi ); + test_hp_scalar_VEC_FMA_24( EventSet, ofp_papi ); + test_hp_scalar_VEC_FMA_48( EventSet, ofp_papi ); fprintf(ofp_papi, "# HP FMA Vector AVX128\n"); - test_hp_x86_128B_VEC_FMA( 12, ITER, EventSet, ofp_papi ); - test_hp_x86_128B_VEC_FMA( 24, ITER, EventSet, ofp_papi ); - test_hp_x86_128B_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + test_hp_x86_128B_VEC_FMA( 12, EventSet, ofp_papi ); + test_hp_x86_128B_VEC_FMA( 24, EventSet, ofp_papi ); + test_hp_x86_128B_VEC_FMA( 48, EventSet, ofp_papi ); #if defined(AVX256_AVAIL) fprintf(ofp_papi, "# HP FMA Vector AVX256\n"); - test_hp_x86_256B_VEC_FMA( 12, ITER, EventSet, ofp_papi ); - test_hp_x86_256B_VEC_FMA( 24, ITER, EventSet, ofp_papi ); - test_hp_x86_256B_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + test_hp_x86_256B_VEC_FMA( 12, EventSet, ofp_papi ); + test_hp_x86_256B_VEC_FMA( 24, EventSet, ofp_papi ); + test_hp_x86_256B_VEC_FMA( 48, EventSet, ofp_papi ); #if defined(AVX512_AVAIL) fprintf(ofp_papi, "# HP FMA Vector AVX512\n"); - test_hp_x86_512B_VEC_FMA( 12, ITER, EventSet, ofp_papi ); - test_hp_x86_512B_VEC_FMA( 24, ITER, EventSet, ofp_papi ); - test_hp_x86_512B_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + test_hp_x86_512B_VEC_FMA( 12, EventSet, ofp_papi ); + test_hp_x86_512B_VEC_FMA( 24, EventSet, ofp_papi ); + test_hp_x86_512B_VEC_FMA( 48, EventSet, ofp_papi ); #endif #endif // SP FMA instruction trials. fprintf(ofp_papi, "# SP FMA Scalar\n"); - test_sp_scalar_VEC_FMA_12( ITER, EventSet, ofp_papi ); - test_sp_scalar_VEC_FMA_24( ITER, EventSet, ofp_papi ); - test_sp_scalar_VEC_FMA_48( ITER, EventSet, ofp_papi ); + test_sp_scalar_VEC_FMA_12( EventSet, ofp_papi ); + test_sp_scalar_VEC_FMA_24( EventSet, ofp_papi ); + test_sp_scalar_VEC_FMA_48( EventSet, ofp_papi ); fprintf(ofp_papi, "# SP FMA Vector AVX128\n"); - test_sp_x86_128B_VEC_FMA( 12, ITER, EventSet, ofp_papi ); - test_sp_x86_128B_VEC_FMA( 24, ITER, EventSet, ofp_papi ); - test_sp_x86_128B_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + test_sp_x86_128B_VEC_FMA( 12, EventSet, ofp_papi ); + test_sp_x86_128B_VEC_FMA( 24, EventSet, ofp_papi ); + test_sp_x86_128B_VEC_FMA( 48, EventSet, ofp_papi ); #if defined(AVX256_AVAIL) fprintf(ofp_papi, "# SP FMA Vector AVX256\n"); - test_sp_x86_256B_VEC_FMA( 12, ITER, EventSet, ofp_papi ); - test_sp_x86_256B_VEC_FMA( 24, ITER, EventSet, ofp_papi ); - test_sp_x86_256B_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + test_sp_x86_256B_VEC_FMA( 12, EventSet, ofp_papi ); + test_sp_x86_256B_VEC_FMA( 24, EventSet, ofp_papi ); + test_sp_x86_256B_VEC_FMA( 48, EventSet, ofp_papi ); #if defined(AVX512_AVAIL) fprintf(ofp_papi, "# SP FMA Vector AVX512\n"); - test_sp_x86_512B_VEC_FMA( 12, ITER, EventSet, ofp_papi ); - test_sp_x86_512B_VEC_FMA( 24, ITER, EventSet, ofp_papi ); - test_sp_x86_512B_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + test_sp_x86_512B_VEC_FMA( 12, EventSet, ofp_papi ); + test_sp_x86_512B_VEC_FMA( 24, EventSet, ofp_papi ); + test_sp_x86_512B_VEC_FMA( 48, EventSet, ofp_papi ); #endif #endif // DP FMA instruction trials. fprintf(ofp_papi, "# DP FMA Scalar\n"); - test_dp_scalar_VEC_FMA_12( ITER, EventSet, ofp_papi ); - test_dp_scalar_VEC_FMA_24( ITER, EventSet, ofp_papi ); - test_dp_scalar_VEC_FMA_48( ITER, EventSet, ofp_papi ); + test_dp_scalar_VEC_FMA_12( EventSet, ofp_papi ); + test_dp_scalar_VEC_FMA_24( EventSet, ofp_papi ); + test_dp_scalar_VEC_FMA_48( EventSet, ofp_papi ); fprintf(ofp_papi, "# DP FMA Vector AVX128\n"); - test_dp_x86_128B_VEC_FMA( 12, ITER, EventSet, ofp_papi ); - test_dp_x86_128B_VEC_FMA( 24, ITER, EventSet, ofp_papi ); - test_dp_x86_128B_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + test_dp_x86_128B_VEC_FMA( 12, EventSet, ofp_papi ); + test_dp_x86_128B_VEC_FMA( 24, EventSet, ofp_papi ); + test_dp_x86_128B_VEC_FMA( 48, EventSet, ofp_papi ); #if defined(AVX256_AVAIL) fprintf(ofp_papi, "# DP FMA Vector AVX256\n"); - test_dp_x86_256B_VEC_FMA( 12, ITER, EventSet, ofp_papi ); - test_dp_x86_256B_VEC_FMA( 24, ITER, EventSet, ofp_papi ); - test_dp_x86_256B_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + test_dp_x86_256B_VEC_FMA( 12, EventSet, ofp_papi ); + test_dp_x86_256B_VEC_FMA( 24, EventSet, ofp_papi ); + test_dp_x86_256B_VEC_FMA( 48, EventSet, ofp_papi ); #if defined(AVX512_AVAIL) fprintf(ofp_papi, "# DP FMA Vector AVX512\n"); - test_dp_x86_512B_VEC_FMA( 12, ITER, EventSet, ofp_papi ); - test_dp_x86_512B_VEC_FMA( 24, ITER, EventSet, ofp_papi ); - test_dp_x86_512B_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + test_dp_x86_512B_VEC_FMA( 12, EventSet, ofp_papi ); + test_dp_x86_512B_VEC_FMA( 24, EventSet, ofp_papi ); + test_dp_x86_512B_VEC_FMA( 48, EventSet, ofp_papi ); #endif #endif @@ -204,129 +204,129 @@ void vec_driver(char* papi_event_name, hw_desc_t *hw_desc, char* outdir) // Non-FMA instruction trials. fprintf(ofp_papi, "# HP Non-FMA Scalar\n"); - test_hp_scalar_VEC_24( ITER, EventSet, ofp_papi ); - test_hp_scalar_VEC_48( ITER, EventSet, ofp_papi ); - test_hp_scalar_VEC_96( ITER, EventSet, ofp_papi ); + test_hp_scalar_VEC_24( EventSet, ofp_papi ); + test_hp_scalar_VEC_48( EventSet, ofp_papi ); + test_hp_scalar_VEC_96( EventSet, ofp_papi ); fprintf(ofp_papi, "# HP Non-FMA Vector\n"); - test_hp_arm_VEC( 24, ITER, EventSet, ofp_papi ); - test_hp_arm_VEC( 48, ITER, EventSet, ofp_papi ); - test_hp_arm_VEC( 96, ITER, EventSet, ofp_papi ); + test_hp_arm_VEC( 24, EventSet, ofp_papi ); + test_hp_arm_VEC( 48, EventSet, ofp_papi ); + test_hp_arm_VEC( 96, EventSet, ofp_papi ); fprintf(ofp_papi, "# SP Non-FMA Scalar\n"); - test_sp_scalar_VEC_24( ITER, EventSet, ofp_papi ); - test_sp_scalar_VEC_48( ITER, EventSet, ofp_papi ); - test_sp_scalar_VEC_96( ITER, EventSet, ofp_papi ); + test_sp_scalar_VEC_24( EventSet, ofp_papi ); + test_sp_scalar_VEC_48( EventSet, ofp_papi ); + test_sp_scalar_VEC_96( EventSet, ofp_papi ); fprintf(ofp_papi, "# SP Non-FMA Vector\n"); - test_sp_arm_VEC( 24, ITER, EventSet, ofp_papi ); - test_sp_arm_VEC( 48, ITER, EventSet, ofp_papi ); - test_sp_arm_VEC( 96, ITER, EventSet, ofp_papi ); + test_sp_arm_VEC( 24, EventSet, ofp_papi ); + test_sp_arm_VEC( 48, EventSet, ofp_papi ); + test_sp_arm_VEC( 96, EventSet, ofp_papi ); fprintf(ofp_papi, "# DP Non-FMA Scalar\n"); - test_dp_scalar_VEC_24( ITER, EventSet, ofp_papi ); - test_dp_scalar_VEC_48( ITER, EventSet, ofp_papi ); - test_dp_scalar_VEC_96( ITER, EventSet, ofp_papi ); + test_dp_scalar_VEC_24( EventSet, ofp_papi ); + test_dp_scalar_VEC_48( EventSet, ofp_papi ); + test_dp_scalar_VEC_96( EventSet, ofp_papi ); fprintf(ofp_papi, "# DP Non-FMA Vector\n"); - test_dp_arm_VEC( 24, ITER, EventSet, ofp_papi ); - test_dp_arm_VEC( 48, ITER, EventSet, ofp_papi ); - test_dp_arm_VEC( 96, ITER, EventSet, ofp_papi ); + test_dp_arm_VEC( 24, EventSet, ofp_papi ); + test_dp_arm_VEC( 48, EventSet, ofp_papi ); + test_dp_arm_VEC( 96, EventSet, ofp_papi ); // FMA instruction trials. fprintf(ofp_papi, "# HP FMA Scalar\n"); - test_hp_scalar_VEC_FMA_12( ITER, EventSet, ofp_papi ); - test_hp_scalar_VEC_FMA_24( ITER, EventSet, ofp_papi ); - test_hp_scalar_VEC_FMA_48( ITER, EventSet, ofp_papi ); + test_hp_scalar_VEC_FMA_12( EventSet, ofp_papi ); + test_hp_scalar_VEC_FMA_24( EventSet, ofp_papi ); + test_hp_scalar_VEC_FMA_48( EventSet, ofp_papi ); fprintf(ofp_papi, "# HP FMA Vector\n"); - test_hp_arm_VEC_FMA( 12, ITER, EventSet, ofp_papi ); - test_hp_arm_VEC_FMA( 24, ITER, EventSet, ofp_papi ); - test_hp_arm_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + test_hp_arm_VEC_FMA( 12, EventSet, ofp_papi ); + test_hp_arm_VEC_FMA( 24, EventSet, ofp_papi ); + test_hp_arm_VEC_FMA( 48, EventSet, ofp_papi ); fprintf(ofp_papi, "# SP FMA Scalar\n"); - test_sp_scalar_VEC_FMA_12( ITER, EventSet, ofp_papi ); - test_sp_scalar_VEC_FMA_24( ITER, EventSet, ofp_papi ); - test_sp_scalar_VEC_FMA_48( ITER, EventSet, ofp_papi ); + test_sp_scalar_VEC_FMA_12( EventSet, ofp_papi ); + test_sp_scalar_VEC_FMA_24( EventSet, ofp_papi ); + test_sp_scalar_VEC_FMA_48( EventSet, ofp_papi ); fprintf(ofp_papi, "# SP FMA Vector\n"); - test_sp_arm_VEC_FMA( 12, ITER, EventSet, ofp_papi ); - test_sp_arm_VEC_FMA( 24, ITER, EventSet, ofp_papi ); - test_sp_arm_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + test_sp_arm_VEC_FMA( 12, EventSet, ofp_papi ); + test_sp_arm_VEC_FMA( 24, EventSet, ofp_papi ); + test_sp_arm_VEC_FMA( 48, EventSet, ofp_papi ); fprintf(ofp_papi, "# DP FMA Scalar\n"); - test_dp_scalar_VEC_FMA_12( ITER, EventSet, ofp_papi ); - test_dp_scalar_VEC_FMA_24( ITER, EventSet, ofp_papi ); - test_dp_scalar_VEC_FMA_48( ITER, EventSet, ofp_papi ); + test_dp_scalar_VEC_FMA_12( EventSet, ofp_papi ); + test_dp_scalar_VEC_FMA_24( EventSet, ofp_papi ); + test_dp_scalar_VEC_FMA_48( EventSet, ofp_papi ); fprintf(ofp_papi, "# DP FMA Vector\n"); - test_dp_arm_VEC_FMA( 12, ITER, EventSet, ofp_papi ); - test_dp_arm_VEC_FMA( 24, ITER, EventSet, ofp_papi ); - test_dp_arm_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + test_dp_arm_VEC_FMA( 12, EventSet, ofp_papi ); + test_dp_arm_VEC_FMA( 24, EventSet, ofp_papi ); + test_dp_arm_VEC_FMA( 48, EventSet, ofp_papi ); #elif defined(POWER) // Non-FMA instruction trials. fprintf(ofp_papi, "# HP Non-FMA Scalar\n"); - test_hp_scalar_VEC_24( ITER, EventSet, ofp_papi ); - test_hp_scalar_VEC_48( ITER, EventSet, ofp_papi ); - test_hp_scalar_VEC_96( ITER, EventSet, ofp_papi ); + test_hp_scalar_VEC_24( EventSet, ofp_papi ); + test_hp_scalar_VEC_48( EventSet, ofp_papi ); + test_hp_scalar_VEC_96( EventSet, ofp_papi ); fprintf(ofp_papi, "# HP Non-FMA Vector\n"); - test_hp_power_VEC( 24, ITER, EventSet, ofp_papi ); - test_hp_power_VEC( 48, ITER, EventSet, ofp_papi ); - test_hp_power_VEC( 96, ITER, EventSet, ofp_papi ); + test_hp_power_VEC( 24, EventSet, ofp_papi ); + test_hp_power_VEC( 48, EventSet, ofp_papi ); + test_hp_power_VEC( 96, EventSet, ofp_papi ); fprintf(ofp_papi, "# SP Non-FMA Scalar\n"); - test_sp_scalar_VEC_24( ITER, EventSet, ofp_papi ); - test_sp_scalar_VEC_48( ITER, EventSet, ofp_papi ); - test_sp_scalar_VEC_96( ITER, EventSet, ofp_papi ); + test_sp_scalar_VEC_24( EventSet, ofp_papi ); + test_sp_scalar_VEC_48( EventSet, ofp_papi ); + test_sp_scalar_VEC_96( EventSet, ofp_papi ); fprintf(ofp_papi, "# SP Non-FMA Vector\n"); - test_sp_power_VEC( 24, ITER, EventSet, ofp_papi ); - test_sp_power_VEC( 48, ITER, EventSet, ofp_papi ); - test_sp_power_VEC( 96, ITER, EventSet, ofp_papi ); + test_sp_power_VEC( 24, EventSet, ofp_papi ); + test_sp_power_VEC( 48, EventSet, ofp_papi ); + test_sp_power_VEC( 96, EventSet, ofp_papi ); fprintf(ofp_papi, "# DP Non-FMA Scalar\n"); - test_dp_scalar_VEC_24( ITER, EventSet, ofp_papi ); - test_dp_scalar_VEC_48( ITER, EventSet, ofp_papi ); - test_dp_scalar_VEC_96( ITER, EventSet, ofp_papi ); + test_dp_scalar_VEC_24( EventSet, ofp_papi ); + test_dp_scalar_VEC_48( EventSet, ofp_papi ); + test_dp_scalar_VEC_96( EventSet, ofp_papi ); fprintf(ofp_papi, "# DP Non-FMA Vector\n"); - test_dp_power_VEC( 24, ITER, EventSet, ofp_papi ); - test_dp_power_VEC( 48, ITER, EventSet, ofp_papi ); - test_dp_power_VEC( 96, ITER, EventSet, ofp_papi ); + test_dp_power_VEC( 24, EventSet, ofp_papi ); + test_dp_power_VEC( 48, EventSet, ofp_papi ); + test_dp_power_VEC( 96, EventSet, ofp_papi ); // FMA instruction trials. fprintf(ofp_papi, "# HP FMA Scalar\n"); - test_hp_scalar_VEC_FMA_12( ITER, EventSet, ofp_papi ); - test_hp_scalar_VEC_FMA_24( ITER, EventSet, ofp_papi ); - test_hp_scalar_VEC_FMA_48( ITER, EventSet, ofp_papi ); + test_hp_scalar_VEC_FMA_12( EventSet, ofp_papi ); + test_hp_scalar_VEC_FMA_24( EventSet, ofp_papi ); + test_hp_scalar_VEC_FMA_48( EventSet, ofp_papi ); fprintf(ofp_papi, "# HP FMA Vector\n"); - test_hp_power_VEC_FMA( 12, ITER, EventSet, ofp_papi ); - test_hp_power_VEC_FMA( 24, ITER, EventSet, ofp_papi ); - test_hp_power_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + test_hp_power_VEC_FMA( 12, EventSet, ofp_papi ); + test_hp_power_VEC_FMA( 24, EventSet, ofp_papi ); + test_hp_power_VEC_FMA( 48, EventSet, ofp_papi ); fprintf(ofp_papi, "# SP FMA Scalar\n"); - test_sp_scalar_VEC_FMA_12( ITER, EventSet, ofp_papi ); - test_sp_scalar_VEC_FMA_24( ITER, EventSet, ofp_papi ); - test_sp_scalar_VEC_FMA_48( ITER, EventSet, ofp_papi ); + test_sp_scalar_VEC_FMA_12( EventSet, ofp_papi ); + test_sp_scalar_VEC_FMA_24( EventSet, ofp_papi ); + test_sp_scalar_VEC_FMA_48( EventSet, ofp_papi ); fprintf(ofp_papi, "# SP FMA Vector\n"); - test_sp_power_VEC_FMA( 12, ITER, EventSet, ofp_papi ); - test_sp_power_VEC_FMA( 24, ITER, EventSet, ofp_papi ); - test_sp_power_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + test_sp_power_VEC_FMA( 12, EventSet, ofp_papi ); + test_sp_power_VEC_FMA( 24, EventSet, ofp_papi ); + test_sp_power_VEC_FMA( 48, EventSet, ofp_papi ); fprintf(ofp_papi, "# DP FMA Scalar\n"); - test_dp_scalar_VEC_FMA_12( ITER, EventSet, ofp_papi ); - test_dp_scalar_VEC_FMA_24( ITER, EventSet, ofp_papi ); - test_dp_scalar_VEC_FMA_48( ITER, EventSet, ofp_papi ); + test_dp_scalar_VEC_FMA_12( EventSet, ofp_papi ); + test_dp_scalar_VEC_FMA_24( EventSet, ofp_papi ); + test_dp_scalar_VEC_FMA_48( EventSet, ofp_papi ); fprintf(ofp_papi, "# DP FMA Vector\n"); - test_dp_power_VEC_FMA( 12, ITER, EventSet, ofp_papi ); - test_dp_power_VEC_FMA( 24, ITER, EventSet, ofp_papi ); - test_dp_power_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + test_dp_power_VEC_FMA( 12, EventSet, ofp_papi ); + test_dp_power_VEC_FMA( 24, EventSet, ofp_papi ); + test_dp_power_VEC_FMA( 48, EventSet, ofp_papi ); #endif diff --git a/src/counter_analysis_toolkit/vec_fma_dp.c b/src/counter_analysis_toolkit/vec_fma_dp.c index 0f8e4ede3..5ee664346 100644 --- a/src/counter_analysis_toolkit/vec_fma_dp.c +++ b/src/counter_analysis_toolkit/vec_fma_dp.c @@ -1,30 +1,30 @@ #include "vec_scalar_verify.h" -static double test_dp_mac_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp ); -static double test_dp_mac_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp ); -static double test_dp_mac_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp ); -static void test_dp_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); +static double test_dp_mac_VEC_FMA_12( int EventSet, FILE *fp ); +static double test_dp_mac_VEC_FMA_24( int EventSet, FILE *fp ); +static double test_dp_mac_VEC_FMA_48( int EventSet, FILE *fp ); +static void test_dp_VEC_FMA( int instr_per_loop, int EventSet, FILE *fp ); /* Wrapper functions of different vector widths. */ #if defined(X86_VEC_WIDTH_128B) -void test_dp_x86_128B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ) { - return test_dp_VEC_FMA( instr_per_loop, iterations, EventSet, fp ); +void test_dp_x86_128B_VEC_FMA( int instr_per_loop, int EventSet, FILE *fp ) { + return test_dp_VEC_FMA( instr_per_loop, EventSet, fp ); } #elif defined(X86_VEC_WIDTH_512B) -void test_dp_x86_512B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ) { - return test_dp_VEC_FMA( instr_per_loop, iterations, EventSet, fp ); +void test_dp_x86_512B_VEC_FMA( int instr_per_loop, int EventSet, FILE *fp ) { + return test_dp_VEC_FMA( instr_per_loop, EventSet, fp ); } #elif defined(X86_VEC_WIDTH_256B) -void test_dp_x86_256B_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ) { - return test_dp_VEC_FMA( instr_per_loop, iterations, EventSet, fp ); +void test_dp_x86_256B_VEC_FMA( int instr_per_loop, int EventSet, FILE *fp ) { + return test_dp_VEC_FMA( instr_per_loop, EventSet, fp ); } #elif defined(ARM) -void test_dp_arm_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ) { - return test_dp_VEC_FMA( instr_per_loop, iterations, EventSet, fp ); +void test_dp_arm_VEC_FMA( int instr_per_loop, int EventSet, FILE *fp ) { + return test_dp_VEC_FMA( instr_per_loop, EventSet, fp ); } #elif defined(POWER) -void test_dp_power_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ) { - return test_dp_VEC_FMA( instr_per_loop, iterations, EventSet, fp ); +void test_dp_power_VEC_FMA( int instr_per_loop, int EventSet, FILE *fp ) { + return test_dp_VEC_FMA( instr_per_loop, EventSet, fp ); } #endif @@ -32,8 +32,13 @@ void test_dp_power_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, /* Loop unrolling: 12 instructions */ /************************************/ static -double test_dp_mac_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp ){ - register DP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF; +double test_dp_mac_VEC_FMA_12( int EventSet, FILE *fp ){ + + svbool_t pg = svptrue_b64(); + volatile DP_VEC_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF; + double values = 0.0; + long long iterValues[1]; iterValues[0] = 0; + for (int iter=0; iter #include "cat_arch.h" -#define ITER 1 +#define ITERS 100000 void papi_stop_and_print_placeholder(long long theory, FILE *fp); -void papi_stop_and_print(long long theory, int EventSet, FILE *fp); +void papi_print(long long theory, FILE *fp, double values); // Non-FMA-like computations. #if defined(ARM) -half test_hp_scalar_VEC_24( uint64 iterations, int EventSet, FILE *fp ); -half test_hp_scalar_VEC_48( uint64 iterations, int EventSet, FILE *fp ); -half test_hp_scalar_VEC_96( uint64 iterations, int EventSet, FILE *fp ); +half test_hp_scalar_VEC_24( int EventSet, FILE *fp ); +half test_hp_scalar_VEC_48( int EventSet, FILE *fp ); +half test_hp_scalar_VEC_96( int EventSet, FILE *fp ); #else -float test_hp_scalar_VEC_24( uint64 iterations, int EventSet, FILE *fp ); -float test_hp_scalar_VEC_48( uint64 iterations, int EventSet, FILE *fp ); -float test_hp_scalar_VEC_96( uint64 iterations, int EventSet, FILE *fp ); +float test_hp_scalar_VEC_24( int EventSet, FILE *fp ); +float test_hp_scalar_VEC_48( int EventSet, FILE *fp ); +float test_hp_scalar_VEC_96( int EventSet, FILE *fp ); #endif -float test_sp_scalar_VEC_24( uint64 iterations, int EventSet, FILE *fp ); -float test_sp_scalar_VEC_48( uint64 iterations, int EventSet, FILE *fp ); -float test_sp_scalar_VEC_96( uint64 iterations, int EventSet, FILE *fp ); +float test_sp_scalar_VEC_24( int EventSet, FILE *fp ); +float test_sp_scalar_VEC_48( int EventSet, FILE *fp ); +float test_sp_scalar_VEC_96( int EventSet, FILE *fp ); -double test_dp_scalar_VEC_24( uint64 iterations, int EventSet, FILE *fp ); -double test_dp_scalar_VEC_48( uint64 iterations, int EventSet, FILE *fp ); -double test_dp_scalar_VEC_96( uint64 iterations, int EventSet, FILE *fp ); +double test_dp_scalar_VEC_24( int EventSet, FILE *fp ); +double test_dp_scalar_VEC_48( int EventSet, FILE *fp ); +double test_dp_scalar_VEC_96( int EventSet, FILE *fp ); // Functions to emulate FMA. #if defined(ARM) -half test_hp_scalar_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp ); -half test_hp_scalar_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp ); -half test_hp_scalar_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp ); +half test_hp_scalar_VEC_FMA_12( int EventSet, FILE *fp ); +half test_hp_scalar_VEC_FMA_24( int EventSet, FILE *fp ); +half test_hp_scalar_VEC_FMA_48( int EventSet, FILE *fp ); #else -float test_hp_scalar_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp ); -float test_hp_scalar_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp ); -float test_hp_scalar_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp ); +float test_hp_scalar_VEC_FMA_12( int EventSet, FILE *fp ); +float test_hp_scalar_VEC_FMA_24( int EventSet, FILE *fp ); +float test_hp_scalar_VEC_FMA_48( int EventSet, FILE *fp ); #endif -float test_sp_scalar_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp ); -float test_sp_scalar_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp ); -float test_sp_scalar_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp ); +float test_sp_scalar_VEC_FMA_12( int EventSet, FILE *fp ); +float test_sp_scalar_VEC_FMA_24( int EventSet, FILE *fp ); +float test_sp_scalar_VEC_FMA_48( int EventSet, FILE *fp ); -double test_dp_scalar_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp ); -double test_dp_scalar_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp ); -double test_dp_scalar_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp ); +double test_dp_scalar_VEC_FMA_12( int EventSet, FILE *fp ); +double test_dp_scalar_VEC_FMA_24( int EventSet, FILE *fp ); +double test_dp_scalar_VEC_FMA_48( int EventSet, FILE *fp );