Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update MISA kernels to latest version #3639

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
* SOFTWARE.
*
*******************************************************************************/
; generated by igemm_codegen.py (df99f96d6cb25ae4786f5c06f4ab7ce6b887d384)
; generated by igemm_codegen.py (54ae5625884dae7e88310c729fcaaddc8e043c29)
;
.include "igemm_bwd_gtcx2_nhwc_bf16_utils.inc"

Expand All @@ -48,6 +48,7 @@
; precision : 'bf16'
; nxb : 0
; nxe : 0
; vector_c : 1
;
; block_size : 256
; lds_total : 16384
Expand Down Expand Up @@ -170,12 +171,12 @@
.set v_co_sub_n_index, 53
.set v_tmp, 56
.set v_wei_tmp_pack, 15
.set v_wei_flag, 62
.set v_pack_k_tmp, 56
.set v_end, 128
.set v_pack_k_tmp, 62
.set v_wei_flag, 66
.set v_end, 132

.set a_c, 64
.set a_end, 128
.set a_c, 68
.set a_end, 132

.text
.globl igemm_bwd_gtcx2_nhwc_bf16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64
Expand Down Expand Up @@ -266,7 +267,7 @@ igemm_bwd_gtcx2_nhwc_bf16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1
v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_wei_ik]
v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+5], 1
v_cmp_gt_u32 vcc, s[s_c], v[v_tmp+5]
v_cndmask_b32 v[v_wei_flag], 0, 1, vcc
v_cndmask_b32 v[v_wei_flag], 0, 1 vcc
v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag]

s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1
Expand All @@ -292,7 +293,7 @@ igemm_bwd_gtcx2_nhwc_bf16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1
s_mov_b64 exec, -1

v_cmp_gt_u32 vcc, s[s_n], v[v_out_in]
v_cndmask_b32 v[v_tmp], 0, 1, vcc
v_cndmask_b32 v[v_tmp], 0, 1 vcc
v_lshlrev_b32 v[v_out_flag_n], 0, v[v_tmp]
; calculate output offset
s_mov_b32 s[s_out_offset], 0
Expand All @@ -310,9 +311,9 @@ igemm_bwd_gtcx2_nhwc_bf16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1
v_add_u32 v[v_out_os], v[v_tmp+4], v[v_tmp]
v_bfe_u32 v[v_tmp+1], v[v_out_flag_n], 0, 1
v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list]
v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1], vcc
v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+1] vcc
v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list]
v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag], vcc
v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag] vcc

s_mov_b32 s1, 64
v_add_u32 v[v_tmp], s1, v[v_out_inb]
Expand All @@ -327,12 +328,12 @@ igemm_bwd_gtcx2_nhwc_bf16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1
v_mul_lo_u32 v[v_tmp], s[s_out_stride_wo], v[v_tmp]
v_add_u32 v[v_out_os+1], v[v_tmp+4], v[v_tmp]
v_cmp_gt_u32 vcc, s[s_n], v[v_out_in]
v_cndmask_b32 v[v_tmp], 0, 1, vcc
v_cndmask_b32 v[v_tmp], 0, 1 vcc
v_lshl_or_b32 v[v_out_flag_n], v[v_tmp], 1, v[v_out_flag_n]
v_cmp_gt_u32 vcc, s[s_hi], v[v_out_iho_list+1]
v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp], vcc
v_cndmask_b32 v[v_out_flag+1], 0, v[v_tmp] vcc
v_cmp_gt_u32 vcc, s[s_wi], v[v_out_iwo_list+1]
v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1], vcc
v_cndmask_b32 v[v_out_flag+1], 0, v[v_out_flag+1] vcc
s_mov_b32 s[s_p_out+2], 0xffffffff
s_mov_b32 s[s_p_out+3], 0x27000
; load output, nxe:0
Expand Down Expand Up @@ -421,7 +422,7 @@ igemm_bwd_gtcx2_nhwc_bf16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1

v_add_u32 v[v_tmp], s[s_block_gtc_ic], v[v_co_sub_n_index]
v_cmp_gt_u32 vcc, s[s_c], v[v_tmp]
v_cndmask_b32 v[v_in_flag_c], 0, 1, vcc
v_cndmask_b32 v[v_in_flag_c], 0, 1 vcc
; input offset
s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c]
s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c]
Expand All @@ -433,7 +434,7 @@ igemm_bwd_gtcx2_nhwc_bf16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1
s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0

s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1
v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index] ; total n*h_dslice*w_dslice
v_add_u32 v[v_in_inb], s[s_block_gtc_inb], v[v_co_sub_m_index]
v_mul_lo_u32 v[v_in_os], s[s_in_stride_wi], v[v_in_inb]
v_lshlrev_b32 v[v_co_sub_n_index], 1, v[v_co_sub_n_index]
v_add_u32 v[v_in_os], v[v_in_os], v[v_co_sub_n_index]
Expand Down Expand Up @@ -474,8 +475,8 @@ igemm_bwd_gtcx2_nhwc_bf16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1

s_waitcnt lgkmcnt(0)
s_barrier
ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os]
ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os]
ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0
ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0
ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024
ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024
L_igemm_bwd_gtcx2_nhwc_bf16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mfma_body:
Expand Down Expand Up @@ -559,8 +560,8 @@ L_igemm_bwd_gtcx2_nhwc_bf16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1
s_cbranch_scc0 L_igemm_bwd_gtcx2_nhwc_bf16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mfma_finishing
s_waitcnt lgkmcnt(0)
s_barrier
ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os]
ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os]
ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0
ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0
v_mfma_f32_32x32x8bf16_1k v[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], v[a_c+32:a_c+47] ; repeat:1x0, step:0x0, num_a_c:16
ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024
ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024
Expand All @@ -574,8 +575,8 @@ L_igemm_bwd_gtcx2_nhwc_bf16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1
L_igemm_bwd_gtcx2_nhwc_bf16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64_mfma_end:
s_waitcnt lgkmcnt(0)
s_barrier
ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os]
ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os]
ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0
ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0
ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:1024
ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024
; k iteration : 0
Expand Down Expand Up @@ -716,7 +717,7 @@ L_igemm_bwd_gtcx2_nhwc_bf16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1
s_waitcnt lgkmcnt(0)
s_barrier
; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4
ds_read_b128 v[v_c:v_c+3], v[v_co_sld]
ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:0
ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096
ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192
ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288
Expand Down Expand Up @@ -820,7 +821,7 @@ L_igemm_bwd_gtcx2_nhwc_bf16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1
s_waitcnt lgkmcnt(0)
s_barrier
; load from lds, i_ssgroup:0, num_sld_per_ssgroup:4
ds_read_b128 v[v_c:v_c+3], v[v_co_sld]
ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:0
ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096
ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192
ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288
Expand Down Expand Up @@ -863,12 +864,14 @@ L_igemm_bwd_gtcx2_nhwc_bf16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1
.amdhsa_system_sgpr_workgroup_id_x 1
.amdhsa_system_sgpr_workgroup_id_y 1
.amdhsa_system_vgpr_workitem_id 0
.amdhsa_next_free_vgpr 128
.amdhsa_next_free_vgpr 132
.amdhsa_next_free_sgpr 56
.amdhsa_ieee_mode 0
.amdhsa_dx10_clamp 0
.amdhsa_ieee_mode 1
.amdhsa_dx10_clamp 1
.amdhsa_float_round_mode_32 3
.amdhsa_float_round_mode_16_64 3
.amdhsa_tg_split 0
.amdhsa_accum_offset 64
.amdhsa_accum_offset 68
.end_amdhsa_kernel

.amdgpu_metadata
Expand All @@ -878,7 +881,7 @@ amdhsa.kernels:
- .name: igemm_bwd_gtcx2_nhwc_bf16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64
.symbol: igemm_bwd_gtcx2_nhwc_bf16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x2_1x4x1x64.kd
.sgpr_count: 62
.vgpr_count: 128
.vgpr_count: 132
.kernarg_segment_align: 8
.kernarg_segment_size: 168
.group_segment_fixed_size: 16384
Expand All @@ -887,44 +890,44 @@ amdhsa.kernels:
.reqd_workgroup_size : [256, 1, 1]
.max_flat_workgroup_size: 256
.args:
- { .name: p_in , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
- { .name: p_wei , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
- { .name: p_out , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
- { .name: hi , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32}
- { .name: wi , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32}
- { .name: n_ , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32}
- { .name: k , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32}
- { .name: c , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32}
- { .name: ho , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32}
- { .name: wo , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32}
- { .name: stride_h , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32}
- { .name: stride_w , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32}
- { .name: dilation_h, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32}
- { .name: dilation_w, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32}
- { .name: pad_h , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32}
- { .name: pad_w , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32}
- { .name: y_ , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32}
- { .name: x , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32}
- { .name: dtile_iy , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32}
- { .name: dtile_ix , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32}
- { .name: dtile_dy , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32}
- { .name: dtile_dx , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32}
- { .name: dtile_y , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
- { .name: dtile_x , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
- { .name: dtile_h , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
- { .name: dtile_w , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
- { .name: dslice_y , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
- { .name: dslice_x , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
- { .name: dslice_h , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
- { .name: dslice_w , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32}
- { .name: dslice_h_left, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32}
- { .name: dslice_w_left, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32}
- { .name: group , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32}
- { .name: magic_0 , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32}
- { .name: magic_1 , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32}
- { .name: magic_2 , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32}
- { .name: magic_3 , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32}
- { .name: shift_pack_0, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32}
- { .name: ks , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32}
- { .name: p_in_ , .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
- { .name: p_wei_ , .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
- { .name: p_out_ , .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
- { .name: hi_ , .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32}
- { .name: wi_ , .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32}
- { .name: n_ , .size: 4, .offset: 32, .value_kind: by_value, .value_type: i32}
- { .name: k_ , .size: 4, .offset: 36, .value_kind: by_value, .value_type: i32}
- { .name: c_ , .size: 4, .offset: 40, .value_kind: by_value, .value_type: i32}
- { .name: ho_ , .size: 4, .offset: 44, .value_kind: by_value, .value_type: i32}
- { .name: wo_ , .size: 4, .offset: 48, .value_kind: by_value, .value_type: i32}
- { .name: stride_h_ , .size: 4, .offset: 52, .value_kind: by_value, .value_type: i32}
- { .name: stride_w_ , .size: 4, .offset: 56, .value_kind: by_value, .value_type: i32}
- { .name: dilation_h_, .size: 4, .offset: 60, .value_kind: by_value, .value_type: i32}
- { .name: dilation_w_, .size: 4, .offset: 64, .value_kind: by_value, .value_type: i32}
- { .name: pad_h_ , .size: 4, .offset: 68, .value_kind: by_value, .value_type: i32}
- { .name: pad_w_ , .size: 4, .offset: 72, .value_kind: by_value, .value_type: i32}
- { .name: y_ , .size: 4, .offset: 76, .value_kind: by_value, .value_type: i32}
- { .name: x_ , .size: 4, .offset: 80, .value_kind: by_value, .value_type: i32}
- { .name: dtile_iy_ , .size: 4, .offset: 84, .value_kind: by_value, .value_type: i32}
- { .name: dtile_ix_ , .size: 4, .offset: 88, .value_kind: by_value, .value_type: i32}
- { .name: dtile_dy_ , .size: 4, .offset: 92, .value_kind: by_value, .value_type: i32}
- { .name: dtile_dx_ , .size: 4, .offset: 96, .value_kind: by_value, .value_type: i32}
- { .name: dtile_y_ , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
- { .name: dtile_x_ , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
- { .name: dtile_h_ , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
- { .name: dtile_w_ , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
- { .name: dslice_y_ , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
- { .name: dslice_x_ , .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
- { .name: dslice_h_ , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
- { .name: dslice_w_ , .size: 4, .offset: 128, .value_kind: by_value, .value_type: i32}
- { .name: dslice_h_left_, .size: 4, .offset: 132, .value_kind: by_value, .value_type: i32}
- { .name: dslice_w_left_, .size: 4, .offset: 136, .value_kind: by_value, .value_type: i32}
- { .name: group_ , .size: 4, .offset: 140, .value_kind: by_value, .value_type: i32}
- { .name: magic_0_ , .size: 4, .offset: 144, .value_kind: by_value, .value_type: i32}
- { .name: magic_1_ , .size: 4, .offset: 148, .value_kind: by_value, .value_type: i32}
- { .name: magic_2_ , .size: 4, .offset: 152, .value_kind: by_value, .value_type: i32}
- { .name: magic_3_ , .size: 4, .offset: 156, .value_kind: by_value, .value_type: i32}
- { .name: shift_pack_0_, .size: 4, .offset: 160, .value_kind: by_value, .value_type: i32}
- { .name: ks_ , .size: 4, .offset: 164, .value_kind: by_value, .value_type: i32}
...
.end_amdgpu_metadata
Loading