@@ -385,7 +385,7 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
     // res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there

     res &= self_kq_mask->ne[0] == mctx->get_n_kv();
-    res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
+    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;

     return res;
 }
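Context for this and the following hunks: GGML_PAD rounds its first argument up to the next multiple of the second, so the KQ mask used to carry extra padding rows along ne[1]. Below is a minimal, self-contained sketch of what the removed padding did; the macro body matches ggml.h, while the pad value 64 is only an assumed example (the real GGML_KQ_MASK_PAD constant is defined elsewhere in the sources and its value is not shown in this diff):

    #include <cassert>
    #include <cstdint>

    // from ggml.h: round x up to the next multiple of n (n must be a power of two)
    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    int main() {
        const int64_t n_tokens = 100;           // hypothetical ubatch size
        assert(GGML_PAD(n_tokens, 64) == 128);  // old mask row count: rounded up
        // after this change the mask has exactly n_tokens == 100 rows
        return 0;
    }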
@@ -416,10 +416,10 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
     // res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there

     res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
-    res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
+    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;

     res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
-    res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
+    res &= self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;

     return res;
 }
@@ -452,7 +452,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
             }
         }

-        for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+        for (int i = n_tokens; i < n_tokens; ++i) {
             for (int j = 0; j < n_enc; ++j) {
                 data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
             }
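Note that the replacement loop in this hunk is degenerate: i starts at n_tokens and the condition i < n_tokens is immediately false, so the -INFINITY fill of the former padding rows never executes (there are no padding rows left to fill). Presumably it was kept to minimize the diff; a follow-up cleanup could remove the loop altogether.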
@@ -1470,13 +1470,13 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
     auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);

     // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
     ggml_set_input(inp->self_kq_mask);

     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;

     if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
-        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
         ggml_set_input(inp->self_kq_mask_swa);

         inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
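In the no-cache (encoder-style) path the mask is now exactly square, since without a KV cache the number of keys equals the number of tokens in the batch. The resulting invariant, written out as an illustrative one-liner (not part of the diff):

    // both dimensions are n_tokens; ne[1] is no longer rounded up by GGML_PAD
    GGML_ASSERT(inp->self_kq_mask->ne[0] == n_tokens && inp->self_kq_mask->ne[1] == n_tokens);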
@@ -1558,7 +1558,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
     inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
     inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);

-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
     ggml_set_input(inp->self_kq_mask);

     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
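For the KV-cache paths the mask is four-dimensional, with one sequence stream per slice of ne[3], so ne[1] holds the rows per stream rather than the full batch. To make the arithmetic concrete with an assumed pad of 64: a ubatch of 96 tokens split across 2 streams previously allocated GGML_PAD(96/2, 64) == 64 rows per stream; after this change it allocates exactly 48.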
@@ -1701,7 +1701,7 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {

     const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;

-    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, n_tokens, 1, 1);
     ggml_set_input(inp->cross_kq_mask);

     inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask;
@@ -1767,7 +1767,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
     inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
     inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);

-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
     ggml_set_input(inp->self_kq_mask);

     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1781,7 +1781,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
     inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
     inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);

-    inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+    inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
     ggml_set_input(inp->self_kq_mask_swa);

     inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
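Taken together: every KQ mask the graph builds (self-attention with and without a KV cache, the iSWA base and SWA masks, and the cross-attention mask) now has exactly one row per token instead of a row count rounded up to GGML_KQ_MASK_PAD, and the can_reuse() checks compare against the unpadded size, presumably because the attention kernels no longer require a padded mask. Any out-of-tree code that still assumed the padded second dimension would need updating to match.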