From 60e0e3ab38bf5e1b3f2fd0c87e8430bac3c90dbe Mon Sep 17 00:00:00 2001 From: Yong Qin Date: Tue, 29 Jun 2021 15:18:17 -0700 Subject: [PATCH 01/27] Added APIs to support crossed and crossing MKey registration. --- src/uct/ib/base/ib_md.h | 49 ++++++++++++ src/uct/ib/mlx5/dv/ib_mlx5_ifc.h | 25 ++++-- src/uct/ib/mlx5/dv/ib_mlx5dv_md.c | 128 ++++++++++++++++++++++++++++++ 3 files changed, 194 insertions(+), 8 deletions(-) diff --git a/src/uct/ib/base/ib_md.h b/src/uct/ib/base/ib_md.h index dd8e8e399c6..28ce0c85222 100644 --- a/src/uct/ib/base/ib_md.h +++ b/src/uct/ib/base/ib_md.h @@ -320,6 +320,53 @@ typedef ucs_status_t (*uct_ib_md_mem_prefetch_func_t)(uct_ib_md_t *md, typedef ucs_status_t (*uct_ib_md_get_atomic_mr_id_func_t)(uct_ib_md_t *md, uint8_t *mr_id); +/** + * Memory domain method to register crossed mkey for memory area. + * + * @param [in] ib_md Memory domain. + * + * @param [in] address Memory area start address (HOST). + * + * @param [in] length Memory area length (HOST). + * + * @param [in] allowed_gvmi_id Allowed GVMI ID (DPU). + * + * @param [out] ib_memh Memory region handle. + * Method should initialize lkey & rkey. + * + * @return UCS_OK on success or error code in case of failure. + */ +typedef ucs_status_t (*uct_ib_md_reg_crossed_key_func_t)(uct_ib_md_t *ib_md, + void *address, + size_t length, + uint32_t allowed_gvmi_id, + uct_ib_mem_t *ib_memh); + +/** + * Memory domain method to register crossing mkey for memory area. + * + * @param [in] ib_md Memory domain. + * + * @param [in] address Memory area start address (HOST). + * + * @param [in] length Memory area length (HOST). + * + * @param [in] target_gvmi_id Target GVMI ID (HOST). + * + * @param [in] target_mkey Target mkey this mkey refers to (HOST). + * + * @param [out] ib_memh Memory region handle. + * Method should initialize lkey and rkey. + * + * @return UCS_OK on success or error code in case of failure. 
+ */ +typedef ucs_status_t (*uct_ib_md_reg_crossing_key_func_t)(uct_ib_md_t *ib_md, + void *address, + size_t length, + uint32_t target_gvmi_id, + uint32_t target_mkey, + uct_ib_mem_t *ib_memh); + typedef struct uct_ib_md_ops { uct_ib_md_open_func_t open; uct_ib_md_cleanup_func_t cleanup; @@ -331,6 +378,8 @@ typedef struct uct_ib_md_ops { uct_ib_md_dereg_multithreaded_func_t dereg_multithreaded; uct_ib_md_mem_prefetch_func_t mem_prefetch; uct_ib_md_get_atomic_mr_id_func_t get_atomic_mr_id; + uct_ib_md_reg_crossed_key_func_t reg_crossed_key; + uct_ib_md_reg_crossing_key_func_t reg_crossing_key; } uct_ib_md_ops_t; diff --git a/src/uct/ib/mlx5/dv/ib_mlx5_ifc.h b/src/uct/ib/mlx5/dv/ib_mlx5_ifc.h index 452c637bddc..d9b471ad679 100644 --- a/src/uct/ib/mlx5/dv/ib_mlx5_ifc.h +++ b/src/uct/ib/mlx5/dv/ib_mlx5_ifc.h @@ -624,11 +624,12 @@ struct uct_ib_mlx5_query_hca_vport_context_in_bits { }; enum { - UCT_IB_MLX5_MKC_ACCESS_MODE_PA = 0x0, - UCT_IB_MLX5_MKC_ACCESS_MODE_MTT = 0x1, - UCT_IB_MLX5_MKC_ACCESS_MODE_KLMS = 0x2, - UCT_IB_MLX5_MKC_ACCESS_MODE_KSM = 0x3, - UCT_IB_MLX5_MKC_ACCESS_MODE_MEMIC = 0x5 + UCT_IB_MLX5_MKC_ACCESS_MODE_PA = 0x0, + UCT_IB_MLX5_MKC_ACCESS_MODE_MTT = 0x1, + UCT_IB_MLX5_MKC_ACCESS_MODE_KLMS = 0x2, + UCT_IB_MLX5_MKC_ACCESS_MODE_KSM = 0x3, + UCT_IB_MLX5_MKC_ACCESS_MODE_MEMIC = 0x5, + UCT_IB_MLX5_MKC_ACCESS_MODE_CROSSING_VHCA = 0x6 }; struct uct_ib_mlx5_mkc_bits { @@ -636,7 +637,9 @@ struct uct_ib_mlx5_mkc_bits { uint8_t free[0x1]; uint8_t reserved_at_2[0x1]; uint8_t access_mode_4_2[0x3]; - uint8_t reserved_at_6[0x7]; + uint8_t alter_pd_to_vhca_id[0x1]; + uint8_t crossed_side_mkey[0x1]; + uint8_t reserved_at_8[0x5]; uint8_t relaxed_ordering_write[0x1]; uint8_t reserved_at_e[0x1]; uint8_t small_fence_on_rdma_read_response[0x1]; @@ -669,9 +672,15 @@ struct uct_ib_mlx5_mkc_bits { uint8_t bsf_octword_size[0x20]; - uint8_t reserved_at_120[0x80]; + uint8_t reserved_at_120[0x60]; - uint8_t translations_octword_size[0x20]; + uint8_t 
crossing_target_gvmi_id[0x10]; + uint8_t reserved_at_190[0x10]; + + union { + uint8_t translations_octword_size[0x20]; + uint8_t crossing_target_mkey[0x20]; + }; uint8_t reserved_at_1c0[0x1b]; uint8_t log_entity_size[0x5]; diff --git a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c index 46e7f949525..47d460fbf95 100644 --- a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c +++ b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c @@ -844,6 +844,132 @@ static void uct_ib_mlx5_devx_md_cleanup(uct_ib_md_t *ibmd) ucs_recursive_spinlock_destroy(&md->dbrec_lock); } +static ucs_status_t +uct_ib_mlx5_devx_reg_crossed_key(uct_ib_md_t *ib_md, void *address, + size_t length, uint32_t allowed_gvmi_id, + uct_ib_mem_t *ib_memh) +{ + uct_ib_mlx5_md_t *md = ucs_derived_of(ib_md, uct_ib_mlx5_md_t); + uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + char in[UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_in)] = {0}; + char out[UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_out)] = {0}; + struct mlx5dv_devx_umem *mem; + struct mlx5dv_devx_obj *mr; + void *mkc; + ucs_status_t status; + + // TODO: check if access flag needs to be 7, 0, UCT_IB_MEM_ACCESS_FLAGS + mem = mlx5dv_devx_umem_reg(md->super.dev.ibv_context, address, length, + UCT_IB_MEM_ACCESS_FLAGS); + if (mem == NULL) { + ucs_error("mlx5dv_devx_umem_reg() failed: %m"); + status = UCS_ERR_NO_MEMORY; + goto err_out; + } + + mkc = UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + UCT_IB_MLX5DV_SET(create_mkey_in, in, opcode, UCT_IB_MLX5_CMD_OP_CREATE_MKEY); + UCT_IB_MLX5DV_SET(create_mkey_in, in, pg_access, 1); + UCT_IB_MLX5DV_SET(create_mkey_in, in, mkey_umem_valid, 1); + UCT_IB_MLX5DV_SET(create_mkey_in, in, mkey_umem_id, mem->umem_id); + UCT_IB_MLX5DV_SET(mkc, mkc, access_mode_1_0, UCT_IB_MLX5_MKC_ACCESS_MODE_MTT); + UCT_IB_MLX5DV_SET(mkc, mkc, a, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, rw, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, rr, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, lw, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, lr, 1); + 
UCT_IB_MLX5DV_SET(mkc, mkc, translations_octword_size, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, log_entity_size, 12); + UCT_IB_MLX5DV_SET(mkc, mkc, qpn, 0xffffff); + UCT_IB_MLX5DV_SET(mkc, mkc, mkey_7_0, (intptr_t)address & 0xff); + UCT_IB_MLX5DV_SET(mkc, mkc, alter_pd_to_vhca_id, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, crossed_side_mkey, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, crossing_target_gvmi_id, allowed_gvmi_id); + UCT_IB_MLX5DV_SET64(mkc, mkc, start_addr, (intptr_t)address); + UCT_IB_MLX5DV_SET64(mkc, mkc, len, length); + + mr = mlx5dv_devx_obj_create(md->super.dev.ibv_context, in, sizeof(in), out, + sizeof(out)); + if (mr == NULL) { + ucs_error("mlx5dv_devx_obj_create() failed, syndrome %x: %m", + UCT_IB_MLX5DV_GET(create_mkey_out, out, syndrome)); + status = UCS_ERR_UNSUPPORTED; + goto err_free; + } + + memh->super.lkey = + (UCT_IB_MLX5DV_GET(create_mkey_out, out, mkey_index) << 8) | + ((intptr_t)address & 0xff); + memh->super.rkey = memh->super.lkey; + + status = UCS_OK; + +err_free: + mlx5dv_devx_umem_dereg(mem); + +err_out: + return status; +} + +static ucs_status_t +uct_ib_mlx5_devx_reg_crossing_key(uct_ib_md_t *ib_md, void *address, + size_t length, uint32_t target_gvmi_id, + uint32_t target_mkey, uct_ib_mem_t *ib_memh) +{ + uct_ib_mlx5_md_t *md = ucs_derived_of(ib_md, uct_ib_mlx5_md_t); + uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + char in[UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_in)] = {0}; + char out[UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_out)] = {0}; + struct mlx5dv_pd dvpd = {}; + struct mlx5dv_obj dv = {}; + struct mlx5dv_devx_obj *mr; + void *mkc; + ucs_status_t status; + + dv.pd.in = md->super.pd; + dv.pd.out = &dvpd; + mlx5dv_init_obj(&dv, MLX5DV_OBJ_PD); + + mkc = UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + UCT_IB_MLX5DV_SET(create_mkey_in, in, opcode, UCT_IB_MLX5_CMD_OP_CREATE_MKEY); + UCT_IB_MLX5DV_SET(create_mkey_in, in, translations_octword_actual_size, 0); + UCT_IB_MLX5DV_SET(mkc, mkc, access_mode_1_0, 
UCT_IB_MLX5_MKC_ACCESS_MODE_CROSSING_VHCA & 0x3); + UCT_IB_MLX5DV_SET(mkc, mkc, access_mode_4_2, (UCT_IB_MLX5_MKC_ACCESS_MODE_CROSSING_VHCA & 0x1C) >> 2); + UCT_IB_MLX5DV_SET(mkc, mkc, a, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, rw, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, rr, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, lw, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, lr, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, pd, dvpd.pdn); + UCT_IB_MLX5DV_SET(mkc, mkc, crossing_target_gvmi_id, target_gvmi_id); + UCT_IB_MLX5DV_SET(mkc, mkc, crossing_target_mkey, target_mkey); + UCT_IB_MLX5DV_SET(mkc, mkc, qpn, 0xffffff); + UCT_IB_MLX5DV_SET(mkc, mkc, mkey_7_0, (intptr_t)address & 0xff); + UCT_IB_MLX5DV_SET64(mkc, mkc, start_addr, (intptr_t)0); + UCT_IB_MLX5DV_SET(mkc, mkc, length64, 1); + + mr = mlx5dv_devx_obj_create(md->super.dev.ibv_context, in, sizeof(in), out, + sizeof(out)); + if (mr == NULL) { + ucs_error("mlx5dv_devx_obj_create() failed, syndrome %x: %m", + UCT_IB_MLX5DV_GET(create_mkey_out, out, syndrome)); + status = UCS_ERR_UNSUPPORTED; + goto err_out; + } + + memh->super.lkey = + (UCT_IB_MLX5DV_GET(create_mkey_out, out, mkey_index) << 8) | + ((intptr_t)address & 0xff); + memh->super.rkey = memh->super.lkey; + + status = UCS_OK; + +err_out: + return status; +} + static uct_ib_md_ops_t uct_ib_mlx5_devx_md_ops = { .open = uct_ib_mlx5_devx_md_open, .cleanup = uct_ib_mlx5_devx_md_cleanup, @@ -855,6 +981,8 @@ static uct_ib_md_ops_t uct_ib_mlx5_devx_md_ops = { .dereg_multithreaded = uct_ib_mlx5_devx_dereg_multithreaded, .mem_prefetch = uct_ib_mlx5_mem_prefetch, .get_atomic_mr_id = uct_ib_mlx5_md_get_atomic_mr_id, + .reg_crossed_key = uct_ib_mlx5_devx_reg_crossed_key, + .reg_crossing_key = uct_ib_mlx5_devx_reg_crossing_key, }; UCT_IB_MD_OPS(uct_ib_mlx5_devx_md_ops, 2); From 42af845af468ebc868b9ef07057fe2893e9442a3 Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Mon, 21 Feb 2022 14:58:04 +0200 Subject: [PATCH 02/27] WIP xgvmi cap --- src/uct/api/uct.h | 3 +- src/uct/api/v2/uct_v2.h | 21 ++++++++++++ 
src/uct/base/uct_md.c | 15 +++++++++ src/uct/base/uct_md.h | 11 +++++++ src/uct/ib/base/ib_md.c | 53 ++++++++++++++++++++++++++++++- src/uct/ib/base/ib_md.h | 1 + src/uct/ib/mlx5/dv/ib_mlx5_ifc.h | 17 ++++++++-- src/uct/ib/mlx5/dv/ib_mlx5dv_md.c | 10 ++++++ 8 files changed, 127 insertions(+), 4 deletions(-) diff --git a/src/uct/api/uct.h b/src/uct/api/uct.h index b7cafa46115..96ba1b497be 100644 --- a/src/uct/api/uct.h +++ b/src/uct/api/uct.h @@ -692,7 +692,8 @@ enum { UCT_MD_FLAG_SOCKADDR = UCS_BIT(7), /**< MD support for client-server connection establishment via sockaddr */ - UCT_MD_FLAG_INVALIDATE = UCS_BIT(8) /**< MD supports memory invalidation */ + UCT_MD_FLAG_INVALIDATE = UCS_BIT(8), /**< MD supports memory invalidation */ + UCT_MD_FLAG_SHARED_RKEY = UCS_BIT(9), /**< MD supports shared remote keys */ }; /** diff --git a/src/uct/api/v2/uct_v2.h b/src/uct/api/v2/uct_v2.h index 918b7115c1d..0ece1f19b8b 100644 --- a/src/uct/api/v2/uct_v2.h +++ b/src/uct/api/v2/uct_v2.h @@ -412,6 +412,27 @@ ucs_status_t uct_ep_query(uct_ep_h ep, uct_ep_attr_t *ep_attr); int uct_iface_is_reachable_v2(uct_iface_h iface, const uct_iface_is_reachable_params_t *params); + +typedef struct { + void *address; + size_t length; + int dest_gvmi; +} uct_md_mem_reg_shared_params_t; + +ucs_status_t uct_md_mem_reg_shared(uct_md_h md, + uct_md_mem_reg_shared_params_t *params, + uct_mem_h *memh_p); + +typedef struct { + int source_gvmi; + uct_rkey_t rkey; +} uct_md_import_shared_rkey_params_t; + +ucs_status_t +uct_md_import_shared_rkey(uct_md_h md, + uct_md_import_shared_rkey_params_t *params, + uct_mem_h *memh_p); + END_C_DECLS #endif diff --git a/src/uct/base/uct_md.c b/src/uct/base/uct_md.c index 3c1f5a6afb8..4e8ec1c45ba 100644 --- a/src/uct/base/uct_md.c +++ b/src/uct/base/uct_md.c @@ -468,6 +468,21 @@ ucs_status_t uct_md_mem_dereg_v2(uct_md_h md, return md->ops->mem_dereg(md, params); } +ucs_status_t uct_md_mem_reg_shared(uct_md_h md, + uct_md_mem_reg_shared_params_t *params, + uct_mem_h 
*memh_p) +{ + return md->ops->mem_reg_shared(md, params, memh_p); +} + +ucs_status_t +uct_md_import_shared_rkey(uct_md_h md, + uct_md_import_shared_rkey_params_t *params, + uct_mem_h *memh_p) +{ + return md->ops->import_shared_rkey(md, params, memh_p); +} + ucs_status_t uct_md_mem_query(uct_md_h md, const void *address, size_t length, uct_md_mem_attr_t *mem_attr) { diff --git a/src/uct/base/uct_md.h b/src/uct/base/uct_md.h index 78276acb702..6af66192581 100644 --- a/src/uct/base/uct_md.h +++ b/src/uct/base/uct_md.h @@ -100,6 +100,15 @@ typedef ucs_status_t (*uct_md_mem_dereg_func_t)(uct_md_h md, const uct_md_mem_dereg_params_t *param); + +typedef ucs_status_t (*uct_md_mem_reg_shared_func_t)( + uct_md_h md, uct_md_mem_reg_shared_params_t *params, uct_mem_h *memh_p); + + +typedef ucs_status_t (*uct_md_import_shared_rkey_func_t)( + uct_md_h md, uct_md_import_shared_rkey_params_t *params, uct_mem_h *memh_p); + + typedef ucs_status_t (*uct_md_mem_query_func_t)(uct_md_h md, const void *address, size_t length, @@ -129,6 +138,8 @@ struct uct_md_ops { uct_md_mem_advise_func_t mem_advise; uct_md_mem_reg_func_t mem_reg; uct_md_mem_dereg_func_t mem_dereg; + uct_md_mem_reg_shared_func_t mem_reg_shared; + uct_md_import_shared_rkey_func_t import_shared_rkey; uct_md_mem_query_func_t mem_query; uct_md_mkey_pack_func_t mkey_pack; uct_md_is_sockaddr_accessible_func_t is_sockaddr_accessible; diff --git a/src/uct/ib/base/ib_md.c b/src/uct/ib/base/ib_md.c index d377f8aee20..9a24942ce95 100644 --- a/src/uct/ib/base/ib_md.c +++ b/src/uct/ib/base/ib_md.c @@ -298,7 +298,8 @@ static ucs_status_t uct_ib_md_query(uct_md_h uct_md, uct_md_attr_t *md_attr) UCT_MD_FLAG_NEED_MEMH | UCT_MD_FLAG_NEED_RKEY | UCT_MD_FLAG_ADVISE | - UCT_MD_FLAG_INVALIDATE; + UCT_MD_FLAG_INVALIDATE | + md->extra_cap_flags; md_attr->cap.alloc_mem_types = 0; md_attr->cap.access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); md_attr->cap.detect_mem_types = 0; @@ -845,6 +846,50 @@ static ucs_status_t uct_ib_mem_dereg(uct_md_h 
uct_md, return status; } +static ucs_status_t +uct_ib_md_mem_reg_shared(uct_md_h uct_md, uct_md_mem_reg_shared_params_t *params, + uct_mem_h *memh_p) +{ + uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t); + uct_ib_mem_t *ib_memh; + ucs_status_t status; + + // return md->ops->mem_reg_shared(md, params, memh_p); + ib_memh = uct_ib_memh_alloc(md); + status = md->ops->reg_crossed_key(md, params->address, params->length, + params->dest_gvmi, ib_memh); + if (status != UCS_OK) { + uct_ib_memh_free(ib_memh); + return status; + } + + *memh_p = ib_memh; + return UCS_OK; +} + +static ucs_status_t +uct_ib_md_import_shared_rkey(uct_md_h uct_md, + uct_md_import_shared_rkey_params_t *params, + uct_mem_h *memh_p) +{ + uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t); + uct_ib_mem_t *ib_memh; + ucs_status_t status; + + // return md->ops->import_shared_rkey(md, params, memh_p); + ib_memh = uct_ib_memh_alloc(md); + status = md->ops->reg_crossing_key(md, NULL, 0, params->source_gvmi, + uct_ib_md_direct_rkey(params->rkey), + ib_memh); + if (status != UCS_OK) { + uct_ib_memh_free(ib_memh); + return status; + } + + *memh_p = ib_memh; + return UCS_OK; +} + static ucs_status_t uct_ib_verbs_reg_key(uct_ib_md_t *md, void *address, size_t length, uint64_t access_flags, uct_ib_mem_t *ib_memh, @@ -970,6 +1015,8 @@ static uct_md_ops_t uct_ib_md_ops = { .query = uct_ib_md_query, .mem_reg = uct_ib_mem_reg, .mem_dereg = uct_ib_mem_dereg, + .mem_reg_shared = uct_ib_md_mem_reg_shared, + .import_shared_rkey = uct_ib_md_import_shared_rkey, .mem_advise = uct_ib_mem_advise, .mkey_pack = uct_ib_mkey_pack, .detect_memory_type = ucs_empty_function_return_unsupported, @@ -1041,6 +1088,8 @@ static uct_md_ops_t uct_ib_md_rcache_ops = { .query = uct_ib_md_query, .mem_reg = uct_ib_mem_rcache_reg, .mem_dereg = uct_ib_mem_rcache_dereg, + .mem_reg_shared = uct_ib_md_mem_reg_shared, + .import_shared_rkey = uct_ib_md_import_shared_rkey, .mem_advise = uct_ib_mem_advise, .mkey_pack = uct_ib_mkey_pack, 
.is_sockaddr_accessible = ucs_empty_function_return_zero_int, @@ -1155,6 +1204,8 @@ static uct_md_ops_t UCS_V_UNUSED uct_ib_md_global_odp_ops = { .query = uct_ib_md_odp_query, .mem_reg = uct_ib_mem_global_odp_reg, .mem_dereg = uct_ib_mem_global_odp_dereg, + .mem_reg_shared = uct_ib_md_mem_reg_shared, + .import_shared_rkey = uct_ib_md_import_shared_rkey, .mem_advise = uct_ib_mem_advise, .mkey_pack = uct_ib_mkey_pack, .detect_memory_type = ucs_empty_function_return_unsupported, diff --git a/src/uct/ib/base/ib_md.h b/src/uct/ib/base/ib_md.h index 28ce0c85222..28e00512090 100644 --- a/src/uct/ib/base/ib_md.h +++ b/src/uct/ib/base/ib_md.h @@ -136,6 +136,7 @@ typedef struct uct_ib_md { int fork_init; size_t memh_struct_size; uint64_t reg_mem_types; + uint64_t extra_cap_flags; } uct_ib_md_t; diff --git a/src/uct/ib/mlx5/dv/ib_mlx5_ifc.h b/src/uct/ib/mlx5/dv/ib_mlx5_ifc.h index d9b471ad679..d290ee525dd 100644 --- a/src/uct/ib/mlx5/dv/ib_mlx5_ifc.h +++ b/src/uct/ib/mlx5/dv/ib_mlx5_ifc.h @@ -384,9 +384,22 @@ struct uct_ib_mlx5_cmd_hca_cap_bits { uint8_t reserved_at_500[0x20]; uint8_t num_of_uars_per_page[0x20]; - uint8_t reserved_at_540[0x40]; - uint8_t reserved_at_580[0x3d]; + uint8_t flex_parser_protocols[0x20]; + + uint8_t reserved_at_560[0x13]; + uint8_t log_max_guaranteed_connections[0x5]; + uint8_t reserved_at_578[0x3]; + uint8_t log_max_dct_connections[0x5]; + + uint8_t log_max_atomic_size_qp[0x8]; + uint8_t reserved_at_588[0x10]; + uint8_t log_max_atomic_size_dc[0x8]; + + uint8_t reserved_at_5a0[0x12]; + uint8_t crossing_vhca_mkey[0x1]; + uint8_t reserved_at_5b3[0x9]; + uint8_t mini_cqe_resp_stride_index[0x1]; uint8_t cqe_128_always[0x1]; uint8_t cqe_compression_128[0x1]; uint8_t cqe_compression[0x1]; diff --git a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c index 47d460fbf95..9c4f4024d3d 100644 --- a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c +++ b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c @@ -744,6 +744,16 @@ static ucs_status_t 
uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device, md->flags |= UCT_IB_MLX5_MD_FLAG_MP_XRQ_FIRST_MSG; } + if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, + crossing_vhca_mkey)) { + ucs_warn("%s: crossing_vhca_mkey is supported", + uct_ib_device_name(dev)); + md->super.extra_cap_flags |= UCT_MD_FLAG_SHARED_RKEY; + } else { + ucs_warn("%s: crossing_vhca_mkey is not supported", + uct_ib_device_name(dev)); + } + status = uct_ib_mlx5_devx_check_odp(md, md_config, cap); if (status != UCS_OK) { goto err_free; From 38d13de4581172628be008dd11d50b7b8bbd39ce Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Mon, 21 Feb 2022 16:19:26 +0200 Subject: [PATCH 03/27] WIP --- src/uct/ib/mlx5/dv/ib_mlx5dv_md.c | 20 ++++++++++---- test/gtest/uct/test_md.cc | 44 +++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 5 deletions(-) diff --git a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c index 9c4f4024d3d..5a7cf8d00b4 100644 --- a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c +++ b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c @@ -621,6 +621,7 @@ static ucs_status_t uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device, struct ibv_context *ctx; uct_ib_device_t *dev; uct_ib_mlx5_md_t *md; + int vhca_id; void *cap; int ret; @@ -744,14 +745,21 @@ static ucs_status_t uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device, md->flags |= UCT_IB_MLX5_MD_FLAG_MP_XRQ_FIRST_MSG; } + vhca_id = UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, vhca_id); + // ucs_warn("%s: vhca_id is %d. 
at b4h: 0x%x; 0x%x", uct_ib_device_name(dev), + // UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, vhca_id), + // ntohl(*(uint32_t*)UCS_PTR_BYTE_OFFSET(cap, 0xB4)), + // ntohl(*(uint32_t*)UCS_PTR_BYTE_OFFSET(cap, 0xB4)) & (1<<13) + // ); + if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, crossing_vhca_mkey)) { - ucs_warn("%s: crossing_vhca_mkey is supported", - uct_ib_device_name(dev)); + ucs_print("%s: vhca_id=%d crossing_vhca_mkey is supported", + uct_ib_device_name(dev), vhca_id); md->super.extra_cap_flags |= UCT_MD_FLAG_SHARED_RKEY; } else { - ucs_warn("%s: crossing_vhca_mkey is not supported", - uct_ib_device_name(dev)); + ucs_print("%s: vhca_id=%d crossing_vhca_mkey is not supported", + uct_ib_device_name(dev), vhca_id); } status = uct_ib_mlx5_devx_check_odp(md, md_config, cap); @@ -869,8 +877,10 @@ uct_ib_mlx5_devx_reg_crossed_key(uct_ib_md_t *ib_md, void *address, ucs_status_t status; // TODO: check if access flag needs to be 7, 0, UCT_IB_MEM_ACCESS_FLAGS + ucs_warn("ume_reg crosses address=%p length=%zu", address, length); mem = mlx5dv_devx_umem_reg(md->super.dev.ibv_context, address, length, - UCT_IB_MEM_ACCESS_FLAGS); + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE); if (mem == NULL) { ucs_error("mlx5dv_devx_umem_reg() failed: %m"); status = UCS_ERR_NO_MEMORY; diff --git a/test/gtest/uct/test_md.cc b/test/gtest/uct/test_md.cc index 3bdc2dcb37e..b29b16d2fc5 100644 --- a/test/gtest/uct/test_md.cc +++ b/test/gtest/uct/test_md.cc @@ -754,6 +754,50 @@ UCS_TEST_SKIP_COND_P(test_md, dereg_bad_arg, free(ptr); } +// TODO check MD cap flag +UCS_TEST_P(test_md, shared_rkey) +{ + static const size_t size = 1 * UCS_MBYTE; + ucs_status_t status; + uct_mem_h memh; + void *ptr; + + int ret = ucs_posix_memalign(&ptr, ucs_get_page_size(), size, "shared_buf"); + ASSERT_EQ(0, ret); + + uct_md_mem_reg_shared_params_t reg_shared_params; + reg_shared_params.address = ptr; + reg_shared_params.length = size; + reg_shared_params.dest_gvmi = 0; + + status = 
uct_md_mem_reg_shared(md(), &reg_shared_params, &memh); + ASSERT_UCS_OK(status); + + UCS_TEST_MESSAGE << "registered shared memh"; + + std::vector rkey_buf; + rkey_buf.resize(md_attr().rkey_packed_size); + + status = uct_md_mkey_pack(md(), memh, &rkey_buf[0]); + ASSERT_UCS_OK(status); + + uct_rkey_bundle_t rkey_bundle; + status = uct_rkey_unpack(GetParam().component, &rkey_buf[0], &rkey_bundle); + ASSERT_UCS_OK(status); + + UCS_TEST_MESSAGE << "unpacked rkey"; + + uct_md_import_shared_rkey_params_t import_params; + import_params.rkey = rkey_bundle.rkey; + import_params.source_gvmi = 0; // TODO + + uct_mem_h imported_memh; + status = uct_md_import_shared_rkey(md(), &import_params, &imported_memh); + ASSERT_UCS_OK(status); + + UCS_TEST_MESSAGE << "registered imported memh"; +} + UCT_MD_INSTANTIATE_TEST_CASE(test_md) class test_md_fork : private ucs::clear_dontcopy_regions, public test_md { From 4456a6297edf362b2a181894020e09450ef7acdf Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Mon, 21 Feb 2022 17:50:25 +0200 Subject: [PATCH 04/27] WIP2 --- src/uct/api/uct.h | 2 +- src/uct/ib/mlx5/dv/ib_mlx5dv_md.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/uct/api/uct.h b/src/uct/api/uct.h index 96ba1b497be..509f0bba3cd 100644 --- a/src/uct/api/uct.h +++ b/src/uct/api/uct.h @@ -693,7 +693,7 @@ enum { connection establishment via sockaddr */ UCT_MD_FLAG_INVALIDATE = UCS_BIT(8), /**< MD supports memory invalidation */ - UCT_MD_FLAG_SHARED_RKEY = UCS_BIT(9), /**< MD supports shared remote keys */ + UCT_MD_FLAG_SHARED_RKEY = UCS_BIT(9) /**< MD supports shared remote keys */ }; /** diff --git a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c index 5a7cf8d00b4..856bdf3e165 100644 --- a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c +++ b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c @@ -973,7 +973,7 @@ uct_ib_mlx5_devx_reg_crossing_key(uct_ib_md_t *ib_md, void *address, mr = mlx5dv_devx_obj_create(md->super.dev.ibv_context, in, sizeof(in), out, sizeof(out));
if (mr == NULL) { - ucs_error("mlx5dv_devx_obj_create() failed, syndrome %x: %m", + ucs_fatal("mlx5dv_devx_obj_create() failed, syndrome %x: %m", UCT_IB_MLX5DV_GET(create_mkey_out, out, syndrome)); status = UCS_ERR_UNSUPPORTED; goto err_out; From 4f411ce05b4279d8999b843e729b7794e94a4840 Mon Sep 17 00:00:00 2001 From: Mikhail Brinskii Date: Mon, 21 Feb 2022 18:27:39 +0200 Subject: [PATCH 05/27] UCP: cross-gvmi draft --- src/ucp/api/ucp.h | 12 ++- src/ucp/core/ucp_mm.c | 145 +++++++++++++++++++++++++++------- src/ucp/core/ucp_mm.h | 1 + src/ucp/core/ucp_mm.inl | 19 ++++- src/ucp/core/ucp_request.inl | 5 +- src/ucp/core/ucp_rkey.c | 30 ++++--- src/ucp/core/ucp_rkey.h | 1 + src/ucp/dt/datatype_iter.c | 2 +- src/ucp/dt/datatype_iter.inl | 2 +- src/ucp/dt/dt.h | 1 + src/ucp/rndv/rndv.c | 5 +- test/gtest/ucp/test_ucp_am.cc | 113 ++++++++++++++++++++++++++ 12 files changed, 290 insertions(+), 46 deletions(-) diff --git a/src/ucp/api/ucp.h b/src/ucp/api/ucp.h index 801bd6052e1..87a5a8f95b6 100644 --- a/src/ucp/api/ucp.h +++ b/src/ucp/api/ucp.h @@ -392,7 +392,9 @@ enum ucp_mem_map_params_field { @ref ucp_mem_map routine.*/ UCP_MEM_MAP_PARAM_FIELD_FLAGS = UCS_BIT(2), /**< Allocation flags. */ UCP_MEM_MAP_PARAM_FIELD_PROT = UCS_BIT(3), /**< Memory protection mode. */ - UCP_MEM_MAP_PARAM_FIELD_MEMORY_TYPE = UCS_BIT(4) /**< Memory type. */ + UCP_MEM_MAP_PARAM_FIELD_MEMORY_TYPE = UCS_BIT(4), /**< Memory type. */ + UCP_MEM_MAP_PARAM_FIELD_PEER_ID = UCS_BIT(5), /**< peer_id field */ + UCP_MEM_MAP_PARAM_FIELD_RKEY = UCS_BIT(6) /**< rkey field */ }; /** @@ -530,10 +532,11 @@ enum { if passed address is not a null-pointer then it will be used as a hint or direct address for allocation. */ - UCP_MEM_MAP_FIXED = UCS_BIT(2) /**< Don't interpret address as a hint: + UCP_MEM_MAP_FIXED = UCS_BIT(2), /**< Don't interpret address as a hint: place the mapping at exactly that address. The address must be a multiple of the page size. 
*/ + UCP_MEM_MAP_SHARED = UCS_BIT(3) }; @@ -1587,6 +1590,11 @@ typedef struct ucp_mem_map_params { * internally. */ ucs_memory_type_t memory_type; + + /* Id of the peer to create a shared memh for */ + uint32_t peer_id; + + ucp_rkey_h rkey; } ucp_mem_map_params_t; diff --git a/src/ucp/core/ucp_mm.c b/src/ucp/core/ucp_mm.c index 42eddea4531..1eed06c3158 100644 --- a/src/ucp/core/ucp_mm.c +++ b/src/ucp/core/ucp_mm.c @@ -272,6 +272,10 @@ ucp_mem_map_params2uct_flags(const ucp_mem_map_params_t *params) if (params->flags & UCP_MEM_MAP_FIXED) { flags |= UCT_MD_MEM_FLAG_FIXED; } + + if (params->flags & UCP_MEM_MAP_SHARED) { + flags |= UCT_MD_MEM_FLAG_SHARED_MEMH; + } } flags |= UCT_MD_MEM_ACCESS_ALL; @@ -307,36 +311,61 @@ void ucp_memh_dereg(ucp_context_h context, ucp_mem_h memh, ucp_md_map_t md_map) } } +static void ucp_memh_cleanup(ucp_context_h context, ucp_mem_h memh, + ucp_md_map_t md_map_registered, void *address, + size_t length, ucp_md_index_t md_index, + unsigned uct_flags, ucs_status_t status) +{ + int shared_memh = uct_flags & UCT_MD_MEM_FLAG_SHARED_MEMH; + ucs_log_level_t log_level; + + log_level = (uct_flags & UCT_MD_MEM_FLAG_HIDE_ERRORS) ? + UCS_LOG_LEVEL_DIAG : UCS_LOG_LEVEL_ERROR; + + ucs_log(log_level, + "failed to register %s%p length %zu on md[%d]=%s: %s", + (shared_memh ? 
"shared memh " : ""), address, length, md_index, + context->tl_mds[md_index].rsc.md_name, + ucs_status_string(status)); + + ucp_memh_dereg(context, memh, md_map_registered); + + if (context->rcache != NULL) { + ucs_rcache_region_put(context->rcache, &memh->super); + } else { + ucs_free(memh); + } + + return status; + +} + static ucs_status_t ucp_memh_register(ucp_context_h context, ucp_mem_h memh, ucp_md_map_t md_map, void *address, size_t length, unsigned uct_flags) { ucp_md_map_t md_map_registered = 0; + uct_md_attr_t *md_attr; ucs_log_level_t log_level; ucp_md_index_t md_index; ucs_status_t status; ucs_for_each_bit(md_index, md_map) { - status = uct_md_mem_reg(context->tl_mds[md_index].md, - address, length, uct_flags, - &memh->uct[md_index]); - if (ucs_unlikely(status != UCS_OK)) { - log_level = (uct_flags & UCT_MD_MEM_FLAG_HIDE_ERRORS) ? - UCS_LOG_LEVEL_DIAG : UCS_LOG_LEVEL_ERROR; - ucs_log(log_level, - "failed to register %p length %zu on md[%d]=%s: %s", - address, length, md_index, - context->tl_mds[md_index].rsc.md_name, - ucs_status_string(status)); - - ucp_memh_dereg(context, memh, md_map_registered); + md_attr = &context->tl_mds[md_index].attr; - if (context->rcache != NULL) { - ucs_rcache_region_put(context->rcache, &memh->super); - } else { - ucs_free(memh); - } + if (uct_flags & UCT_MD_MEM_FLAG_SHARED_MEMH) { + status = uct_md_mem_reg_shared(context->tl_mds[md_index].md, + address, length, uct_flags, peer_id, + &memh->uct[md_index]); + } else { + status = uct_md_mem_reg(context->tl_mds[md_index].md, + address, length, uct_flags, + &memh->uct[md_index]); + } + if (ucs_unlikely(status != UCS_OK)) { + ucp_memh_cleanup(context, memh, md_map_registered, address, length, + md_index, uct_flags, status); return status; } @@ -383,6 +412,7 @@ ucp_memh_get_slow(ucp_context_h context, void *address, size_t length, memh->super.super.end = (uintptr_t)reg_address + reg_length; memh->alloc_md_index = UCP_NULL_RESOURCE; memh->alloc_method = UCT_ALLOC_METHOD_LAST; + 
memh->flags = 0; } else { status = ucs_rcache_get(context->rcache, reg_address, reg_length, PROT_READ|PROT_WRITE, NULL, &rregion); @@ -410,7 +440,7 @@ ucp_memh_get_slow(ucp_context_h context, void *address, size_t length, static ucs_status_t ucp_memh_alloc(ucp_context_h context, void *address, size_t length, ucs_memory_type_t memory_type, unsigned uct_flags, - const char *alloc_name, ucp_mem_h *memh_p) + const char *alloc_name, ucp_rkey_h rkey, ucp_mem_h *memh_p) { ucp_md_map_t reg_md_map = context->reg_md_map[memory_type]; ucp_md_index_t alloc_md_index = UCP_NULL_RESOURCE; @@ -437,8 +467,13 @@ ucp_memh_alloc(ucp_context_h context, void *address, size_t length, ucs_assert(alloc_md_index != UCP_NULL_RESOURCE); } - status = ucp_memh_get_slow(context, mem.address, mem.length, - mem.mem_type, reg_md_map, uct_flags, &memh); + if (rkey == NULL) { + status = ucp_memh_get_slow(context, mem.address, mem.length, + mem.mem_type, reg_md_map, uct_flags, &memh); + } else { + status = ucp_memh_import(context, rkey, address, length, memh); + } + if (status != UCS_OK) { goto err; } @@ -448,9 +483,9 @@ ucp_memh_alloc(ucp_context_h context, void *address, size_t length, memh->alloc_md_index = alloc_md_index; memh->uct[alloc_md_index] = mem.memh; memh->md_map |= UCS_BIT(alloc_md_index); - ucs_trace("allocated address %p length %zu on md[%d]=%s %p", + ucs_trace("allocated address %p length %zu on md[%d]=%s rkey=%p %p", mem.address, mem.length, alloc_md_index, - context->tl_mds[alloc_md_index].rsc.md_name, + context->tl_mds[alloc_md_index].rsc.md_name, rkey, memh->uct[alloc_md_index]); } @@ -462,6 +497,44 @@ ucp_memh_alloc(ucp_context_h context, void *address, size_t length, return status; } +static ucs_status_t ucp_memh_import(ucp_context_h context, ucp_rkey_h rkey, + address, length, ucp_memh_h memh) +{ + ucp_md_map_t md_map_registered = 0; + uct_md_attr_t *md_attr; + ucp_md_index_t md_index; + ucs_status_t status; + + ucs_assert_always(rkey->peer_id != UCP_NULL_RESOURCE); + + 
ucs_for_each_bit(md_index, rkey->md_map) { + md_attr = &context->tl_mds[md_index].attr; + + ucs_assert_always(md_attr->cap.flags & UCT_MD_FLAG_SHARED_MEMH); + + status = uct_md_memh_import(context->tl_mds[md_index].md, + address, length, rkey->peer_id, + rkey->tl_rkey[md_index], + &memh->uct[md_index]); + + if (ucs_unlikely(status != UCS_OK)) { + ucp_memh_cleanup(context, memh, md_map_registered, address, length, + md_index, 0, status); + return status; + } + + ucs_trace("registered address %p length %zu on md[%d]=%s %p", + address, length, md_index, + context->tl_mds[md_index].rsc.md_name, + memh->uct[md_index]); + md_map_registered |= UCS_BIT(md_index); + } + + memh->md_map |= md_map_registered; + + return UCS_OK; +} + /* Matrix of behavior * |--------------------------------------------------------------------------------| * | parameter | value | @@ -482,6 +555,7 @@ ucs_status_t ucp_mem_map(ucp_context_h context, const ucp_mem_map_params_t *para ucs_status_t status; unsigned flags; void *address; + ucp_rkey_h rkey; /* always acquire context lock */ UCP_THREAD_CS_ENTER(&context->mt_lock); @@ -495,6 +569,7 @@ ucs_status_t ucp_mem_map(ucp_context_h context, const ucp_mem_map_params_t *para address = UCP_PARAM_VALUE(MEM_MAP, params, address, ADDRESS, NULL); flags = UCP_PARAM_VALUE(MEM_MAP, params, flags, FLAGS, 0); + rkey = UCP_PARAM_VALUE(MEM_MAP, params, rkey, RKEY, NULL); if ((flags & UCP_MEM_MAP_FIXED) && ((uintptr_t)address % ucs_get_page_size())) { @@ -523,6 +598,19 @@ ucs_status_t ucp_mem_map(ucp_context_h context, const ucp_mem_map_params_t *para goto out; } + if (flags & UCP_MEM_MAP_SHARED) { + if (!(params->field_mask & UCP_MEM_MAP_PARAM_FIELD_PEER_ID)) { + ucs_error("UCP_MEM_MAP_SHARED flags requires peer id to be specified"); + status = UCS_ERR_INVALID_PARAM; + goto out; + } + if (params->field_mask & UCP_MEM_MAP_PARAM_FIELD_RKEY) { + ucs_error("Failed to import non-exported shared memh"); + status = UCS_ERR_INVALID_PARAM; + goto out; + } + } + if (flags 
& UCP_MEM_MAP_ALLOCATE) { memory_type = UCP_PARAM_VALUE(MEM_MAP, params, memory_type, MEMORY_TYPE, UCS_MEMORY_TYPE_HOST); @@ -543,11 +631,12 @@ ucs_status_t ucp_mem_map(ucp_context_h context, const ucp_mem_map_params_t *para if (ucp_mem_map_is_allocate(params)) { status = ucp_memh_alloc(context, address, params->length, memory_type, ucp_mem_map_params2uct_flags(params), - "user memory", memh_p); + "user memory", rkey, memh_p); } else { status = ucp_memh_get(context, address, params->length, memory_type, context->reg_md_map[memory_type], - ucp_mem_map_params2uct_flags(params), memh_p); + ucp_mem_map_params2uct_flags(params), + rkey, memh_p); } out: @@ -733,7 +822,7 @@ ucp_mpool_malloc(ucp_worker_h worker, ucs_mpool_t *mp, size_t *size_p, void **ch status = ucp_memh_alloc(worker->context, NULL, *size_p + sizeof(*chunk_hdr), UCS_MEMORY_TYPE_HOST, ucp_mem_map_params2uct_flags(&mem_params), - ucs_mpool_name(mp), &memh); + ucs_mpool_name(mp), NULL, &memh); if (status != UCS_OK) { goto out; } @@ -784,7 +873,7 @@ ucp_rndv_frag_malloc_mpools(ucs_mpool_t *mp, size_t *size_p, void **chunk_p) /* payload; need to get default flags from ucp_mem_map_params2uct_flags() */ status = ucp_memh_alloc(context, NULL, frag_size * num_elems, mem_type, - UCT_MD_MEM_ACCESS_RMA, ucs_mpool_name(mp), + UCT_MD_MEM_ACCESS_RMA, ucs_mpool_name(mp), NULL, &chunk_hdr->memh); if (status != UCS_OK) { return status; @@ -858,7 +947,7 @@ ucp_mm_get_alloc_md_map(ucp_context_h context, ucp_md_map_t *md_map_p) /* Allocate dummy 1-byte buffer to get the expected md_map */ status = ucp_memh_alloc(context, NULL, 1, UCS_MEMORY_TYPE_HOST, UCT_MD_MEM_ACCESS_ALL, "get_alloc_md_map", - &memh); + NULL, &memh); if (status != UCS_OK) { goto out; } diff --git a/src/ucp/core/ucp_mm.h b/src/ucp/core/ucp_mm.h index a84e695a36f..dfe81b29bb5 100644 --- a/src/ucp/core/ucp_mm.h +++ b/src/ucp/core/ucp_mm.h @@ -31,6 +31,7 @@ typedef struct ucp_mem { ucs_memory_type_t mem_type; /* Type of allocated or registered memory */ 
ucp_md_index_t alloc_md_index; /* Index of MD used to allocated the memory */ ucp_md_map_t md_map; /* Which MDs have valid memory handles */ + ucp_rsc_index_t peer_id; /* Peer id for shared memh */ uct_mem_h uct[0]; /* Sparse memory handles array num_mds in size */ } ucp_mem_t; diff --git a/src/ucp/core/ucp_mm.inl b/src/ucp/core/ucp_mm.inl index cab91ccf945..2ad8cbb61ae 100644 --- a/src/ucp/core/ucp_mm.inl +++ b/src/ucp/core/ucp_mm.inl @@ -19,7 +19,7 @@ ucp_memh_is_zero_length(const ucp_mem_h memh) static UCS_F_ALWAYS_INLINE ucs_status_t ucp_memh_get(ucp_context_h context, void *address, size_t length, ucs_memory_type_t mem_type, ucp_md_map_t reg_md_map, - unsigned uct_flags, ucp_mem_h *memh_p) + unsigned uct_flags, ucp_rkey_h rkey, ucp_mem_h *memh_p) { ucs_rcache_region_t *rregion; ucs_status_t status; @@ -30,6 +30,23 @@ ucp_memh_get(ucp_context_h context, void *address, size_t length, return UCS_OK; } + if (rkey_buffer != NULL) { + /* Cache is not supported for shared mkeys */ + memh = ucs_malloc(sizeof(*memh) + context->num_mds * sizeof(memh->uct[0]), + "ucp_import_memh"); + if (memh == NULL) { + return UCS_ERR_NO_MEMORY; + } + + status = ucp_memh_import(context, rkey, address, length, memh); + if (status != UCS_OK) { + ucs_free(memh); + return status; + } + *memh_p = memh; + return UCS_OK; + } + if (ucs_likely(context->rcache != NULL)) { status = ucs_rcache_get(context->rcache, address, length, PROT_READ|PROT_WRITE, NULL, &rregion); diff --git a/src/ucp/core/ucp_request.inl b/src/ucp/core/ucp_request.inl index 1b8bdfb4a3a..7a49c862acc 100644 --- a/src/ucp/core/ucp_request.inl +++ b/src/ucp/core/ucp_request.inl @@ -604,8 +604,9 @@ ucp_request_init_dt_reg_from_memh(ucp_request_t *req, ucp_md_map_t md_map, ucs_assert((dt_reg == &req->send.state.dt.dt.contig) || (dt_reg == &req->recv.state.dt.contig)); - req->flags |= UCP_REQUEST_FLAG_USER_MEMH; - memh_index = 0; + req->flags |= UCP_REQUEST_FLAG_USER_MEMH; + memh_index = 0; + dt_reg->peer_id = memh->peer_id; 
ucs_for_each_bit(md_index, memh->md_map) { if (md_map & UCS_BIT(md_index)) { dt_reg->memh[memh_index++] = memh->uct[md_index]; diff --git a/src/ucp/core/ucp_rkey.c b/src/ucp/core/ucp_rkey.c index 91cd8d1ea87..99b1f1c82b5 100644 --- a/src/ucp/core/ucp_rkey.c +++ b/src/ucp/core/ucp_rkey.c @@ -56,6 +56,9 @@ size_t ucp_rkey_packed_size(ucp_context_h context, ucp_md_map_t md_map, size = sizeof(ucp_md_map_t); /* Memory domains map */ size += sizeof(uint8_t); /* Memory type */ + /* Always include shared key info for now */ + size += sizeof(uint32_t); /* Peer id */ + ucs_for_each_bit(md_index, md_map) { tl_rkey_size = context->tl_mds[md_index].attr.rkey_packed_size; ucs_assert_always(tl_rkey_size <= UINT8_MAX); @@ -70,6 +73,7 @@ size_t ucp_rkey_packed_size(ucp_context_h context, ucp_md_map_t md_map, size += ucs_popcount(sys_dev_map) * sizeof(ucp_rkey_packed_distance_t); } + return size; } @@ -121,7 +125,7 @@ ucp_rkey_pack_common(ucp_context_h context, ucp_md_map_t md_map, const uct_mem_h *memh, const ucp_memory_info_t *mem_info, ucp_sys_dev_map_t sys_dev_map, const ucs_sys_dev_distance_t *sys_distance, void *buffer, - int sparse_memh) + int sparse_memh, ucp_rsc_index_t peer_id) { void *p = buffer; unsigned md_index, uct_memh_index; @@ -135,13 +139,16 @@ ucp_rkey_pack_common(ucp_context_h context, ucp_md_map_t md_map, /* Check that md_map is valid */ ucs_assert(ucs_test_all_flags(UCS_MASK(context->num_mds), md_map)); - ucs_trace("packing rkey type %s md_map 0x%" PRIx64 "dev_map 0x%" PRIx64, - ucs_memory_type_names[mem_info->type], md_map, sys_dev_map); + ucs_trace("packing rkey type %s md_map 0x%" PRIx64 "dev_map 0x%" PRIx64 + "peer id 0x%x", + ucs_memory_type_names[mem_info->type], md_map, sys_dev_map, + peer_id); ucs_log_indent(1); UCS_STATIC_ASSERT(UCS_MEMORY_TYPE_LAST <= 255); - *ucs_serialize_next(&p, ucp_md_map_t) = md_map; - *ucs_serialize_next(&p, uint8_t) = mem_info->type; + *ucs_serialize_next(&p, ucp_md_map_t) = md_map; + *ucs_serialize_next(&p, uint8_t) = 
mem_info->type; + *ucs_serialize_next(&p, ucp_rsc_index_t) = peer_id; /* Write both size and rkey_buffer for each UCT rkey */ uct_memh_index = 0; @@ -189,14 +196,15 @@ ucp_rkey_pack_common(ucp_context_h context, ucp_md_map_t md_map, UCS_PROFILE_FUNC(ssize_t, ucp_rkey_pack_uct, (context, md_map, memh, mem_info, sys_dev_map, sys_distance, - buffer), + buffer, peer_id), ucp_context_h context, ucp_md_map_t md_map, const uct_mem_h *memh, const ucp_memory_info_t *mem_info, ucp_sys_dev_map_t sys_dev_map, - const ucs_sys_dev_distance_t *sys_distance, void *buffer) + const ucs_sys_dev_distance_t *sys_distance, void *buffer, + ucp_rsc_index_t peer_id) { return ucp_rkey_pack_common(context, md_map, memh, mem_info, - sys_dev_map, sys_distance, buffer, 0); + sys_dev_map, sys_distance, buffer, 0, peer_id); } UCS_PROFILE_FUNC(ssize_t, ucp_rkey_pack_memh, @@ -208,7 +216,8 @@ UCS_PROFILE_FUNC(ssize_t, ucp_rkey_pack_memh, const ucs_sys_dev_distance_t *sys_distance, void *buffer) { return ucp_rkey_pack_common(context, md_map, memh->uct, mem_info, - sys_dev_map, sys_distance, buffer, 1); + sys_dev_map, sys_distance, buffer, 1, + memh->peer_id); } ucs_status_t ucp_rkey_pack(ucp_context_h context, ucp_mem_h memh, @@ -245,6 +254,8 @@ ucs_status_t ucp_rkey_pack(ucp_context_h context, ucp_mem_h memh, mem_info.type = memh->mem_type; mem_info.sys_dev = UCS_SYS_DEVICE_ID_UNKNOWN; + + packed_size = ucp_rkey_pack_memh(context, memh->md_map, memh, &mem_info, 0, NULL, rkey_buffer); if (packed_size < 0) { @@ -402,6 +413,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_ep_rkey_unpack_internal, rkey->md_map = md_map; rkey->mem_type = *ucs_serialize_next(&p, const uint8_t); + rkey->peer_id = *ucs_serialize_next(&p, ucp_rsc_index_t); rkey->flags = flags; #if ENABLE_PARAMS_CHECK rkey->ep = ep; diff --git a/src/ucp/core/ucp_rkey.h b/src/ucp/core/ucp_rkey.h index 8345fd631ba..9d5d8c6d540 100644 --- a/src/ucp/core/ucp_rkey.h +++ b/src/ucp/core/ucp_rkey.h @@ -113,6 +113,7 @@ typedef struct ucp_rkey { ucp_ep_h ep; 
#endif ucp_md_map_t md_map; /* Which *remote* MDs have valid memory handles */ + ucp_rsc_index_t peer_id; /* Peer id if rkey is shared, NULL_RESOURCE otherwise */ ucp_tl_rkey_t tl_rkey[0]; /* UCT rkey for every remote MD */ } ucp_rkey_t; diff --git a/src/ucp/dt/datatype_iter.c b/src/ucp/dt/datatype_iter.c index 97f54a25e80..dcb89fc60cc 100644 --- a/src/ucp/dt/datatype_iter.c +++ b/src/ucp/dt/datatype_iter.c @@ -55,7 +55,7 @@ ucs_status_t ucp_datatype_iter_iov_mem_reg(ucp_context_h context, iov = ucp_datatype_iter_iov_at(dt_iter, iov_index); status = ucp_memh_get(context, iov->buffer, iov->length, dt_iter->mem_info.type, md_map, uct_flags, - &iov_memh[iov_index]); + NULL, &iov_memh[iov_index]); if (status != UCS_OK) { ucp_datatype_iter_iov_mem_dereg(context, dt_iter); return status; diff --git a/src/ucp/dt/datatype_iter.inl b/src/ucp/dt/datatype_iter.inl index 0216e90d25b..94171bcdc89 100644 --- a/src/ucp/dt/datatype_iter.inl +++ b/src/ucp/dt/datatype_iter.inl @@ -501,7 +501,7 @@ ucp_datatype_iter_mem_reg(ucp_context_h context, ucp_datatype_iter_t *dt_iter, return ucp_memh_get(context, dt_iter->type.contig.buffer, dt_iter->length, (ucs_memory_type_t)dt_iter->mem_info.type, md_map, - uct_flags, &dt_iter->type.contig.memh); + uct_flags, NULL, &dt_iter->type.contig.memh); } else if (ucp_datatype_iter_is_class(dt_iter, UCP_DATATYPE_IOV, dt_mask)) { return ucp_datatype_iter_iov_mem_reg(context, dt_iter, md_map, uct_flags); } else { diff --git a/src/ucp/dt/dt.h b/src/ucp/dt/dt.h index 50a3061ceed..cfd48fab920 100644 --- a/src/ucp/dt/dt.h +++ b/src/ucp/dt/dt.h @@ -28,6 +28,7 @@ typedef enum ucp_dt_type ucp_dt_class_t; typedef struct ucp_dt_reg { ucp_md_map_t md_map; /* Map of used memory domains */ uct_mem_h memh[UCP_MAX_OP_MDS]; + ucp_rsc_index_t peer_id; } ucp_dt_reg_t; diff --git a/src/ucp/rndv/rndv.c b/src/ucp/rndv/rndv.c index 287fa61b1b3..492cda5e45a 100644 --- a/src/ucp/rndv/rndv.c +++ b/src/ucp/rndv/rndv.c @@ -158,7 +158,7 @@ size_t ucp_rndv_rts_pack(ucp_request_t 
*sreq, ucp_rndv_rts_hdr_t *rndv_rts_hdr, packed_rkey_size = ucp_rkey_pack_uct( worker->context, sreq->send.state.dt.dt.contig.md_map, sreq->send.state.dt.dt.contig.memh, &mem_info, 0, NULL, - rkey_buf); + rkey_buf, sreq->send.state.dt.dt.contig.peer_id); if (packed_rkey_size < 0) { ucs_fatal("failed to pack rendezvous remote key: %s", ucs_status_string((ucs_status_t)packed_rkey_size)); @@ -200,7 +200,8 @@ static size_t ucp_rndv_rtr_pack(void *dest, void *arg) rreq->recv.state.dt.contig.md_map, rreq->recv.state.dt.contig.memh, &mem_info, 0, NULL, - rndv_rtr_hdr + 1); + rndv_rtr_hdr + 1, + rreq->recv.state.dt.contig.peer_id); if (packed_rkey_size < 0) { return packed_rkey_size; } diff --git a/test/gtest/ucp/test_ucp_am.cc b/test/gtest/ucp/test_ucp_am.cc index 0b68db8faec..6ddd6a0a6a9 100644 --- a/test/gtest/ucp/test_ucp_am.cc +++ b/test/gtest/ucp/test_ucp_am.cc @@ -1283,6 +1283,59 @@ class test_ucp_am_nbx_rndv : public test_ucp_am_nbx { return UCS_INPROGRESS; } + ucp_mem_h alloc_memhs(size_t length, ucp_mem_h *exp_memh, ucp_mem_h *imp_memh) + { + ucp_memh memh; + ucp_mem_map_params_t mparams; + mparams.field_mask = UCP_MEM_MAP_PARAM_FIELD_LENGTH | + UCP_MEM_MAP_PARAM_FIELD_FLAGS; + mparams.address = NULL; + mparams.length = length; + mparams.flags = UCP_MEM_MAP_ALLOCATE | UCP_MEM_MAP_SHARED; + ASSERT_UCS_OK(ucp_mem_map(sender().ucph(), &mparams, &memh)); + + // Get address and length of the allocated buffer + ucp_mem_attr_t attr; + attr.field_mask = UCP_MEM_ATTR_FIELD_ADDRESS | UCP_MEM_ATTR_FIELD_LENGTH; + ucs_status_t status = ucp_mem_query(memh, &attr); + if (status != UCS_OK) { + ucp_mem_unmap(sender().ucph(), *memh_p); + ASSERT_TRUE(false); + } + ASSERT_GE(attr.length, length); + *exp_memh = memh; + + // Pack and unpack the key emulating that it is traversing thru the network + void *rkey_buf; + size_t rkey_buf_size; + ASSERT_UCS_OK(ucp_rkey_pack(sender().ucph(), memh, &rkey_buf, + &rkey_buf_size)); + + ASSERT_UCS_OK(ucp_rkey_unpack(sender().ep(), rkey_buf, 
&mparams.rkey)); + + ucp_rkey_buffer_release(rkey_buf); + + mparams.field_mask = UCP_MEM_MAP_PARAM_FIELD_ADDRESS | + UCP_MEM_MAP_PARAM_FIELD_LENGTH | + UCP_MEM_MAP_PARAM_FIELD_RKEY; + mparams.address = attr.address; + mparams.length = attr.length; + ASSERT_UCS_OK(ucp_mem_map(sender().ucph(), &mparams, &memh)); + + // Should be safe to destroy rkey now + ucp_rkey_destroy(mparams.rkey); + + *imp_memh = memh; + return attr.address; + } + + void free_memhs(ucp_context_h context, ucp_mem_h exp_memh, + ucp_mem_h imp_memh) + { + ASSERT_UCS_OK(ucp_mem_unmap(context, exp_memh)); + ASSERT_UCS_OK(ucp_mem_unmap(context, imp_memh)); + } + static ucs_status_t am_data_reject_rndv_cb(void *arg, const void *header, size_t header_length, void *data, size_t length, @@ -1328,7 +1381,44 @@ class test_ucp_am_nbx_rndv : public test_ucp_am_nbx { return UCS_OK; } + static void am_data_rndv_recv_cb(void *request, ucs_status_t status, + size_t length, void *user_data) + { + test_ucp_am_nbx_rndv *self = reinterpret_cast(user_data); + + EXPECT_UCS_OK(status); + self->m_am_received = true; + + free_memhs(receiver().ucp(), m_rx_memh, m_imp_memh); + } + + static ucs_status_t am_data_rx_shared_mkey_rndv_cb( + void *arg, const void *header, size_t header_length, + void *data, size_t length, + const ucp_am_recv_param_t *param) + { + test_ucp_am_nbx_rndv *self = reinterpret_cast(arg); + + EXPECT_FALSE(self->m_am_received); + + void *address = allocate_imported_mkey(length, &m_rx_memh, &m_imp_memh); + + ucp_request_param_t param; + param.op_attr_mask = UCP_OP_ATTR_FIELD_MEMH | + UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FLAG_NO_IMM_CMPL; + param.memh = memh; + params.cb.recv_am = am_data_recv_cb; + ucs_status_ptr_t rptr = ucp_am_recv_data_nbx(receiver().worker(), + data_desc, + address, length, ¶m); + ucp_request_release(rptr); + + return UCS_INPROGRESS; + } + ucs_status_t m_status; + ucp_mem_h m_imp_memh; }; UCS_TEST_P(test_ucp_am_nbx_rndv, rndv_auto, "RNDV_SCHEME=auto") @@ -1429,6 +1519,29 @@ 
UCS_TEST_P(test_ucp_am_nbx_rndv, reject_rndv) } } +UCS_TEST_P(test_ucp_am_nbx_rndv, shared_mkey) +{ + skip_loopback(); + + set_am_data_handler(receiver(), TEST_AM_NBX_ID, am_data_rx_shared_mkey_rndv_cb, + this); + ucp_mem_h exp_memh, imp_memh; + m_am_received = false; + size_t length = 512 * UCS_KBYTE; + void *address = alloc_memhs(length, &exp_memh, &imp_memh); + + ucp_request_param_t param; + param.op_attr_mask = UCP_OP_ATTR_FIELD_MEMH; + param.memh = imp_memh; + ucs_status_ptr_t sptr = ucp_am_send_nbx(sender().ep(), TEST_AM_NBX_ID, + NULL, 0ul, address, length, ¶m); + + EXPECT_EQ(m_status, request_wait(sptr)); + EXPECT_TRUE(m_am_received); + + free_memhs(sender().ucph(), exp_memh, imp_memh); +} + UCS_TEST_P(test_ucp_am_nbx_rndv, deferred_reject_rndv) { skip_loopback(); From 47d609018dbaf307eab4b07b4330da1013fe6de2 Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Mon, 21 Feb 2022 16:41:39 +0000 Subject: [PATCH 06/27] WIP3 --- src/uct/ib/base/ib_md.h | 1 + src/uct/ib/mlx5/dv/ib_mlx5dv_md.c | 20 ++++++++++++-------- test/apps/Makefile.am | 7 +++++++ test/gtest/uct/test_md.cc | 10 +++++++--- 4 files changed, 27 insertions(+), 11 deletions(-) diff --git a/src/uct/ib/base/ib_md.h b/src/uct/ib/base/ib_md.h index 28e00512090..6fd7c109027 100644 --- a/src/uct/ib/base/ib_md.h +++ b/src/uct/ib/base/ib_md.h @@ -137,6 +137,7 @@ typedef struct uct_ib_md { size_t memh_struct_size; uint64_t reg_mem_types; uint64_t extra_cap_flags; + int vhca_id; } uct_ib_md_t; diff --git a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c index 856bdf3e165..45bd126c840 100644 --- a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c +++ b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c @@ -621,7 +621,6 @@ static ucs_status_t uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device, struct ibv_context *ctx; uct_ib_device_t *dev; uct_ib_mlx5_md_t *md; - int vhca_id; void *cap; int ret; @@ -745,7 +744,7 @@ static ucs_status_t uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device, md->flags |= 
UCT_IB_MLX5_MD_FLAG_MP_XRQ_FIRST_MSG; } - vhca_id = UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, vhca_id); + md->super.vhca_id = UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, vhca_id); // ucs_warn("%s: vhca_id is %d. at b4h: 0x%x; 0x%x", uct_ib_device_name(dev), // UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, vhca_id), // ntohl(*(uint32_t*)UCS_PTR_BYTE_OFFSET(cap, 0xB4)), @@ -754,12 +753,12 @@ static ucs_status_t uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device, if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, crossing_vhca_mkey)) { - ucs_print("%s: vhca_id=%d crossing_vhca_mkey is supported", - uct_ib_device_name(dev), vhca_id); + ucs_info("%s: vhca_id=%d crossing_vhca_mkey is supported", + uct_ib_device_name(dev), md->super.vhca_id); md->super.extra_cap_flags |= UCT_MD_FLAG_SHARED_RKEY; } else { - ucs_print("%s: vhca_id=%d crossing_vhca_mkey is not supported", - uct_ib_device_name(dev), vhca_id); + ucs_info("%s: vhca_id=%d crossing_vhca_mkey is not supported", + uct_ib_device_name(dev), md->super.vhca_id); } status = uct_ib_mlx5_devx_check_odp(md, md_config, cap); @@ -876,8 +875,7 @@ uct_ib_mlx5_devx_reg_crossed_key(uct_ib_md_t *ib_md, void *address, void *mkc; ucs_status_t status; - // TODO: check if access flag needs to be 7, 0, UCT_IB_MEM_ACCESS_FLAGS - ucs_warn("ume_reg crosses address=%p length=%zu", address, length); + ucs_print("umr_reg crosses address=%p length=%zu", address, length); mem = mlx5dv_devx_umem_reg(md->super.dev.ibv_context, address, length, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE); @@ -923,6 +921,8 @@ uct_ib_mlx5_devx_reg_crossed_key(uct_ib_md_t *ib_md, void *address, ((intptr_t)address & 0xff); memh->super.rkey = memh->super.lkey; + ucs_print("crossed mkey is %x", memh->super.lkey); + status = UCS_OK; err_free: @@ -951,6 +951,8 @@ uct_ib_mlx5_devx_reg_crossing_key(uct_ib_md_t *ib_md, void *address, dv.pd.out = &dvpd; mlx5dv_init_obj(&dv, MLX5DV_OBJ_PD); + ucs_print("reg_crossin address=%p", address); + mkc = 
UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); UCT_IB_MLX5DV_SET(create_mkey_in, in, opcode, UCT_IB_MLX5_CMD_OP_CREATE_MKEY); @@ -984,6 +986,8 @@ uct_ib_mlx5_devx_reg_crossing_key(uct_ib_md_t *ib_md, void *address, ((intptr_t)address & 0xff); memh->super.rkey = memh->super.lkey; + ucs_print("crossing mkey is %x", memh->super.lkey); + status = UCS_OK; err_out: diff --git a/test/apps/Makefile.am b/test/apps/Makefile.am index 6872042f249..d9c38d8e39e 100644 --- a/test/apps/Makefile.am +++ b/test/apps/Makefile.am @@ -64,6 +64,13 @@ test_init_mt_CPPFLAGS = $(BASE_CPPFLAGS) test_init_mt_CFLAGS = $(BASE_CFLAGS) $(OPENMP_CFLAGS) test_init_mt_LDADD = $(top_builddir)/src/ucp/libucp.la + +test_uct_xgvmi_SOURCES = test_uct_xgvmi.c +test_uct_xgvmi_CPPFLAGS = $(BASE_CPPFLAGS) +test_uct_xgvmi_CFLAGS = $(BASE_CFLAGS) +test_uct_xgvmi_LDADD = $(top_builddir)/src/ucp/libuct.la + + if HAVE_CUDA noinst_PROGRAMS += test_cuda_hook_dynamic diff --git a/test/gtest/uct/test_md.cc b/test/gtest/uct/test_md.cc index b29b16d2fc5..2c08cefdc81 100644 --- a/test/gtest/uct/test_md.cc +++ b/test/gtest/uct/test_md.cc @@ -753,7 +753,9 @@ UCS_TEST_SKIP_COND_P(test_md, dereg_bad_arg, EXPECT_UCS_OK(status); free(ptr); } - +extern "C" { +#include +} // TODO check MD cap flag UCS_TEST_P(test_md, shared_rkey) { @@ -765,10 +767,12 @@ UCS_TEST_P(test_md, shared_rkey) int ret = ucs_posix_memalign(&ptr, ucs_get_page_size(), size, "shared_buf"); ASSERT_EQ(0, ret); + int vhca_id = ((uct_ib_md_t*)md())->vhca_id; + uct_md_mem_reg_shared_params_t reg_shared_params; reg_shared_params.address = ptr; reg_shared_params.length = size; - reg_shared_params.dest_gvmi = 0; + reg_shared_params.dest_gvmi = vhca_id; status = uct_md_mem_reg_shared(md(), ®_shared_params, &memh); ASSERT_UCS_OK(status); @@ -789,7 +793,7 @@ UCS_TEST_P(test_md, shared_rkey) uct_md_import_shared_rkey_params_t import_params; import_params.rkey = rkey_bundle.rkey; - import_params.source_gvmi = 0; // TODO + import_params.source_gvmi = 
vhca_id; // TODO uct_mem_h imported_memh; status = uct_md_import_shared_rkey(md(), &import_params, &imported_memh); From 6525191f2053178762959115a8611046f725ca60 Mon Sep 17 00:00:00 2001 From: Mikhail Brinskii Date: Mon, 21 Feb 2022 19:20:10 +0200 Subject: [PATCH 07/27] UCP: Cleanup using UCT API --- src/ucp/api/ucp.h | 2 +- src/ucp/core/ucp_mm.c | 44 ++++++++++++++++++++++++++---------- src/ucp/core/ucp_mm.h | 2 +- src/ucp/core/ucp_mm.inl | 15 ++++-------- src/ucp/dt/datatype_iter.c | 2 +- src/ucp/dt/datatype_iter.inl | 3 ++- 6 files changed, 41 insertions(+), 27 deletions(-) diff --git a/src/ucp/api/ucp.h b/src/ucp/api/ucp.h index 87a5a8f95b6..a8913ff0d85 100644 --- a/src/ucp/api/ucp.h +++ b/src/ucp/api/ucp.h @@ -1592,7 +1592,7 @@ typedef struct ucp_mem_map_params { ucs_memory_type_t memory_type; /* Id of the peer to create a shared memh for */ - uint32_t peer_id; + uint8_t peer_id; ucp_rkey_h rkey; } ucp_mem_map_params_t; diff --git a/src/ucp/core/ucp_mm.c b/src/ucp/core/ucp_mm.c index 1eed06c3158..aadb4c31439 100644 --- a/src/ucp/core/ucp_mm.c +++ b/src/ucp/core/ucp_mm.c @@ -344,7 +344,8 @@ static ucs_status_t ucp_memh_register(ucp_context_h context, ucp_mem_h memh, ucp_md_map_t md_map, void *address, size_t length, unsigned uct_flags) { - ucp_md_map_t md_map_registered = 0; + ucp_md_map_t md_map_registered = 0; + uct_md_mem_reg_shared_params_t reg_shared_params = {}; uct_md_attr_t *md_attr; ucs_log_level_t log_level; ucp_md_index_t md_index; @@ -354,10 +355,15 @@ static ucs_status_t ucp_memh_register(ucp_context_h context, ucp_mem_h memh, md_attr = &context->tl_mds[md_index].attr; if (uct_flags & UCT_MD_MEM_FLAG_SHARED_MEMH) { + ucs_assert_always(memh->peer_id != UCP_NULL_RESOURCE); + reg_shared_params.address = address; + reg_shared_params.length = length; + reg_shared_params.dest_gvmi = memh->peer_id; status = uct_md_mem_reg_shared(context->tl_mds[md_index].md, - address, length, uct_flags, peer_id, + ®_shared_params, &memh->uct[md_index]); } else { + 
ucs_assert_always(memh->peer_id == UCP_NULL_RESOURCE); status = uct_md_mem_reg(context->tl_mds[md_index].md, address, length, uct_flags, &memh->uct[md_index]); @@ -383,7 +389,7 @@ static ucs_status_t ucp_memh_register(ucp_context_h context, ucp_mem_h memh, ucs_status_t ucp_memh_get_slow(ucp_context_h context, void *address, size_t length, ucs_memory_type_t mem_type, ucp_md_map_t reg_md_map, - unsigned uct_flags, ucp_mem_h *memh_p) + unsigned uct_flags, ucp_rsc_index_t peer_id, ucp_mem_h *memh_p) { ucs_rcache_region_t *rregion; void *reg_address; @@ -426,6 +432,7 @@ ucp_memh_get_slow(ucp_context_h context, void *address, size_t length, } memh->mem_type = mem_type; + memh->peer_id = peer_id; status = ucp_memh_register(context, memh, ~memh->md_map & reg_md_map, reg_address, reg_length, @@ -440,7 +447,8 @@ ucp_memh_get_slow(ucp_context_h context, void *address, size_t length, static ucs_status_t ucp_memh_alloc(ucp_context_h context, void *address, size_t length, ucs_memory_type_t memory_type, unsigned uct_flags, - const char *alloc_name, ucp_rkey_h rkey, ucp_mem_h *memh_p) + const char *alloc_name, ucp_rkey_h rkey, ucp_rsc_index_t peer_id, + ucp_mem_h *memh_p) { ucp_md_map_t reg_md_map = context->reg_md_map[memory_type]; ucp_md_index_t alloc_md_index = UCP_NULL_RESOURCE; @@ -469,7 +477,8 @@ ucp_memh_alloc(ucp_context_h context, void *address, size_t length, if (rkey == NULL) { status = ucp_memh_get_slow(context, mem.address, mem.length, - mem.mem_type, reg_md_map, uct_flags, &memh); + mem.mem_type, reg_md_map, uct_flags, + peer_id, &memh); } else { status = ucp_memh_import(context, rkey, address, length, memh); } @@ -498,28 +507,38 @@ ucp_memh_alloc(ucp_context_h context, void *address, size_t length, } static ucs_status_t ucp_memh_import(ucp_context_h context, ucp_rkey_h rkey, - address, length, ucp_memh_h memh) + address, length, ucp_memh_h *memh_p) { ucp_md_map_t md_map_registered = 0; + uct_md_import_shared_rkey_params_t import_params = {}; + ucp_mem_h memh; 
uct_md_attr_t *md_attr; ucp_md_index_t md_index; ucs_status_t status; ucs_assert_always(rkey->peer_id != UCP_NULL_RESOURCE); + memh = ucs_malloc(sizeof(*memh) + context->num_mds * sizeof(memh->uct[0]), + "ucp_import_memh"); + if (memh == NULL) { + return UCS_ERR_NO_MEMORY; + } + ucs_for_each_bit(md_index, rkey->md_map) { md_attr = &context->tl_mds[md_index].attr; ucs_assert_always(md_attr->cap.flags & UCT_MD_FLAG_SHARED_MEMH); + import_params.rkey = rkey->tl_rkey[md_index]; + import_params.source_gvmi = rkey->peer_id; - status = uct_md_memh_import(context->tl_mds[md_index].md, - address, length, rkey->peer_id, - rkey->tl_rkey[md_index], - &memh->uct[md_index]); + status = uct_md_import_shared_rkey(context->tl_mds[md_index].md, + &import_params, + &memh->uct[md_index]); if (ucs_unlikely(status != UCS_OK)) { ucp_memh_cleanup(context, memh, md_map_registered, address, length, md_index, 0, status); + ucs_free(memh); return status; } @@ -531,6 +550,7 @@ static ucs_status_t ucp_memh_import(ucp_context_h context, ucp_rkey_h rkey, } memh->md_map |= md_map_registered; + *memh_p = memh; return UCS_OK; } @@ -631,12 +651,12 @@ ucs_status_t ucp_mem_map(ucp_context_h context, const ucp_mem_map_params_t *para if (ucp_mem_map_is_allocate(params)) { status = ucp_memh_alloc(context, address, params->length, memory_type, ucp_mem_map_params2uct_flags(params), - "user memory", rkey, memh_p); + "user memory", rkey, params->peer_id, memh_p); } else { status = ucp_memh_get(context, address, params->length, memory_type, context->reg_md_map[memory_type], ucp_mem_map_params2uct_flags(params), - rkey, memh_p); + rkey, params->peer_id, memh_p); } out: diff --git a/src/ucp/core/ucp_mm.h b/src/ucp/core/ucp_mm.h index dfe81b29bb5..346ed9d795f 100644 --- a/src/ucp/core/ucp_mm.h +++ b/src/ucp/core/ucp_mm.h @@ -133,7 +133,7 @@ void ucp_mem_type_unreg_buffers(ucp_worker_h worker, ucs_memory_type_t mem_type, ucs_status_t ucp_memh_get_slow(ucp_context_h context, void *address, size_t length, 
ucs_memory_type_t mem_type, ucp_md_map_t reg_md_map, unsigned uct_flags, - ucp_mem_h *memh_p); + ucp_rsc_index_t peer_id, ucp_mem_h *memh_p); void ucp_memh_dereg(ucp_context_h context, ucp_mem_h memh, ucp_md_map_t md_map); diff --git a/src/ucp/core/ucp_mm.inl b/src/ucp/core/ucp_mm.inl index 2ad8cbb61ae..09194063b78 100644 --- a/src/ucp/core/ucp_mm.inl +++ b/src/ucp/core/ucp_mm.inl @@ -19,7 +19,8 @@ ucp_memh_is_zero_length(const ucp_mem_h memh) static UCS_F_ALWAYS_INLINE ucs_status_t ucp_memh_get(ucp_context_h context, void *address, size_t length, ucs_memory_type_t mem_type, ucp_md_map_t reg_md_map, - unsigned uct_flags, ucp_rkey_h rkey, ucp_mem_h *memh_p) + unsigned uct_flags, ucp_rkey_h rkey, ucp_rsc_index_t peer_id, + ucp_mem_h *memh_p) { ucs_rcache_region_t *rregion; ucs_status_t status; @@ -32,18 +33,10 @@ ucp_memh_get(ucp_context_h context, void *address, size_t length, if (rkey_buffer != NULL) { /* Cache is not supported for shared mkeys */ - memh = ucs_malloc(sizeof(*memh) + context->num_mds * sizeof(memh->uct[0]), - "ucp_import_memh"); - if (memh == NULL) { - return UCS_ERR_NO_MEMORY; - } - - status = ucp_memh_import(context, rkey, address, length, memh); + status = ucp_memh_import(context, rkey, address, length, memh_p); if (status != UCS_OK) { - ucs_free(memh); return status; } - *memh_p = memh; return UCS_OK; } @@ -65,7 +58,7 @@ ucp_memh_get(ucp_context_h context, void *address, size_t length, } return ucp_memh_get_slow(context, address, length, mem_type, reg_md_map, - uct_flags, memh_p); + uct_flags, peer_id, memh_p); } static UCS_F_ALWAYS_INLINE void diff --git a/src/ucp/dt/datatype_iter.c b/src/ucp/dt/datatype_iter.c index dcb89fc60cc..f74e2ef2ffd 100644 --- a/src/ucp/dt/datatype_iter.c +++ b/src/ucp/dt/datatype_iter.c @@ -55,7 +55,7 @@ ucs_status_t ucp_datatype_iter_iov_mem_reg(ucp_context_h context, iov = ucp_datatype_iter_iov_at(dt_iter, iov_index); status = ucp_memh_get(context, iov->buffer, iov->length, dt_iter->mem_info.type, md_map, uct_flags, 
- NULL, &iov_memh[iov_index]); + NULL, UCP_NULL_RESOURCE, &iov_memh[iov_index]); if (status != UCS_OK) { ucp_datatype_iter_iov_mem_dereg(context, dt_iter); return status; diff --git a/src/ucp/dt/datatype_iter.inl b/src/ucp/dt/datatype_iter.inl index 94171bcdc89..77b1c132725 100644 --- a/src/ucp/dt/datatype_iter.inl +++ b/src/ucp/dt/datatype_iter.inl @@ -501,7 +501,8 @@ ucp_datatype_iter_mem_reg(ucp_context_h context, ucp_datatype_iter_t *dt_iter, return ucp_memh_get(context, dt_iter->type.contig.buffer, dt_iter->length, (ucs_memory_type_t)dt_iter->mem_info.type, md_map, - uct_flags, NULL, &dt_iter->type.contig.memh); + uct_flags, NULL, UCP_NULL_RESOURCE, + &dt_iter->type.contig.memh); } else if (ucp_datatype_iter_is_class(dt_iter, UCP_DATATYPE_IOV, dt_mask)) { return ucp_datatype_iter_iov_mem_reg(context, dt_iter, md_map, uct_flags); } else { From 41ac42390eb2d1b02853adec3e0e32daaa587235 Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Mon, 21 Feb 2022 19:42:19 +0200 Subject: [PATCH 08/27] WIP4 --- test/apps/Makefile.am | 5 +- test/apps/test_uct_xgvmi.c | 205 +++++++++++++++++++++++++++++++++++++ 2 files changed, 208 insertions(+), 2 deletions(-) create mode 100644 test/apps/test_uct_xgvmi.c diff --git a/test/apps/Makefile.am b/test/apps/Makefile.am index d9c38d8e39e..0e0b8f1f1be 100644 --- a/test/apps/Makefile.am +++ b/test/apps/Makefile.am @@ -21,7 +21,8 @@ noinst_PROGRAMS = \ test_link_map \ test_dlopen_cfg_print \ test_init_mt \ - test_memtrack_limit + test_memtrack_limit \ + test_uct_xgvmi objdir = $(shell sed -n -e 's/^objdir=\(.*\)$$/\1/p' $(LIBTOOL)) @@ -68,7 +69,7 @@ test_init_mt_LDADD = $(top_builddir)/src/ucp/libucp.la test_uct_xgvmi_SOURCES = test_uct_xgvmi.c test_uct_xgvmi_CPPFLAGS = $(BASE_CPPFLAGS) test_uct_xgvmi_CFLAGS = $(BASE_CFLAGS) -test_uct_xgvmi_LDADD = $(top_builddir)/src/ucp/libuct.la +test_uct_xgvmi_LDADD = $(top_builddir)/src/uct/libuct.la if HAVE_CUDA diff --git a/test/apps/test_uct_xgvmi.c b/test/apps/test_uct_xgvmi.c new file mode 
100644 index 00000000000..6a864a3bd08 --- /dev/null +++ b/test/apps/test_uct_xgvmi.c @@ -0,0 +1,205 @@ +/** + * Copyright (C) 2022 NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#include +#include +#include +#include +#include +#include +#include + + +#define CHKERR_ACTION(_cond, _msg, _action) \ + do { \ + if (_cond) { \ + fprintf(stderr, "Failed to %s\n", _msg); \ + _action; \ + } \ + } while (0) + + +#define CHKERR_JUMP(_cond, _msg, _label) CHKERR_ACTION(_cond, _msg, goto _label) + +typedef struct { + char *server_name; + uint16_t server_port; + const char *md_name; + int gvmi_id; + size_t size; + uint32_t mkey; +} cmd_args_t; + + +/* Device and transport to be used are determined by minimum latency */ +static ucs_status_t open_md(const cmd_args_t *cmd_args, uct_md_h *md_p, + uct_component_h *component_p) +{ + uct_component_h *components; + unsigned num_components; + unsigned cmpt_index; + uct_component_attr_t component_attr; + unsigned md_index; + uct_md_config_t *md_config; + ucs_status_t status; + uct_md_h md; + + status = uct_query_components(&components, &num_components); + CHKERR_JUMP(UCS_OK != status, "query for components", error_ret); + + for (cmpt_index = 0; cmpt_index < num_components; ++cmpt_index) { + component_attr.field_mask = UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT; + status = uct_component_query(components[cmpt_index], &component_attr); + CHKERR_JUMP(UCS_OK != status, "query component attributes", + release_component_list); + + component_attr.field_mask = UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES; + component_attr.md_resources = alloca(sizeof(*component_attr.md_resources) * + component_attr.md_resource_count); + status = uct_component_query(components[cmpt_index], &component_attr); + CHKERR_JUMP(UCS_OK != status, "query for memory domain resources", + release_component_list); + + /* Iterate through memory domain resources */ + for (md_index = 0; md_index < component_attr.md_resource_count; 
++md_index) { + status = uct_md_config_read(components[cmpt_index], NULL, NULL, + &md_config); + CHKERR_JUMP(UCS_OK != status, "read MD config", + release_component_list); + + if (strcmp(component_attr.md_resources[md_index].md_name, + cmd_args->md_name)) { + continue; + } + + status = uct_md_open(components[cmpt_index], + component_attr.md_resources[md_index].md_name, + md_config, &md); + uct_config_release(md_config); + + CHKERR_JUMP(UCS_OK != status, "open memory domains", + release_component_list); + + *md_p = md; + *component_p = components[cmpt_index]; + return UCS_OK; + } + } + + status = UCS_ERR_NO_DEVICE; + +release_component_list: + uct_release_component_list(components); +error_ret: + return status; +} + +void do_export(uct_md_h md, uct_component_h component, + const cmd_args_t *cmd_args) +{ + uct_md_mem_reg_shared_params_t reg_shared_params; + uct_rkey_bundle_t rkey_bundle; + uint8_t rkey_buf[1024]; + ucs_status_t status; + uct_mem_h memh; + void *ptr; + int ret; + + ret = posix_memalign(&ptr, 65536, cmd_args->size); + CHKERR_JUMP(0 != ret, "allocate memory", error_ret); + + reg_shared_params.address = ptr; + reg_shared_params.length = cmd_args->size; + reg_shared_params.dest_gvmi = cmd_args->gvmi_id; + + status = uct_md_mem_reg_shared(md, ®_shared_params, &memh); + CHKERR_JUMP(UCS_OK != status, "uct_md_mem_reg_shared", error_ret); + + status = uct_md_mkey_pack(md, memh, rkey_buf); + CHKERR_JUMP(UCS_OK != status, "uct_md_mkey_pack", error_ret); + + status = uct_rkey_unpack(component, rkey_buf, &rkey_bundle); + CHKERR_JUMP(UCS_OK != status, "uct_rkey_unpack", error_ret); + + printf("shared rkey 0x%x for gvmi %d\n", (uint32_t)rkey_bundle.rkey, + cmd_args->gvmi_id); + printf("press any key to continue\n"); + getchar(); + +error_ret: + ; +} + +void do_import(uct_md_h md, uct_component_h component, + const cmd_args_t *cmd_args) +{ + uct_md_import_shared_rkey_params_t import_params; + ucs_status_t status; + uct_mem_h memh; + + printf("unpacking mkey 0x%x on 
gvmi %d\n", cmd_args->mkey, + cmd_args->gvmi_id); + + import_params.rkey = cmd_args->mkey; + import_params.source_gvmi = cmd_args->gvmi_id; // TODO + + status = uct_md_import_shared_rkey(md, &import_params, &memh); + CHKERR_JUMP(UCS_OK != status, "uct_md_import_shared_rkey", error_ret); + + printf("imported shared rkey memh=%p\n", memh); + +error_ret: + ; +} + + +int main(int argc, char** argv) +{ + uct_component_h component; + ucs_status_t status; + uct_md_h md; + cmd_args_t args; + int c; + + args.md_name = "mlx5_0"; + args.gvmi_id = 0; + args.mkey = 0; + args.size = 1024 * 1024; + + while ((c = getopt(argc, argv, "d:g:i:")) != -1) { + switch (c) { + case 'd': + args.md_name = optarg; + break; + case 'g': + args.gvmi_id = atoi(optarg); + break; + case 'i': + args.mkey = strtol(optarg, NULL, 0); + break; + default: + printf("Usage: %s [-d ] [-g ] [-i ]\n", + argv[0]); + return -1; + } + } + + status = open_md(&args, &md, &component); + if (status != UCS_OK) { + printf("could not open md\n"); + return -2; + } + + if (args.mkey) { + do_import(md, component, &args); + } else { + do_export(md, component, &args); + } + + uct_md_close(md); + + return 0; +} \ No newline at end of file From 1c08b9af9136e67d275d10e45d9627bf2f872f49 Mon Sep 17 00:00:00 2001 From: Mikhail Brinskii Date: Mon, 21 Feb 2022 20:20:59 +0200 Subject: [PATCH 09/27] UCP: compilation fixes --- src/ucp/core/ucp_mm.c | 31 +++++++++++------------ src/ucp/core/ucp_mm.h | 3 +++ src/ucp/core/ucp_mm.inl | 2 +- src/ucp/core/ucp_rkey.h | 3 ++- test/gtest/ucp/test_ucp_am.cc | 47 +++++++++++++++++++---------------- 5 files changed, 47 insertions(+), 39 deletions(-) diff --git a/src/ucp/core/ucp_mm.c b/src/ucp/core/ucp_mm.c index aadb4c31439..40e41a0e16a 100644 --- a/src/ucp/core/ucp_mm.c +++ b/src/ucp/core/ucp_mm.c @@ -274,7 +274,7 @@ ucp_mem_map_params2uct_flags(const ucp_mem_map_params_t *params) } if (params->flags & UCP_MEM_MAP_SHARED) { - flags |= UCT_MD_MEM_FLAG_SHARED_MEMH; + flags |= 
UCT_MD_FLAG_SHARED_RKEY; } } @@ -316,7 +316,7 @@ static void ucp_memh_cleanup(ucp_context_h context, ucp_mem_h memh, size_t length, ucp_md_index_t md_index, unsigned uct_flags, ucs_status_t status) { - int shared_memh = uct_flags & UCT_MD_MEM_FLAG_SHARED_MEMH; + int shared_memh = uct_flags & UCT_MD_FLAG_SHARED_RKEY; ucs_log_level_t log_level; log_level = (uct_flags & UCT_MD_MEM_FLAG_HIDE_ERRORS) ? @@ -335,9 +335,6 @@ static void ucp_memh_cleanup(ucp_context_h context, ucp_mem_h memh, } else { ucs_free(memh); } - - return status; - } static ucs_status_t ucp_memh_register(ucp_context_h context, ucp_mem_h memh, @@ -347,15 +344,18 @@ static ucs_status_t ucp_memh_register(ucp_context_h context, ucp_mem_h memh, ucp_md_map_t md_map_registered = 0; uct_md_mem_reg_shared_params_t reg_shared_params = {}; uct_md_attr_t *md_attr; - ucs_log_level_t log_level; ucp_md_index_t md_index; ucs_status_t status; ucs_for_each_bit(md_index, md_map) { md_attr = &context->tl_mds[md_index].attr; - if (uct_flags & UCT_MD_MEM_FLAG_SHARED_MEMH) { + if (uct_flags & UCT_MD_FLAG_SHARED_RKEY) { ucs_assert_always(memh->peer_id != UCP_NULL_RESOURCE); + + if (!(md_attr->cap.flags & UCT_MD_FLAG_SHARED_RKEY)) { + continue; + } reg_shared_params.address = address; reg_shared_params.length = length; reg_shared_params.dest_gvmi = memh->peer_id; @@ -418,7 +418,6 @@ ucp_memh_get_slow(ucp_context_h context, void *address, size_t length, memh->super.super.end = (uintptr_t)reg_address + reg_length; memh->alloc_md_index = UCP_NULL_RESOURCE; memh->alloc_method = UCT_ALLOC_METHOD_LAST; - memh->flags = 0; } else { status = ucs_rcache_get(context->rcache, reg_address, reg_length, PROT_READ|PROT_WRITE, NULL, &rregion); @@ -480,7 +479,7 @@ ucp_memh_alloc(ucp_context_h context, void *address, size_t length, mem.mem_type, reg_md_map, uct_flags, peer_id, &memh); } else { - status = ucp_memh_import(context, rkey, address, length, memh); + status = ucp_memh_import(context, rkey, address, length, &memh); } if (status != 
UCS_OK) { @@ -506,8 +505,8 @@ ucp_memh_alloc(ucp_context_h context, void *address, size_t length, return status; } -static ucs_status_t ucp_memh_import(ucp_context_h context, ucp_rkey_h rkey, - address, length, ucp_memh_h *memh_p) +ucs_status_t ucp_memh_import(ucp_context_h context, ucp_rkey_h rkey, + void *address, size_t length, ucp_mem_h *memh_p) { ucp_md_map_t md_map_registered = 0; uct_md_import_shared_rkey_params_t import_params = {}; @@ -527,8 +526,8 @@ static ucs_status_t ucp_memh_import(ucp_context_h context, ucp_rkey_h rkey, ucs_for_each_bit(md_index, rkey->md_map) { md_attr = &context->tl_mds[md_index].attr; - ucs_assert_always(md_attr->cap.flags & UCT_MD_FLAG_SHARED_MEMH); - import_params.rkey = rkey->tl_rkey[md_index]; + ucs_assert_always(md_attr->cap.flags & UCT_MD_FLAG_SHARED_RKEY); + import_params.rkey = rkey->tl_rkey[md_index].rkey.rkey; import_params.source_gvmi = rkey->peer_id; status = uct_md_import_shared_rkey(context->tl_mds[md_index].md, @@ -842,7 +841,7 @@ ucp_mpool_malloc(ucp_worker_h worker, ucs_mpool_t *mp, size_t *size_p, void **ch status = ucp_memh_alloc(worker->context, NULL, *size_p + sizeof(*chunk_hdr), UCS_MEMORY_TYPE_HOST, ucp_mem_map_params2uct_flags(&mem_params), - ucs_mpool_name(mp), NULL, &memh); + ucs_mpool_name(mp), NULL, UCP_NULL_RESOURCE, &memh); if (status != UCS_OK) { goto out; } @@ -894,7 +893,7 @@ ucp_rndv_frag_malloc_mpools(ucs_mpool_t *mp, size_t *size_p, void **chunk_p) /* payload; need to get default flags from ucp_mem_map_params2uct_flags() */ status = ucp_memh_alloc(context, NULL, frag_size * num_elems, mem_type, UCT_MD_MEM_ACCESS_RMA, ucs_mpool_name(mp), NULL, - &chunk_hdr->memh); + UCP_NULL_RESOURCE, &chunk_hdr->memh); if (status != UCS_OK) { return status; } @@ -967,7 +966,7 @@ ucp_mm_get_alloc_md_map(ucp_context_h context, ucp_md_map_t *md_map_p) /* Allocate dummy 1-byte buffer to get the expected md_map */ status = ucp_memh_alloc(context, NULL, 1, UCS_MEMORY_TYPE_HOST, UCT_MD_MEM_ACCESS_ALL, 
"get_alloc_md_map", - NULL, &memh); + NULL, UCP_NULL_RESOURCE, &memh); if (status != UCS_OK) { goto out; } diff --git a/src/ucp/core/ucp_mm.h b/src/ucp/core/ucp_mm.h index 346ed9d795f..aa4f6b48938 100644 --- a/src/ucp/core/ucp_mm.h +++ b/src/ucp/core/ucp_mm.h @@ -137,6 +137,9 @@ ucs_status_t ucp_memh_get_slow(ucp_context_h context, void *address, void ucp_memh_dereg(ucp_context_h context, ucp_mem_h memh, ucp_md_map_t md_map); +ucs_status_t ucp_memh_import(ucp_context_h context, ucp_rkey_h rkey, + void *address, size_t length, ucp_mem_h *memh_p); + ucs_status_t ucp_mem_rcache_init(ucp_context_h context); void ucp_mem_rcache_cleanup(ucp_context_h context); diff --git a/src/ucp/core/ucp_mm.inl b/src/ucp/core/ucp_mm.inl index 09194063b78..0c4443b1ca9 100644 --- a/src/ucp/core/ucp_mm.inl +++ b/src/ucp/core/ucp_mm.inl @@ -31,7 +31,7 @@ ucp_memh_get(ucp_context_h context, void *address, size_t length, return UCS_OK; } - if (rkey_buffer != NULL) { + if (rkey != NULL) { /* Cache is not supported for shared mkeys */ status = ucp_memh_import(context, rkey, address, length, memh_p); if (status != UCS_OK) { diff --git a/src/ucp/core/ucp_rkey.h b/src/ucp/core/ucp_rkey.h index 9d5d8c6d540..ee3ab5cda03 100644 --- a/src/ucp/core/ucp_rkey.h +++ b/src/ucp/core/ucp_rkey.h @@ -183,7 +183,8 @@ ssize_t ucp_rkey_pack_uct(ucp_context_h context, ucp_md_map_t md_map, const uct_mem_h *memh, const ucp_memory_info_t *mem_info, ucp_sys_dev_map_t sys_dev_map, - const ucs_sys_dev_distance_t *sys_distance, void *buffer); + const ucs_sys_dev_distance_t *sys_distance, void *buffer, + ucp_rsc_index_t peer_id); ssize_t diff --git a/test/gtest/ucp/test_ucp_am.cc b/test/gtest/ucp/test_ucp_am.cc index 6ddd6a0a6a9..93a9fc3cddc 100644 --- a/test/gtest/ucp/test_ucp_am.cc +++ b/test/gtest/ucp/test_ucp_am.cc @@ -1283,9 +1283,10 @@ class test_ucp_am_nbx_rndv : public test_ucp_am_nbx { return UCS_INPROGRESS; } - ucp_mem_h alloc_memhs(size_t length, ucp_mem_h *exp_memh, ucp_mem_h *imp_memh) + void* 
alloc_memhs(ucp_context_h context, size_t length, + ucp_mem_h *exp_memh, ucp_mem_h *imp_memh) { - ucp_memh memh; + ucp_mem_h memh; ucp_mem_map_params_t mparams; mparams.field_mask = UCP_MEM_MAP_PARAM_FIELD_LENGTH | UCP_MEM_MAP_PARAM_FIELD_FLAGS; @@ -1299,19 +1300,20 @@ class test_ucp_am_nbx_rndv : public test_ucp_am_nbx { attr.field_mask = UCP_MEM_ATTR_FIELD_ADDRESS | UCP_MEM_ATTR_FIELD_LENGTH; ucs_status_t status = ucp_mem_query(memh, &attr); if (status != UCS_OK) { - ucp_mem_unmap(sender().ucph(), *memh_p); - ASSERT_TRUE(false); + ucp_mem_unmap(context, memh); + return NULL; } - ASSERT_GE(attr.length, length); + EXPECT_GE(attr.length, length); *exp_memh = memh; // Pack and unpack the key emulating that it is traversing thru the network void *rkey_buf; size_t rkey_buf_size; - ASSERT_UCS_OK(ucp_rkey_pack(sender().ucph(), memh, &rkey_buf, + ASSERT_UCS_OK(ucp_rkey_pack(context, memh, &rkey_buf, &rkey_buf_size)); - ASSERT_UCS_OK(ucp_rkey_unpack(sender().ep(), rkey_buf, &mparams.rkey)); + ASSERT_UCS_OK(ucp_ep_rkey_unpack(sender().ep(), rkey_buf, + &mparams.rkey)); ucp_rkey_buffer_release(rkey_buf); @@ -1320,7 +1322,7 @@ class test_ucp_am_nbx_rndv : public test_ucp_am_nbx { UCP_MEM_MAP_PARAM_FIELD_RKEY; mparams.address = attr.address; mparams.length = attr.length; - ASSERT_UCS_OK(ucp_mem_map(sender().ucph(), &mparams, &memh)); + ASSERT_UCS_OK(ucp_mem_map(context, &mparams, &memh)); // Should be safe to destroy rkey now ucp_rkey_destroy(mparams.rkey); @@ -1389,7 +1391,8 @@ class test_ucp_am_nbx_rndv : public test_ucp_am_nbx { EXPECT_UCS_OK(status); self->m_am_received = true; - free_memhs(receiver().ucp(), m_rx_memh, m_imp_memh); + self->free_memhs(self->receiver().ucph(), + self->m_rx_memh, self->m_imp_memh); } static ucs_status_t am_data_rx_shared_mkey_rndv_cb( @@ -1401,17 +1404,18 @@ class test_ucp_am_nbx_rndv : public test_ucp_am_nbx { EXPECT_FALSE(self->m_am_received); - void *address = allocate_imported_mkey(length, &m_rx_memh, &m_imp_memh); - - ucp_request_param_t 
param; - param.op_attr_mask = UCP_OP_ATTR_FIELD_MEMH | - UCP_OP_ATTR_FIELD_CALLBACK | - UCP_OP_ATTR_FLAG_NO_IMM_CMPL; - param.memh = memh; - params.cb.recv_am = am_data_recv_cb; - ucs_status_ptr_t rptr = ucp_am_recv_data_nbx(receiver().worker(), - data_desc, - address, length, ¶m); + void *address = self->alloc_memhs(self->receiver().ucph(), length, + &self->m_rx_memh, &self->m_imp_memh); + + ucp_request_param_t op_param; + op_param.op_attr_mask = UCP_OP_ATTR_FIELD_MEMH | + UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FLAG_NO_IMM_CMPL; + op_param.memh = self->m_imp_memh; + op_param.cb.recv_am = am_data_recv_cb; + ucs_status_ptr_t rptr = ucp_am_recv_data_nbx(self->receiver().worker(), + data, address, length, + &op_param); ucp_request_release(rptr); return UCS_INPROGRESS; @@ -1528,7 +1532,8 @@ UCS_TEST_P(test_ucp_am_nbx_rndv, shared_mkey) ucp_mem_h exp_memh, imp_memh; m_am_received = false; size_t length = 512 * UCS_KBYTE; - void *address = alloc_memhs(length, &exp_memh, &imp_memh); + void *address = alloc_memhs(sender().ucph(), length, &exp_memh, &imp_memh); + ASSERT_TRUE(address != NULL); ucp_request_param_t param; param.op_attr_mask = UCP_OP_ATTR_FIELD_MEMH; From 9cd1fa4fd2d3ccafd169419b8dae4fd7fcb3efd9 Mon Sep 17 00:00:00 2001 From: Mikhail Brinskii Date: Mon, 21 Feb 2022 20:21:15 +0200 Subject: [PATCH 10/27] UCT: Cross-gvmi support --- src/uct/api/uct.h | 3 +- src/uct/api/v2/uct_v2.h | 21 +++++ src/uct/base/uct_md.c | 15 +++ src/uct/base/uct_md.h | 11 +++ src/uct/ib/base/ib_md.c | 53 ++++++++++- src/uct/ib/base/ib_md.h | 51 ++++++++++ src/uct/ib/mlx5/dv/ib_mlx5_ifc.h | 42 +++++++-- src/uct/ib/mlx5/dv/ib_mlx5dv_md.c | 152 ++++++++++++++++++++++++++++++ test/apps/Makefile.am | 10 +- test/gtest/uct/test_md.cc | 48 ++++++++++ 10 files changed, 393 insertions(+), 13 deletions(-) diff --git a/src/uct/api/uct.h b/src/uct/api/uct.h index b7cafa46115..509f0bba3cd 100644 --- a/src/uct/api/uct.h +++ b/src/uct/api/uct.h @@ -692,7 +692,8 @@ enum { UCT_MD_FLAG_SOCKADDR = 
UCS_BIT(7), /**< MD support for client-server connection establishment via sockaddr */ - UCT_MD_FLAG_INVALIDATE = UCS_BIT(8) /**< MD supports memory invalidation */ + UCT_MD_FLAG_INVALIDATE = UCS_BIT(8), /**< MD supports memory invalidation */ + UCT_MD_FLAG_SHARED_RKEY = UCS_BIT(9) /**< MD supports shared remote keys */ }; /** diff --git a/src/uct/api/v2/uct_v2.h b/src/uct/api/v2/uct_v2.h index 918b7115c1d..0ece1f19b8b 100644 --- a/src/uct/api/v2/uct_v2.h +++ b/src/uct/api/v2/uct_v2.h @@ -412,6 +412,27 @@ ucs_status_t uct_ep_query(uct_ep_h ep, uct_ep_attr_t *ep_attr); int uct_iface_is_reachable_v2(uct_iface_h iface, const uct_iface_is_reachable_params_t *params); + +typedef struct { + void *address; + size_t length; + int dest_gvmi; +} uct_md_mem_reg_shared_params_t; + +ucs_status_t uct_md_mem_reg_shared(uct_md_h md, + uct_md_mem_reg_shared_params_t *params, + uct_mem_h *memh_p); + +typedef struct { + int source_gvmi; + uct_rkey_t rkey; +} uct_md_import_shared_rkey_params_t; + +ucs_status_t +uct_md_import_shared_rkey(uct_md_h md, + uct_md_import_shared_rkey_params_t *params, + uct_mem_h *memh_p); + END_C_DECLS #endif diff --git a/src/uct/base/uct_md.c b/src/uct/base/uct_md.c index 3c1f5a6afb8..4e8ec1c45ba 100644 --- a/src/uct/base/uct_md.c +++ b/src/uct/base/uct_md.c @@ -468,6 +468,21 @@ ucs_status_t uct_md_mem_dereg_v2(uct_md_h md, return md->ops->mem_dereg(md, params); } +ucs_status_t uct_md_mem_reg_shared(uct_md_h md, + uct_md_mem_reg_shared_params_t *params, + uct_mem_h *memh_p) +{ + return md->ops->mem_reg_shared(md, params, memh_p); +} + +ucs_status_t +uct_md_import_shared_rkey(uct_md_h md, + uct_md_import_shared_rkey_params_t *params, + uct_mem_h *memh_p) +{ + return md->ops->import_shared_rkey(md, params, memh_p); +} + ucs_status_t uct_md_mem_query(uct_md_h md, const void *address, size_t length, uct_md_mem_attr_t *mem_attr) { diff --git a/src/uct/base/uct_md.h b/src/uct/base/uct_md.h index 78276acb702..6af66192581 100644 --- a/src/uct/base/uct_md.h +++ 
b/src/uct/base/uct_md.h @@ -100,6 +100,15 @@ typedef ucs_status_t (*uct_md_mem_dereg_func_t)(uct_md_h md, const uct_md_mem_dereg_params_t *param); + +typedef ucs_status_t (*uct_md_mem_reg_shared_func_t)( + uct_md_h md, uct_md_mem_reg_shared_params_t *params, uct_mem_h *memh_p); + + +typedef ucs_status_t (*uct_md_import_shared_rkey_func_t)( + uct_md_h md, uct_md_import_shared_rkey_params_t *params, uct_mem_h *memh_p); + + typedef ucs_status_t (*uct_md_mem_query_func_t)(uct_md_h md, const void *address, size_t length, @@ -129,6 +138,8 @@ struct uct_md_ops { uct_md_mem_advise_func_t mem_advise; uct_md_mem_reg_func_t mem_reg; uct_md_mem_dereg_func_t mem_dereg; + uct_md_mem_reg_shared_func_t mem_reg_shared; + uct_md_import_shared_rkey_func_t import_shared_rkey; uct_md_mem_query_func_t mem_query; uct_md_mkey_pack_func_t mkey_pack; uct_md_is_sockaddr_accessible_func_t is_sockaddr_accessible; diff --git a/src/uct/ib/base/ib_md.c b/src/uct/ib/base/ib_md.c index d377f8aee20..9a24942ce95 100644 --- a/src/uct/ib/base/ib_md.c +++ b/src/uct/ib/base/ib_md.c @@ -298,7 +298,8 @@ static ucs_status_t uct_ib_md_query(uct_md_h uct_md, uct_md_attr_t *md_attr) UCT_MD_FLAG_NEED_MEMH | UCT_MD_FLAG_NEED_RKEY | UCT_MD_FLAG_ADVISE | - UCT_MD_FLAG_INVALIDATE; + UCT_MD_FLAG_INVALIDATE | + md->extra_cap_flags; md_attr->cap.alloc_mem_types = 0; md_attr->cap.access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); md_attr->cap.detect_mem_types = 0; @@ -845,6 +846,50 @@ static ucs_status_t uct_ib_mem_dereg(uct_md_h uct_md, return status; } +static ucs_status_t +uct_ib_md_mem_reg_shared(uct_md_h uct_md, uct_md_mem_reg_shared_params_t *params, + uct_mem_h *memh_p) +{ + uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t); + uct_ib_mem_t *ib_memh; + ucs_status_t status; + + // return md->ops->mem_reg_shared(md, params, memh_p); + ib_memh = uct_ib_memh_alloc(md); + status = md->ops->reg_crossed_key(md, params->address, params->length, + params->dest_gvmi, ib_memh); + if (status != UCS_OK) { + 
uct_ib_memh_free(ib_memh); + return status; + } + + *memh_p = ib_memh; + return UCS_OK; +} + +static ucs_status_t +uct_ib_md_import_shared_rkey(uct_md_h uct_md, + uct_md_import_shared_rkey_params_t *params, + uct_mem_h *memh_p) +{ + uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t); + uct_ib_mem_t *ib_memh; + ucs_status_t status; + + // return md->ops->import_shared_rkey(md, params, memh_p); + ib_memh = uct_ib_memh_alloc(md); + status = md->ops->reg_crossing_key(md, NULL, 0, params->source_gvmi, + uct_ib_md_direct_rkey(params->rkey), + ib_memh); + if (status != UCS_OK) { + uct_ib_memh_free(ib_memh); + return status; + } + + *memh_p = ib_memh; + return UCS_OK; +} + static ucs_status_t uct_ib_verbs_reg_key(uct_ib_md_t *md, void *address, size_t length, uint64_t access_flags, uct_ib_mem_t *ib_memh, @@ -970,6 +1015,8 @@ static uct_md_ops_t uct_ib_md_ops = { .query = uct_ib_md_query, .mem_reg = uct_ib_mem_reg, .mem_dereg = uct_ib_mem_dereg, + .mem_reg_shared = uct_ib_md_mem_reg_shared, + .import_shared_rkey = uct_ib_md_import_shared_rkey, .mem_advise = uct_ib_mem_advise, .mkey_pack = uct_ib_mkey_pack, .detect_memory_type = ucs_empty_function_return_unsupported, @@ -1041,6 +1088,8 @@ static uct_md_ops_t uct_ib_md_rcache_ops = { .query = uct_ib_md_query, .mem_reg = uct_ib_mem_rcache_reg, .mem_dereg = uct_ib_mem_rcache_dereg, + .mem_reg_shared = uct_ib_md_mem_reg_shared, + .import_shared_rkey = uct_ib_md_import_shared_rkey, .mem_advise = uct_ib_mem_advise, .mkey_pack = uct_ib_mkey_pack, .is_sockaddr_accessible = ucs_empty_function_return_zero_int, @@ -1155,6 +1204,8 @@ static uct_md_ops_t UCS_V_UNUSED uct_ib_md_global_odp_ops = { .query = uct_ib_md_odp_query, .mem_reg = uct_ib_mem_global_odp_reg, .mem_dereg = uct_ib_mem_global_odp_dereg, + .mem_reg_shared = uct_ib_md_mem_reg_shared, + .import_shared_rkey = uct_ib_md_import_shared_rkey, .mem_advise = uct_ib_mem_advise, .mkey_pack = uct_ib_mkey_pack, .detect_memory_type = ucs_empty_function_return_unsupported, diff --git 
a/src/uct/ib/base/ib_md.h b/src/uct/ib/base/ib_md.h index dd8e8e399c6..6fd7c109027 100644 --- a/src/uct/ib/base/ib_md.h +++ b/src/uct/ib/base/ib_md.h @@ -136,6 +136,8 @@ typedef struct uct_ib_md { int fork_init; size_t memh_struct_size; uint64_t reg_mem_types; + uint64_t extra_cap_flags; + int vhca_id; } uct_ib_md_t; @@ -320,6 +322,53 @@ typedef ucs_status_t (*uct_ib_md_mem_prefetch_func_t)(uct_ib_md_t *md, typedef ucs_status_t (*uct_ib_md_get_atomic_mr_id_func_t)(uct_ib_md_t *md, uint8_t *mr_id); +/** + * Memory domain method to register crossed mkey for memory area. + * + * @param [in] ib_md Memory domain. + * + * @param [in] address Memory area start address (HOST). + * + * @param [in] length Memory area length (HOST). + * + * @param [in] allowed_gvmi_id Allowed GVMI ID (DPU). + * + * @param [out] ib_memh Memory region handle. + * Method should initialize lkey & rkey. + * + * @return UCS_OK on success or error code in case of failure. + */ +typedef ucs_status_t (*uct_ib_md_reg_crossed_key_func_t)(uct_ib_md_t *ib_md, + void *address, + size_t length, + uint32_t allowed_gvmi_id, + uct_ib_mem_t *ib_memh); + +/** + * Memory domain method to register crossing mkey for memory area. + * + * @param [in] ib_md Memory domain. + * + * @param [in] address Memory area start address (HOST). + * + * @param [in] length Memory area length (HOST). + * + * @param [in] target_gvmi_id Target GVMI ID (HOST). + * + * @param [in] target_mkey Target mkey this mkey refers to (HOST). + * + * @param [out] ib_memh Memory region handle. + * Method should initialize lkey and rkey. + * + * @return UCS_OK on success or error code in case of failure. 
+ */ +typedef ucs_status_t (*uct_ib_md_reg_crossing_key_func_t)(uct_ib_md_t *ib_md, + void *address, + size_t length, + uint32_t target_gvmi_id, + uint32_t target_mkey, + uct_ib_mem_t *ib_memh); + typedef struct uct_ib_md_ops { uct_ib_md_open_func_t open; uct_ib_md_cleanup_func_t cleanup; @@ -331,6 +380,8 @@ typedef struct uct_ib_md_ops { uct_ib_md_dereg_multithreaded_func_t dereg_multithreaded; uct_ib_md_mem_prefetch_func_t mem_prefetch; uct_ib_md_get_atomic_mr_id_func_t get_atomic_mr_id; + uct_ib_md_reg_crossed_key_func_t reg_crossed_key; + uct_ib_md_reg_crossing_key_func_t reg_crossing_key; } uct_ib_md_ops_t; diff --git a/src/uct/ib/mlx5/dv/ib_mlx5_ifc.h b/src/uct/ib/mlx5/dv/ib_mlx5_ifc.h index 452c637bddc..d290ee525dd 100644 --- a/src/uct/ib/mlx5/dv/ib_mlx5_ifc.h +++ b/src/uct/ib/mlx5/dv/ib_mlx5_ifc.h @@ -384,9 +384,22 @@ struct uct_ib_mlx5_cmd_hca_cap_bits { uint8_t reserved_at_500[0x20]; uint8_t num_of_uars_per_page[0x20]; - uint8_t reserved_at_540[0x40]; - uint8_t reserved_at_580[0x3d]; + uint8_t flex_parser_protocols[0x20]; + + uint8_t reserved_at_560[0x13]; + uint8_t log_max_guaranteed_connections[0x5]; + uint8_t reserved_at_578[0x3]; + uint8_t log_max_dct_connections[0x5]; + + uint8_t log_max_atomic_size_qp[0x8]; + uint8_t reserved_at_588[0x10]; + uint8_t log_max_atomic_size_dc[0x8]; + + uint8_t reserved_at_5a0[0x12]; + uint8_t crossing_vhca_mkey[0x1]; + uint8_t reserved_at_5b3[0x9]; + uint8_t mini_cqe_resp_stride_index[0x1]; uint8_t cqe_128_always[0x1]; uint8_t cqe_compression_128[0x1]; uint8_t cqe_compression[0x1]; @@ -624,11 +637,12 @@ struct uct_ib_mlx5_query_hca_vport_context_in_bits { }; enum { - UCT_IB_MLX5_MKC_ACCESS_MODE_PA = 0x0, - UCT_IB_MLX5_MKC_ACCESS_MODE_MTT = 0x1, - UCT_IB_MLX5_MKC_ACCESS_MODE_KLMS = 0x2, - UCT_IB_MLX5_MKC_ACCESS_MODE_KSM = 0x3, - UCT_IB_MLX5_MKC_ACCESS_MODE_MEMIC = 0x5 + UCT_IB_MLX5_MKC_ACCESS_MODE_PA = 0x0, + UCT_IB_MLX5_MKC_ACCESS_MODE_MTT = 0x1, + UCT_IB_MLX5_MKC_ACCESS_MODE_KLMS = 0x2, + 
UCT_IB_MLX5_MKC_ACCESS_MODE_KSM = 0x3, + UCT_IB_MLX5_MKC_ACCESS_MODE_MEMIC = 0x5, + UCT_IB_MLX5_MKC_ACCESS_MODE_CROSSING_VHCA = 0x6 }; struct uct_ib_mlx5_mkc_bits { @@ -636,7 +650,9 @@ struct uct_ib_mlx5_mkc_bits { uint8_t free[0x1]; uint8_t reserved_at_2[0x1]; uint8_t access_mode_4_2[0x3]; - uint8_t reserved_at_6[0x7]; + uint8_t alter_pd_to_vhca_id[0x1]; + uint8_t crossed_side_mkey[0x1]; + uint8_t reserved_at_8[0x5]; uint8_t relaxed_ordering_write[0x1]; uint8_t reserved_at_e[0x1]; uint8_t small_fence_on_rdma_read_response[0x1]; @@ -669,9 +685,15 @@ struct uct_ib_mlx5_mkc_bits { uint8_t bsf_octword_size[0x20]; - uint8_t reserved_at_120[0x80]; + uint8_t reserved_at_120[0x60]; + + uint8_t crossing_target_gvmi_id[0x10]; + uint8_t reserved_at_190[0x10]; - uint8_t translations_octword_size[0x20]; + union { + uint8_t translations_octword_size[0x20]; + uint8_t crossing_target_mkey[0x20]; + }; uint8_t reserved_at_1c0[0x1b]; uint8_t log_entity_size[0x5]; diff --git a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c index 46e7f949525..45bd126c840 100644 --- a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c +++ b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c @@ -744,6 +744,23 @@ static ucs_status_t uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device, md->flags |= UCT_IB_MLX5_MD_FLAG_MP_XRQ_FIRST_MSG; } + md->super.vhca_id = UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, vhca_id); + // ucs_warn("%s: vhca_id is %d. 
at b4h: 0x%x; 0x%x", uct_ib_device_name(dev), + // UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, vhca_id), + // ntohl(*(uint32_t*)UCS_PTR_BYTE_OFFSET(cap, 0xB4)), + // ntohl(*(uint32_t*)UCS_PTR_BYTE_OFFSET(cap, 0xB4)) & (1<<13) + // ); + + if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, + crossing_vhca_mkey)) { + ucs_info("%s: vhca_id=%d crossing_vhca_mkey is supported", + uct_ib_device_name(dev), md->super.vhca_id); + md->super.extra_cap_flags |= UCT_MD_FLAG_SHARED_RKEY; + } else { + ucs_info("%s: vhca_id=%d crossing_vhca_mkey is not supported", + uct_ib_device_name(dev), md->super.vhca_id); + } + status = uct_ib_mlx5_devx_check_odp(md, md_config, cap); if (status != UCS_OK) { goto err_free; @@ -844,6 +861,139 @@ static void uct_ib_mlx5_devx_md_cleanup(uct_ib_md_t *ibmd) ucs_recursive_spinlock_destroy(&md->dbrec_lock); } +static ucs_status_t +uct_ib_mlx5_devx_reg_crossed_key(uct_ib_md_t *ib_md, void *address, + size_t length, uint32_t allowed_gvmi_id, + uct_ib_mem_t *ib_memh) +{ + uct_ib_mlx5_md_t *md = ucs_derived_of(ib_md, uct_ib_mlx5_md_t); + uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + char in[UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_in)] = {0}; + char out[UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_out)] = {0}; + struct mlx5dv_devx_umem *mem; + struct mlx5dv_devx_obj *mr; + void *mkc; + ucs_status_t status; + + ucs_print("umr_reg crosses address=%p length=%zu", address, length); + mem = mlx5dv_devx_umem_reg(md->super.dev.ibv_context, address, length, + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE); + if (mem == NULL) { + ucs_error("mlx5dv_devx_umem_reg() failed: %m"); + status = UCS_ERR_NO_MEMORY; + goto err_out; + } + + mkc = UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + UCT_IB_MLX5DV_SET(create_mkey_in, in, opcode, UCT_IB_MLX5_CMD_OP_CREATE_MKEY); + UCT_IB_MLX5DV_SET(create_mkey_in, in, pg_access, 1); + UCT_IB_MLX5DV_SET(create_mkey_in, in, mkey_umem_valid, 1); + UCT_IB_MLX5DV_SET(create_mkey_in, in, 
mkey_umem_id, mem->umem_id); + UCT_IB_MLX5DV_SET(mkc, mkc, access_mode_1_0, UCT_IB_MLX5_MKC_ACCESS_MODE_MTT); + UCT_IB_MLX5DV_SET(mkc, mkc, a, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, rw, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, rr, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, lw, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, lr, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, translations_octword_size, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, log_entity_size, 12); + UCT_IB_MLX5DV_SET(mkc, mkc, qpn, 0xffffff); + UCT_IB_MLX5DV_SET(mkc, mkc, mkey_7_0, (intptr_t)address & 0xff); + UCT_IB_MLX5DV_SET(mkc, mkc, alter_pd_to_vhca_id, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, crossed_side_mkey, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, crossing_target_gvmi_id, allowed_gvmi_id); + UCT_IB_MLX5DV_SET64(mkc, mkc, start_addr, (intptr_t)address); + UCT_IB_MLX5DV_SET64(mkc, mkc, len, length); + + mr = mlx5dv_devx_obj_create(md->super.dev.ibv_context, in, sizeof(in), out, + sizeof(out)); + if (mr == NULL) { + ucs_error("mlx5dv_devx_obj_create() failed, syndrome %x: %m", + UCT_IB_MLX5DV_GET(create_mkey_out, out, syndrome)); + status = UCS_ERR_UNSUPPORTED; + goto err_free; + } + + memh->super.lkey = + (UCT_IB_MLX5DV_GET(create_mkey_out, out, mkey_index) << 8) | + ((intptr_t)address & 0xff); + memh->super.rkey = memh->super.lkey; + + ucs_print("crossed mkey is %x", memh->super.lkey); + + status = UCS_OK; + +err_free: + mlx5dv_devx_umem_dereg(mem); + +err_out: + return status; +} + +static ucs_status_t +uct_ib_mlx5_devx_reg_crossing_key(uct_ib_md_t *ib_md, void *address, + size_t length, uint32_t target_gvmi_id, + uint32_t target_mkey, uct_ib_mem_t *ib_memh) +{ + uct_ib_mlx5_md_t *md = ucs_derived_of(ib_md, uct_ib_mlx5_md_t); + uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + char in[UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_in)] = {0}; + char out[UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_out)] = {0}; + struct mlx5dv_pd dvpd = {}; + struct mlx5dv_obj dv = {}; + struct mlx5dv_devx_obj *mr; + void *mkc; + ucs_status_t status; + + dv.pd.in = 
md->super.pd; + dv.pd.out = &dvpd; + mlx5dv_init_obj(&dv, MLX5DV_OBJ_PD); + + ucs_print("reg_crossin address=%p", address); + + mkc = UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + UCT_IB_MLX5DV_SET(create_mkey_in, in, opcode, UCT_IB_MLX5_CMD_OP_CREATE_MKEY); + UCT_IB_MLX5DV_SET(create_mkey_in, in, translations_octword_actual_size, 0); + UCT_IB_MLX5DV_SET(mkc, mkc, access_mode_1_0, UCT_IB_MLX5_MKC_ACCESS_MODE_CROSSING_VHCA & 0x3); + UCT_IB_MLX5DV_SET(mkc, mkc, access_mode_4_2, (UCT_IB_MLX5_MKC_ACCESS_MODE_CROSSING_VHCA & 0x1C) >> 2); + UCT_IB_MLX5DV_SET(mkc, mkc, a, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, rw, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, rr, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, lw, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, lr, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, pd, dvpd.pdn); + UCT_IB_MLX5DV_SET(mkc, mkc, crossing_target_gvmi_id, target_gvmi_id); + UCT_IB_MLX5DV_SET(mkc, mkc, crossing_target_mkey, target_mkey); + UCT_IB_MLX5DV_SET(mkc, mkc, qpn, 0xffffff); + UCT_IB_MLX5DV_SET(mkc, mkc, mkey_7_0, (intptr_t)address & 0xff); + UCT_IB_MLX5DV_SET64(mkc, mkc, start_addr, (intptr_t)0); + UCT_IB_MLX5DV_SET(mkc, mkc, length64, 1); + + mr = mlx5dv_devx_obj_create(md->super.dev.ibv_context, in, sizeof(in), out, + sizeof(out)); + if (mr == NULL) { + ucs_fatal("mlx5dv_devx_obj_create() failed, syndrome %x: %m", + UCT_IB_MLX5DV_GET(create_mkey_out, out, syndrome)); + status = UCS_ERR_UNSUPPORTED; + goto err_out; + } + + memh->super.lkey = + (UCT_IB_MLX5DV_GET(create_mkey_out, out, mkey_index) << 8) | + ((intptr_t)address & 0xff); + memh->super.rkey = memh->super.lkey; + + ucs_print("crossing mkey is %x", memh->super.lkey); + + status = UCS_OK; + +err_out: + return status; +} + static uct_ib_md_ops_t uct_ib_mlx5_devx_md_ops = { .open = uct_ib_mlx5_devx_md_open, .cleanup = uct_ib_mlx5_devx_md_cleanup, @@ -855,6 +1005,8 @@ static uct_ib_md_ops_t uct_ib_mlx5_devx_md_ops = { .dereg_multithreaded = uct_ib_mlx5_devx_dereg_multithreaded, .mem_prefetch = 
uct_ib_mlx5_mem_prefetch, .get_atomic_mr_id = uct_ib_mlx5_md_get_atomic_mr_id, + .reg_crossed_key = uct_ib_mlx5_devx_reg_crossed_key, + .reg_crossing_key = uct_ib_mlx5_devx_reg_crossing_key, }; UCT_IB_MD_OPS(uct_ib_mlx5_devx_md_ops, 2); diff --git a/test/apps/Makefile.am b/test/apps/Makefile.am index 6872042f249..e81283aeeda 100644 --- a/test/apps/Makefile.am +++ b/test/apps/Makefile.am @@ -21,7 +21,8 @@ noinst_PROGRAMS = \ test_link_map \ test_dlopen_cfg_print \ test_init_mt \ - test_memtrack_limit + test_memtrack_limit \ + test_uct_xgvmi objdir = $(shell sed -n -e 's/^objdir=\(.*\)$$/\1/p' $(LIBTOOL)) @@ -64,6 +65,13 @@ test_init_mt_CPPFLAGS = $(BASE_CPPFLAGS) test_init_mt_CFLAGS = $(BASE_CFLAGS) $(OPENMP_CFLAGS) test_init_mt_LDADD = $(top_builddir)/src/ucp/libucp.la + +test_uct_xgvmi_SOURCES = test_uct_xgvmi.c +test_uct_xgvmi_CPPFLAGS = $(BASE_CPPFLAGS) +test_uct_xgvmi_CFLAGS = $(BASE_CFLAGS) +test_uct_xgvmi_LDADD = $(top_builddir)/src/ucp/libuct.la + + if HAVE_CUDA noinst_PROGRAMS += test_cuda_hook_dynamic diff --git a/test/gtest/uct/test_md.cc b/test/gtest/uct/test_md.cc index 3bdc2dcb37e..2c08cefdc81 100644 --- a/test/gtest/uct/test_md.cc +++ b/test/gtest/uct/test_md.cc @@ -753,6 +753,54 @@ UCS_TEST_SKIP_COND_P(test_md, dereg_bad_arg, EXPECT_UCS_OK(status); free(ptr); } +extern "C" { +#include +} +// TODO check MD cap flag +UCS_TEST_P(test_md, shared_rkey) +{ + static const size_t size = 1 * UCS_MBYTE; + ucs_status_t status; + uct_mem_h memh; + void *ptr; + + int ret = ucs_posix_memalign(&ptr, ucs_get_page_size(), size, "shared_buf"); + ASSERT_EQ(0, ret); + + int vhca_id = ((uct_ib_md_t*)md())->vhca_id; + + uct_md_mem_reg_shared_params_t reg_shared_params; + reg_shared_params.address = ptr; + reg_shared_params.length = size; + reg_shared_params.dest_gvmi = vhca_id; + + status = uct_md_mem_reg_shared(md(), ®_shared_params, &memh); + ASSERT_UCS_OK(status); + + UCS_TEST_MESSAGE << "registered shared memh"; + + std::vector rkey_buf; + 
rkey_buf.resize(md_attr().rkey_packed_size); + + status = uct_md_mkey_pack(md(), memh, &rkey_buf[0]); + ASSERT_UCS_OK(status); + + uct_rkey_bundle_t rkey_bundle; + status = uct_rkey_unpack(GetParam().component, &rkey_buf[0], &rkey_bundle); + ASSERT_UCS_OK(status); + + UCS_TEST_MESSAGE << "unpacked rkey"; + + uct_md_import_shared_rkey_params_t import_params; + import_params.rkey = rkey_bundle.rkey; + import_params.source_gvmi = vhca_id; // TODO + + uct_mem_h imported_memh; + status = uct_md_import_shared_rkey(md(), &import_params, &imported_memh); + ASSERT_UCS_OK(status); + + UCS_TEST_MESSAGE << "registered imported memh"; +} UCT_MD_INSTANTIATE_TEST_CASE(test_md) From fcb63c29174a9537fd47096970a3345118b81474 Mon Sep 17 00:00:00 2001 From: Mikhail Brinskii Date: Mon, 21 Feb 2022 22:12:28 +0200 Subject: [PATCH 11/27] GTEST: Fix + hack to run the test without connection --- test/gtest/ucp/test_ucp_am.cc | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/test/gtest/ucp/test_ucp_am.cc b/test/gtest/ucp/test_ucp_am.cc index 93a9fc3cddc..66e7f3dc9c8 100644 --- a/test/gtest/ucp/test_ucp_am.cc +++ b/test/gtest/ucp/test_ucp_am.cc @@ -41,8 +41,8 @@ class test_ucp_am_base : public ucp_test { modify_config("MAX_EAGER_LANES", "2"); ucp_test::init(); - sender().connect(&receiver(), get_ep_params()); - receiver().connect(&sender(), get_ep_params()); + // sender().connect(&receiver(), get_ep_params()); + // receiver().connect(&sender(), get_ep_params()); } protected: @@ -1288,10 +1288,12 @@ class test_ucp_am_nbx_rndv : public test_ucp_am_nbx { { ucp_mem_h memh; ucp_mem_map_params_t mparams; - mparams.field_mask = UCP_MEM_MAP_PARAM_FIELD_LENGTH | + mparams.field_mask = UCP_MEM_MAP_PARAM_FIELD_LENGTH | + UCP_MEM_MAP_PARAM_FIELD_PEER_ID | UCP_MEM_MAP_PARAM_FIELD_FLAGS; mparams.address = NULL; mparams.length = length; + mparams.peer_id = 70; mparams.flags = UCP_MEM_MAP_ALLOCATE | UCP_MEM_MAP_SHARED; ASSERT_UCS_OK(ucp_mem_map(sender().ucph(), 
&mparams, &memh)); @@ -1408,14 +1410,14 @@ class test_ucp_am_nbx_rndv : public test_ucp_am_nbx { &self->m_rx_memh, &self->m_imp_memh); ucp_request_param_t op_param; - op_param.op_attr_mask = UCP_OP_ATTR_FIELD_MEMH | - UCP_OP_ATTR_FIELD_CALLBACK | - UCP_OP_ATTR_FLAG_NO_IMM_CMPL; - op_param.memh = self->m_imp_memh; - op_param.cb.recv_am = am_data_recv_cb; - ucs_status_ptr_t rptr = ucp_am_recv_data_nbx(self->receiver().worker(), - data, address, length, - &op_param); + op_param.op_attr_mask = UCP_OP_ATTR_FIELD_MEMH | + UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FLAG_NO_IMM_CMPL; + op_param.memh = self->m_imp_memh; + op_param.cb.recv_am = am_data_recv_cb; + ucs_status_ptr_t rptr = ucp_am_recv_data_nbx(self->receiver().worker(), + data, address, length, + &op_param); ucp_request_release(rptr); return UCS_INPROGRESS; @@ -1535,6 +1537,7 @@ UCS_TEST_P(test_ucp_am_nbx_rndv, shared_mkey) void *address = alloc_memhs(sender().ucph(), length, &exp_memh, &imp_memh); ASSERT_TRUE(address != NULL); +#if 0 ucp_request_param_t param; param.op_attr_mask = UCP_OP_ATTR_FIELD_MEMH; param.memh = imp_memh; @@ -1543,7 +1546,7 @@ UCS_TEST_P(test_ucp_am_nbx_rndv, shared_mkey) EXPECT_EQ(m_status, request_wait(sptr)); EXPECT_TRUE(m_am_received); - +#endif free_memhs(sender().ucph(), exp_memh, imp_memh); } From 2eb3f959f259e8352733355d7a732a63c24bc2c0 Mon Sep 17 00:00:00 2001 From: Mikhail Brinskii Date: Mon, 21 Feb 2022 22:41:26 +0200 Subject: [PATCH 12/27] UCP/GTEST: Hack to unpack rkey on worker --- src/ucp/api/ucp.h | 2 + src/ucp/core/ucp_rkey.c | 124 +++++++++++++++++++++++++++++++++- test/gtest/ucp/test_ucp_am.cc | 2 +- 3 files changed, 126 insertions(+), 2 deletions(-) diff --git a/src/ucp/api/ucp.h b/src/ucp/api/ucp.h index a8913ff0d85..91673d384b6 100644 --- a/src/ucp/api/ucp.h +++ b/src/ucp/api/ucp.h @@ -3017,6 +3017,8 @@ void ucp_rkey_buffer_release(void *rkey_buffer); ucs_status_t ucp_ep_rkey_unpack(ucp_ep_h ep, const void *rkey_buffer, ucp_rkey_h *rkey_p); +ucs_status_t 
ucp_worker_rkey_unpack(ucp_worker_h worker, const void *rkey_buffer, + ucp_rkey_h *rkey_p); /** * @ingroup UCP_MEM diff --git a/src/ucp/core/ucp_rkey.c b/src/ucp/core/ucp_rkey.c index 99b1f1c82b5..1202d0e9dd9 100644 --- a/src/ucp/core/ucp_rkey.c +++ b/src/ucp/core/ucp_rkey.c @@ -467,7 +467,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_ep_rkey_unpack_internal, goto err_destroy; } } else { - ucp_rkey_resolve_inner(rkey, ep); + // ucp_rkey_resolve_inner(rkey, ep); } ucs_trace("ep %p: unpacked rkey %p md_map 0x%" PRIx64 " type %s", ep, rkey, @@ -495,6 +495,128 @@ ucs_status_t ucp_ep_rkey_unpack(ucp_ep_h ep, const void *rkey_buffer, return status; } +ucs_status_t ucp_worker_rkey_unpack(ucp_worker_h worker, const void *buffer, + ucp_rkey_h *rkey_p) +{ + const void *p = buffer; + size_t length = 0; + ucp_md_map_t md_map, remote_md_map; + unsigned remote_md_index; + const void *tl_rkey_buf; + ucp_tl_rkey_t *tl_rkey; + size_t tl_rkey_size; + unsigned rkey_index; + ucs_status_t status; + ucp_rkey_h rkey; + uint8_t flags; + int md_count; + ucp_tl_md_t *tl_md; + + UCS_STATIC_ASSERT(ucs_offsetof(ucp_rkey_t, mem_type) == + ucs_offsetof(ucp_rkey_t, cache.mem_type)); + UCS_STATIC_ASSERT(ucs_same_type(ucs_field_type(ucp_rkey_t, mem_type), + ucs_field_type(ucp_rkey_t, cache.mem_type))); + + ucs_trace("unpacking rkey buffer %p length %zu", buffer, length); + ucs_log_indent(1); + + /* MD map for the unpacked rkey */ + remote_md_map = *ucs_serialize_next(&p, const ucp_md_map_t); + md_map = remote_md_map ;//& ucp_ep_config(ep)->key.reachable_md_map; + md_count = ucs_popcount(md_map); + + /* Allocate rkey handle which holds UCT rkeys for all remote MDs. Small key + * allocations are done from a memory pool. + * We keep all of them to handle a future transport switch. 
+ */ + if (md_count <= worker->context->config.ext.rkey_mpool_max_md) { + rkey = ucs_mpool_get_inline(&worker->rkey_mp); + flags = UCP_RKEY_DESC_FLAG_POOL; + } else { + rkey = ucs_malloc(sizeof(*rkey) + (sizeof(rkey->tl_rkey[0]) * md_count), + "ucp_rkey"); + flags = 0; + } + if (rkey == NULL) { + ucs_error("failed to allocate remote key"); + status = UCS_ERR_NO_MEMORY; + goto out; + } + + rkey->md_map = md_map; + rkey->mem_type = *ucs_serialize_next(&p, const uint8_t); + rkey->peer_id = *ucs_serialize_next(&p, ucp_rsc_index_t); + rkey->flags = flags; +#if 0 + rkey->ep = ep; +#endif + + /* Go over remote MD indices and unpack rkey of each UCT MD */ + rkey_index = 0; /* Index of the rkey in the array */ + ucs_for_each_bit(remote_md_index, remote_md_map) { + tl_rkey_size = *ucs_serialize_next(&p, const uint8_t); + tl_rkey_buf = ucs_serialize_next_raw(&p, const void, tl_rkey_size); + + /* Use bit operations to iterate through the indices of the remote MDs + * as provided in the md_map. md_map always holds a bitmap of MD indices + * that remain to be used. Every time we find the next valid MD index. + * If some rkeys cannot be unpacked, we remove them from the local map. 
+ */ + ucs_assert(UCS_BIT(remote_md_index) & remote_md_map); + ucs_assert_always(remote_md_index <= UCP_MD_INDEX_BITS); + + /* Unpack only reachable rkeys */ + if (!(UCS_BIT(remote_md_index) & rkey->md_map)) { + continue; + } + + ucs_assert(rkey_index < md_count); + tl_rkey = &rkey->tl_rkey[rkey_index]; + tl_md = &worker->context->tl_mds[remote_md_index]; + tl_rkey->cmpt = worker->context->tl_cmpts[tl_md->cmpt_index].cmpt; + + status = uct_rkey_unpack(tl_rkey->cmpt, tl_rkey_buf, &tl_rkey->rkey); + if (status == UCS_OK) { + ucs_trace("rkey[%d] for remote md %d is 0x%lx", rkey_index, + remote_md_index, tl_rkey->rkey.rkey); + ++rkey_index; + } else if (status == UCS_ERR_UNREACHABLE) { + rkey->md_map &= ~UCS_BIT(remote_md_index); + ucs_trace("rkey[%d] for remote md %d is 0x%lx not reachable", + rkey_index, remote_md_index, tl_rkey->rkey.rkey); + } else { + ucs_error("failed to unpack remote key from remote md[%d]: %s", + remote_md_index, ucs_status_string(status)); + goto err_destroy; + } + } + +#if 0 + if (worker->context->config.ext.proto_enable) { + status = ucp_rkey_proto_resolve(rkey, ep, p, + UCS_PTR_BYTE_OFFSET(buffer, length)); + if (status != UCS_OK) { + goto err_destroy; + } + } else { + // ucp_rkey_resolve_inner(rkey, ep); + } +#endif + + ucs_trace("unpacked rkey %p md_map 0x%" PRIx64 " type %s", rkey, + rkey->md_map, ucs_memory_type_names[rkey->mem_type]); + *rkey_p = rkey; + status = UCS_OK; + goto out; + +err_destroy: + ucp_rkey_destroy(rkey); +out: + ucs_log_indent(-1); + return status; + +} + void ucp_rkey_dump_packed(const void *buffer, size_t length, ucs_string_buffer_t *strb) { diff --git a/test/gtest/ucp/test_ucp_am.cc b/test/gtest/ucp/test_ucp_am.cc index 66e7f3dc9c8..cb2d037da1f 100644 --- a/test/gtest/ucp/test_ucp_am.cc +++ b/test/gtest/ucp/test_ucp_am.cc @@ -1314,7 +1314,7 @@ class test_ucp_am_nbx_rndv : public test_ucp_am_nbx { ASSERT_UCS_OK(ucp_rkey_pack(context, memh, &rkey_buf, &rkey_buf_size)); - 
ASSERT_UCS_OK(ucp_ep_rkey_unpack(sender().ep(), rkey_buf, + ASSERT_UCS_OK(ucp_worker_rkey_unpack(sender().worker(), rkey_buf, &mparams.rkey)); ucp_rkey_buffer_release(rkey_buf); From fe2f0350a5224e2ac7a1f2939e54599eebf9c35c Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Mon, 21 Feb 2022 23:50:56 +0200 Subject: [PATCH 13/27] WIP5 --- src/uct/ib/base/ib_md.c | 8 +++----- src/uct/ib/base/ib_md.h | 2 -- src/uct/ib/mlx5/dv/ib_mlx5dv_md.c | 31 +++++++++++++++++-------------- test/apps/test_uct_xgvmi.c | 18 +++++++++++++----- 4 files changed, 33 insertions(+), 26 deletions(-) diff --git a/src/uct/ib/base/ib_md.c b/src/uct/ib/base/ib_md.c index 9a24942ce95..b0f405136ff 100644 --- a/src/uct/ib/base/ib_md.c +++ b/src/uct/ib/base/ib_md.c @@ -854,7 +854,6 @@ uct_ib_md_mem_reg_shared(uct_md_h uct_md, uct_md_mem_reg_shared_params_t *params uct_ib_mem_t *ib_memh; ucs_status_t status; - // return md->ops->mem_reg_shared(md, params, memh_p); ib_memh = uct_ib_memh_alloc(md); status = md->ops->reg_crossed_key(md, params->address, params->length, params->dest_gvmi, ib_memh); @@ -876,11 +875,10 @@ uct_ib_md_import_shared_rkey(uct_md_h uct_md, uct_ib_mem_t *ib_memh; ucs_status_t status; - // return md->ops->import_shared_rkey(md, params, memh_p); ib_memh = uct_ib_memh_alloc(md); - status = md->ops->reg_crossing_key(md, NULL, 0, params->source_gvmi, - uct_ib_md_direct_rkey(params->rkey), - ib_memh); + status = md->ops->reg_crossing_key(md, params->source_gvmi, + uct_ib_md_direct_rkey(params->rkey), + ib_memh); if (status != UCS_OK) { uct_ib_memh_free(ib_memh); return status; diff --git a/src/uct/ib/base/ib_md.h b/src/uct/ib/base/ib_md.h index 6fd7c109027..c11e2a4ef06 100644 --- a/src/uct/ib/base/ib_md.h +++ b/src/uct/ib/base/ib_md.h @@ -363,8 +363,6 @@ typedef ucs_status_t (*uct_ib_md_reg_crossed_key_func_t)(uct_ib_md_t *ib_md, * @return UCS_OK on success or error code in case of failure. 
*/ typedef ucs_status_t (*uct_ib_md_reg_crossing_key_func_t)(uct_ib_md_t *ib_md, - void *address, - size_t length, uint32_t target_gvmi_id, uint32_t target_mkey, uct_ib_mem_t *ib_memh); diff --git a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c index 45bd126c840..50d30a4c7c5 100644 --- a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c +++ b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c @@ -755,11 +755,11 @@ static ucs_status_t uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device, crossing_vhca_mkey)) { ucs_info("%s: vhca_id=%d crossing_vhca_mkey is supported", uct_ib_device_name(dev), md->super.vhca_id); - md->super.extra_cap_flags |= UCT_MD_FLAG_SHARED_RKEY; } else { ucs_info("%s: vhca_id=%d crossing_vhca_mkey is not supported", uct_ib_device_name(dev), md->super.vhca_id); } + md->super.extra_cap_flags |= UCT_MD_FLAG_SHARED_RKEY; status = uct_ib_mlx5_devx_check_odp(md, md_config, cap); if (status != UCS_OK) { @@ -861,6 +861,8 @@ static void uct_ib_mlx5_devx_md_cleanup(uct_ib_md_t *ibmd) ucs_recursive_spinlock_destroy(&md->dbrec_lock); } +#define UCT_IB_CROSS_KEY_IDX 0xcc + static ucs_status_t uct_ib_mlx5_devx_reg_crossed_key(uct_ib_md_t *ib_md, void *address, size_t length, uint32_t allowed_gvmi_id, @@ -900,7 +902,7 @@ uct_ib_mlx5_devx_reg_crossed_key(uct_ib_md_t *ib_md, void *address, UCT_IB_MLX5DV_SET(mkc, mkc, translations_octword_size, 1); UCT_IB_MLX5DV_SET(mkc, mkc, log_entity_size, 12); UCT_IB_MLX5DV_SET(mkc, mkc, qpn, 0xffffff); - UCT_IB_MLX5DV_SET(mkc, mkc, mkey_7_0, (intptr_t)address & 0xff); + UCT_IB_MLX5DV_SET(mkc, mkc, mkey_7_0, UCT_IB_CROSS_KEY_IDX); UCT_IB_MLX5DV_SET(mkc, mkc, alter_pd_to_vhca_id, 1); UCT_IB_MLX5DV_SET(mkc, mkc, crossed_side_mkey, 1); UCT_IB_MLX5DV_SET(mkc, mkc, crossing_target_gvmi_id, allowed_gvmi_id); @@ -916,9 +918,9 @@ uct_ib_mlx5_devx_reg_crossed_key(uct_ib_md_t *ib_md, void *address, goto err_free; } - memh->super.lkey = - (UCT_IB_MLX5DV_GET(create_mkey_out, out, mkey_index) << 8) | - ((intptr_t)address & 0xff); + 
memh->super.lkey = (UCT_IB_MLX5DV_GET(create_mkey_out, out, mkey_index) + << 8) | + UCT_IB_CROSS_KEY_IDX; memh->super.rkey = memh->super.lkey; ucs_print("crossed mkey is %x", memh->super.lkey); @@ -933,8 +935,7 @@ uct_ib_mlx5_devx_reg_crossed_key(uct_ib_md_t *ib_md, void *address, } static ucs_status_t -uct_ib_mlx5_devx_reg_crossing_key(uct_ib_md_t *ib_md, void *address, - size_t length, uint32_t target_gvmi_id, +uct_ib_mlx5_devx_reg_crossing_key(uct_ib_md_t *ib_md, uint32_t target_gvmi_id, uint32_t target_mkey, uct_ib_mem_t *ib_memh) { uct_ib_mlx5_md_t *md = ucs_derived_of(ib_md, uct_ib_mlx5_md_t); @@ -951,14 +952,16 @@ uct_ib_mlx5_devx_reg_crossing_key(uct_ib_md_t *ib_md, void *address, dv.pd.out = &dvpd; mlx5dv_init_obj(&dv, MLX5DV_OBJ_PD); - ucs_print("reg_crossin address=%p", address); + ucs_print("reg crossing target_mkey=0x%x", target_mkey); mkc = UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); UCT_IB_MLX5DV_SET(create_mkey_in, in, opcode, UCT_IB_MLX5_CMD_OP_CREATE_MKEY); UCT_IB_MLX5DV_SET(create_mkey_in, in, translations_octword_actual_size, 0); - UCT_IB_MLX5DV_SET(mkc, mkc, access_mode_1_0, UCT_IB_MLX5_MKC_ACCESS_MODE_CROSSING_VHCA & 0x3); - UCT_IB_MLX5DV_SET(mkc, mkc, access_mode_4_2, (UCT_IB_MLX5_MKC_ACCESS_MODE_CROSSING_VHCA & 0x1C) >> 2); + UCT_IB_MLX5DV_SET(mkc, mkc, access_mode_1_0, + UCT_IB_MLX5_MKC_ACCESS_MODE_CROSSING_VHCA & 0x3); + UCT_IB_MLX5DV_SET(mkc, mkc, access_mode_4_2, + (UCT_IB_MLX5_MKC_ACCESS_MODE_CROSSING_VHCA & 0x1C) >> 2); UCT_IB_MLX5DV_SET(mkc, mkc, a, 1); UCT_IB_MLX5DV_SET(mkc, mkc, rw, 1); UCT_IB_MLX5DV_SET(mkc, mkc, rr, 1); @@ -968,7 +971,7 @@ uct_ib_mlx5_devx_reg_crossing_key(uct_ib_md_t *ib_md, void *address, UCT_IB_MLX5DV_SET(mkc, mkc, crossing_target_gvmi_id, target_gvmi_id); UCT_IB_MLX5DV_SET(mkc, mkc, crossing_target_mkey, target_mkey); UCT_IB_MLX5DV_SET(mkc, mkc, qpn, 0xffffff); - UCT_IB_MLX5DV_SET(mkc, mkc, mkey_7_0, (intptr_t)address & 0xff); + UCT_IB_MLX5DV_SET(mkc, mkc, mkey_7_0, UCT_IB_CROSS_KEY_IDX); 
UCT_IB_MLX5DV_SET64(mkc, mkc, start_addr, (intptr_t)0); UCT_IB_MLX5DV_SET(mkc, mkc, length64, 1); @@ -981,9 +984,9 @@ uct_ib_mlx5_devx_reg_crossing_key(uct_ib_md_t *ib_md, void *address, goto err_out; } - memh->super.lkey = - (UCT_IB_MLX5DV_GET(create_mkey_out, out, mkey_index) << 8) | - ((intptr_t)address & 0xff); + memh->super.lkey = (UCT_IB_MLX5DV_GET(create_mkey_out, out, mkey_index) + << 8) | + UCT_IB_CROSS_KEY_IDX; memh->super.rkey = memh->super.lkey; ucs_print("crossing mkey is %x", memh->super.lkey); diff --git a/test/apps/test_uct_xgvmi.c b/test/apps/test_uct_xgvmi.c index 6a864a3bd08..835782d3ea0 100644 --- a/test/apps/test_uct_xgvmi.c +++ b/test/apps/test_uct_xgvmi.c @@ -30,6 +30,7 @@ typedef struct { const char *md_name; int gvmi_id; size_t size; + size_t align; uint32_t mkey; } cmd_args_t; @@ -108,7 +109,7 @@ void do_export(uct_md_h md, uct_component_h component, void *ptr; int ret; - ret = posix_memalign(&ptr, 65536, cmd_args->size); + ret = posix_memalign(&ptr, cmd_args->align, cmd_args->size); CHKERR_JUMP(0 != ret, "allocate memory", error_ret); reg_shared_params.address = ptr; @@ -124,8 +125,8 @@ void do_export(uct_md_h md, uct_component_h component, status = uct_rkey_unpack(component, rkey_buf, &rkey_bundle); CHKERR_JUMP(UCS_OK != status, "uct_rkey_unpack", error_ret); - printf("shared rkey 0x%x for gvmi %d\n", (uint32_t)rkey_bundle.rkey, - cmd_args->gvmi_id); + printf("shared ptr %p len %zu rkey 0x%x towards gvmi %d\n", ptr, + cmd_args->size, (uint32_t)rkey_bundle.rkey, cmd_args->gvmi_id); printf("press any key to continue\n"); getchar(); @@ -168,8 +169,9 @@ int main(int argc, char** argv) args.gvmi_id = 0; args.mkey = 0; args.size = 1024 * 1024; + args.align = 65536; - while ((c = getopt(argc, argv, "d:g:i:")) != -1) { + while ((c = getopt(argc, argv, "d:g:i:s:a:")) != -1) { switch (c) { case 'd': args.md_name = optarg; @@ -180,8 +182,14 @@ int main(int argc, char** argv) case 'i': args.mkey = strtol(optarg, NULL, 0); break; + case 's': + 
args.size = strtoll(optarg, NULL, 0); + break; + case 'a': + args.align = strtoll(optarg, NULL, 0); + break; default: - printf("Usage: %s [-d ] [-g ] [-i ]\n", + printf("Usage: %s [-d ] [-g ] [-i ] [ -s size ] [ -a align ]\n", argv[0]); return -1; } From 55da8e5c847aae85e927d5b7b56adc839eeaf449 Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Mon, 21 Feb 2022 22:58:35 +0000 Subject: [PATCH 14/27] Fix UCT rcache --- src/uct/ib/base/ib_md.c | 13 +++-- src/uct/ib/base/ib_md.h | 1 + src/uct/ib/mlx5/dv/ib_mlx5dv_md.c | 86 +++++++++++++++++++++++++------ test/apps/test_uct_xgvmi.c | 11 ++++ 4 files changed, 91 insertions(+), 20 deletions(-) diff --git a/src/uct/ib/base/ib_md.c b/src/uct/ib/base/ib_md.c index b0f405136ff..4906d62dff9 100644 --- a/src/uct/ib/base/ib_md.c +++ b/src/uct/ib/base/ib_md.c @@ -862,6 +862,7 @@ uct_ib_md_mem_reg_shared(uct_md_h uct_md, uct_md_mem_reg_shared_params_t *params return status; } + ib_memh->flags |= UCT_IB_MEM_FLAG_NO_RCACHE; *memh_p = ib_memh; return UCS_OK; } @@ -884,6 +885,7 @@ uct_ib_md_import_shared_rkey(uct_md_h uct_md, return status; } + ib_memh->flags |= UCT_IB_MEM_FLAG_NO_RCACHE; *memh_p = ib_memh; return UCS_OK; } @@ -1064,11 +1066,16 @@ static ucs_status_t uct_ib_mem_rcache_dereg(uct_md_h uct_md, const uct_md_mem_dereg_params_t *params) { - uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t); + uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t); + uct_ib_mem_t *ib_memh = (uct_ib_mem_t*)params->memh; uct_ib_rcache_region_t *region; UCT_MD_MEM_DEREG_CHECK_PARAMS(params, 1); + if (ib_memh->flags & UCT_IB_MEM_FLAG_NO_RCACHE) { + return uct_ib_mem_dereg(uct_md, params); + } + region = uct_ib_rcache_region_from_memh(params->memh); if (UCT_MD_MEM_DEREG_FIELD_VALUE(params, flags, FIELD_FLAGS, 0) & UCT_MD_MEM_DEREG_FLAG_INVALIDATE) { @@ -1086,8 +1093,8 @@ static uct_md_ops_t uct_ib_md_rcache_ops = { .query = uct_ib_md_query, .mem_reg = uct_ib_mem_rcache_reg, .mem_dereg = uct_ib_mem_rcache_dereg, - .mem_reg_shared = 
uct_ib_md_mem_reg_shared, - .import_shared_rkey = uct_ib_md_import_shared_rkey, + .mem_reg_shared = uct_ib_md_mem_reg_shared, + .import_shared_rkey = uct_ib_md_import_shared_rkey, .mem_advise = uct_ib_mem_advise, .mkey_pack = uct_ib_mkey_pack, .is_sockaddr_accessible = ucs_empty_function_return_zero_int, diff --git a/src/uct/ib/base/ib_md.h b/src/uct/ib/base/ib_md.h index c11e2a4ef06..7242019e9bf 100644 --- a/src/uct/ib/base/ib_md.h +++ b/src/uct/ib/base/ib_md.h @@ -54,6 +54,7 @@ enum { UCT_IB_MEM_FLAG_RELAXED_ORDERING = UCS_BIT(4), /**< The memory region will issue PCIe writes with relaxed order attribute */ + UCT_IB_MEM_FLAG_NO_RCACHE = UCS_BIT(5) }; enum { diff --git a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c index 50d30a4c7c5..9a45f7103db 100644 --- a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c +++ b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c @@ -26,10 +26,19 @@ typedef union uct_ib_mlx5_mr { uct_ib_mlx5_ksm_data_t *ksm_data; } uct_ib_mlx5_mr_t; +typedef enum { + UCT_IB_MLX5_MEM_REG, + UCT_IB_MLX5_MEM_CROSSED, + UCT_IB_MLX5_MEM_CROSSING +} uct_ib_mlx5_mem_type_t; + typedef struct uct_ib_mlx5_mem { uct_ib_mem_t super; + uct_ib_mlx5_mem_type_t type; #if HAVE_DEVX struct mlx5dv_devx_obj *atomic_dvmr; + struct mlx5dv_devx_umem *umem; + struct mlx5dv_devx_obj *cross_mr; #endif uct_ib_mlx5_mr_t mrs[]; } uct_ib_mlx5_mem_t; @@ -43,6 +52,9 @@ static ucs_status_t uct_ib_mlx5_reg_key(uct_ib_md_t *md, void *address, { uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + memh->type = UCT_IB_MLX5_MEM_REG; + + ucs_print("reg key %p type %d", memh, mr_type); return uct_ib_reg_key_impl(md, address, length, access_flags, ib_memh, &memh->mrs[mr_type].super, mr_type, silent); } @@ -52,8 +64,41 @@ static ucs_status_t uct_ib_mlx5_dereg_key(uct_ib_md_t *md, uct_ib_mr_type_t mr_type) { uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + int ret; + + ucs_print("de-reg key %p type %d", memh, mr_type); + switch (memh->type) { + case 
UCT_IB_MLX5_MEM_REG: + return uct_ib_dereg_mr(memh->mrs[mr_type].super.ib); + case UCT_IB_MLX5_MEM_CROSSED: + if (mr_type != UCT_IB_MR_DEFAULT) { + return UCS_OK; + } + ret = mlx5dv_devx_obj_destroy(memh->cross_mr); + if (ret < 0) { + ucs_warn("mlx5dv_devx_obj_destroy(crossmr) failed: %m"); + return UCS_ERR_IO_ERROR; + } - return uct_ib_dereg_mr(memh->mrs[mr_type].super.ib); + ret = mlx5dv_devx_umem_dereg(memh->umem); + if (ret < 0) { + ucs_warn("mlx5dv_devx_umem_dereg(crossmr) failed: %m"); + return UCS_ERR_IO_ERROR; + } + return UCS_OK; + case UCT_IB_MLX5_MEM_CROSSING: + if (mr_type != UCT_IB_MR_DEFAULT) { + return UCS_OK; + } + ret = mlx5dv_devx_obj_destroy(memh->cross_mr); + if (ret < 0) { + ucs_warn("mlx5dv_devx_obj_destroy(crossmr) failed: %m"); + return UCS_ERR_IO_ERROR; + } + return UCS_OK; + default: + return UCS_ERR_INVALID_PARAM; + } } static ucs_status_t uct_ib_mlx5_reg_atomic_key(uct_ib_md_t *ibmd, @@ -872,16 +917,19 @@ uct_ib_mlx5_devx_reg_crossed_key(uct_ib_md_t *ib_md, void *address, uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); char in[UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_in)] = {0}; char out[UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_out)] = {0}; - struct mlx5dv_devx_umem *mem; - struct mlx5dv_devx_obj *mr; void *mkc; ucs_status_t status; + ucs_print("reg key %p crossed", memh); + + memh->type = UCT_IB_MLX5_MEM_CROSSED; + ucs_print("umr_reg crosses address=%p length=%zu", address, length); - mem = mlx5dv_devx_umem_reg(md->super.dev.ibv_context, address, length, - IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | - IBV_ACCESS_REMOTE_WRITE); - if (mem == NULL) { + memh->umem = mlx5dv_devx_umem_reg(md->super.dev.ibv_context, address, length, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE); + if (memh->umem == NULL) { ucs_error("mlx5dv_devx_umem_reg() failed: %m"); status = UCS_ERR_NO_MEMORY; goto err_out; @@ -892,7 +940,7 @@ uct_ib_mlx5_devx_reg_crossed_key(uct_ib_md_t *ib_md, void *address, 
UCT_IB_MLX5DV_SET(create_mkey_in, in, opcode, UCT_IB_MLX5_CMD_OP_CREATE_MKEY); UCT_IB_MLX5DV_SET(create_mkey_in, in, pg_access, 1); UCT_IB_MLX5DV_SET(create_mkey_in, in, mkey_umem_valid, 1); - UCT_IB_MLX5DV_SET(create_mkey_in, in, mkey_umem_id, mem->umem_id); + UCT_IB_MLX5DV_SET(create_mkey_in, in, mkey_umem_id, memh->umem->umem_id); UCT_IB_MLX5DV_SET(mkc, mkc, access_mode_1_0, UCT_IB_MLX5_MKC_ACCESS_MODE_MTT); UCT_IB_MLX5DV_SET(mkc, mkc, a, 1); UCT_IB_MLX5DV_SET(mkc, mkc, rw, 1); @@ -909,9 +957,9 @@ uct_ib_mlx5_devx_reg_crossed_key(uct_ib_md_t *ib_md, void *address, UCT_IB_MLX5DV_SET64(mkc, mkc, start_addr, (intptr_t)address); UCT_IB_MLX5DV_SET64(mkc, mkc, len, length); - mr = mlx5dv_devx_obj_create(md->super.dev.ibv_context, in, sizeof(in), out, - sizeof(out)); - if (mr == NULL) { + memh->cross_mr = mlx5dv_devx_obj_create(md->super.dev.ibv_context, in, + sizeof(in), out, sizeof(out)); + if (memh->cross_mr == NULL) { ucs_error("mlx5dv_devx_obj_create() failed, syndrome %x: %m", UCT_IB_MLX5DV_GET(create_mkey_out, out, syndrome)); status = UCS_ERR_UNSUPPORTED; @@ -928,7 +976,7 @@ uct_ib_mlx5_devx_reg_crossed_key(uct_ib_md_t *ib_md, void *address, status = UCS_OK; err_free: - mlx5dv_devx_umem_dereg(mem); + mlx5dv_devx_umem_dereg(memh->umem); err_out: return status; @@ -944,10 +992,12 @@ uct_ib_mlx5_devx_reg_crossing_key(uct_ib_md_t *ib_md, uint32_t target_gvmi_id, char out[UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_out)] = {0}; struct mlx5dv_pd dvpd = {}; struct mlx5dv_obj dv = {}; - struct mlx5dv_devx_obj *mr; void *mkc; ucs_status_t status; + ucs_print("reg key %p crossing", memh); + memh->type = UCT_IB_MLX5_MEM_CROSSING; + dv.pd.in = md->super.pd; dv.pd.out = &dvpd; mlx5dv_init_obj(&dv, MLX5DV_OBJ_PD); @@ -975,10 +1025,10 @@ uct_ib_mlx5_devx_reg_crossing_key(uct_ib_md_t *ib_md, uint32_t target_gvmi_id, UCT_IB_MLX5DV_SET64(mkc, mkc, start_addr, (intptr_t)0); UCT_IB_MLX5DV_SET(mkc, mkc, length64, 1); - mr = mlx5dv_devx_obj_create(md->super.dev.ibv_context, in, 
sizeof(in), out, - sizeof(out)); - if (mr == NULL) { - ucs_fatal("mlx5dv_devx_obj_create() failed, syndrome %x: %m", + memh->cross_mr = mlx5dv_devx_obj_create(md->super.dev.ibv_context, in, + sizeof(in), out, sizeof(out)); + if (memh->cross_mr == NULL) { + ucs_error("mlx5dv_devx_obj_create() failed, syndrome %x: %m", UCT_IB_MLX5DV_GET(create_mkey_out, out, syndrome)); status = UCS_ERR_UNSUPPORTED; goto err_out; @@ -1220,6 +1270,8 @@ static uct_ib_md_ops_t uct_ib_mlx5_md_ops = { .dereg_multithreaded = (uct_ib_md_dereg_multithreaded_func_t)ucs_empty_function_return_unsupported, .mem_prefetch = uct_ib_mlx5_mem_prefetch, .get_atomic_mr_id = (uct_ib_md_get_atomic_mr_id_func_t)ucs_empty_function_return_unsupported, + .reg_crossed_key = uct_ib_mlx5_devx_reg_crossed_key, + .reg_crossing_key = uct_ib_mlx5_devx_reg_crossing_key, }; UCT_IB_MD_OPS(uct_ib_mlx5_md_ops, 1); diff --git a/test/apps/test_uct_xgvmi.c b/test/apps/test_uct_xgvmi.c index 835782d3ea0..dc64bf40d83 100644 --- a/test/apps/test_uct_xgvmi.c +++ b/test/apps/test_uct_xgvmi.c @@ -130,6 +130,14 @@ void do_export(uct_md_h md, uct_component_h component, printf("press any key to continue\n"); getchar(); + status = uct_rkey_release(component, &rkey_bundle); + CHKERR_JUMP(UCS_OK != status, "uct_rkey_release", error_ret); + + status = uct_md_mem_dereg(md, memh); + CHKERR_JUMP(UCS_OK != status, "uct_md_mem_dereg", error_ret); + + free(ptr); + error_ret: ; } @@ -152,6 +160,9 @@ void do_import(uct_md_h md, uct_component_h component, printf("imported shared rkey memh=%p\n", memh); + status = uct_md_mem_dereg(md, memh); + CHKERR_JUMP(UCS_OK != status, "uct_md_mem_dereg", error_ret); + error_ret: ; } From c9d7bd0dcc2147b97c95ce1a14ac923d68710f8b Mon Sep 17 00:00:00 2001 From: Mikhail Brinskii Date: Mon, 21 Feb 2022 23:16:25 +0000 Subject: [PATCH 15/27] UCP: Prints and bug fixes --- src/ucp/core/ucp_mm.c | 28 ++++++++++++++++++++++++---- src/ucp/core/ucp_mm.h | 1 + src/ucp/core/ucp_mm.inl | 2 ++ src/ucp/core/ucp_rkey.c | 4 
++-- src/uct/ib/mlx5/dv/ib_mlx5dv_md.c | 3 ++- 5 files changed, 31 insertions(+), 7 deletions(-) diff --git a/src/ucp/core/ucp_mm.c b/src/ucp/core/ucp_mm.c index 40e41a0e16a..8c0be23b423 100644 --- a/src/ucp/core/ucp_mm.c +++ b/src/ucp/core/ucp_mm.c @@ -362,6 +362,8 @@ static ucs_status_t ucp_memh_register(ucp_context_h context, ucp_mem_h memh, status = uct_md_mem_reg_shared(context->tl_mds[md_index].md, &reg_shared_params, &memh->uct[md_index]); + ucs_print("reg shared memh: addr %p, len %zu, gvmi %d, status %s", + address, length, memh->peer_id, ucs_status_string(status)); } else { ucs_assert_always(memh->peer_id == UCP_NULL_RESOURCE); status = uct_md_mem_reg(context->tl_mds[md_index].md, @@ -382,6 +384,10 @@ static ucs_status_t ucp_memh_register(ucp_context_h context, ucp_mem_h memh, md_map_registered |= UCS_BIT(md_index); } + ucs_print("Registered %s memh, mdmap 0x%lx (cur 0x%lx) gvmi %d", + ((uct_flags & UCT_MD_FLAG_SHARED_RKEY) ? "shared":""), + md_map_registered,memh->md_map, memh->peer_id); + memh->md_map |= md_map_registered; return UCS_OK; } @@ -408,7 +414,7 @@ ucp_memh_get_slow(ucp_context_h context, void *address, size_t length, } if (context->rcache == NULL) { - memh = ucs_calloc(1, sizeof(ucp_mem_h) + + memh = ucs_calloc(1, sizeof(*memh) + (sizeof(uct_mem_h) * context->num_mds), "ucp rcache"); if (memh == NULL) { return UCS_ERR_NO_MEMORY; @@ -478,6 +484,7 @@ ucp_memh_alloc(ucp_context_h context, void *address, size_t length, status = ucp_memh_get_slow(context, mem.address, mem.length, mem.mem_type, reg_md_map, uct_flags, peer_id, &memh); + memh->imported = 0; } else { status = ucp_memh_import(context, rkey, address, length, &memh); } @@ -523,6 +530,7 @@ ucs_status_t ucp_memh_import(ucp_context_h context, ucp_rkey_h rkey, return UCS_ERR_NO_MEMORY; } + ucs_print("ucp_memh_import, rkey mdmap 0x%lx rkey gvmi %d", rkey->md_map, rkey->peer_id); ucs_for_each_bit(md_index, rkey->md_map) { md_attr = &context->tl_mds[md_index].attr; @@ -530,6 +538,10 @@
ucs_status_t ucp_memh_import(ucp_context_h context, ucp_rkey_h rkey, import_params.rkey = rkey->tl_rkey[md_index].rkey.rkey; import_params.source_gvmi = rkey->peer_id; + ucs_print("registering address %p length %zu on md[%d]=%s gvmi %d rkey %lx", + address, length, md_index, + context->tl_mds[md_index].rsc.md_name, + import_params.source_gvmi, import_params.rkey); status = uct_md_import_shared_rkey(context->tl_mds[md_index].md, &import_params, &memh->uct[md_index]); @@ -548,8 +560,9 @@ ucs_status_t ucp_memh_import(ucp_context_h context, ucp_rkey_h rkey, md_map_registered |= UCS_BIT(md_index); } - memh->md_map |= md_map_registered; - *memh_p = memh; + memh->md_map = md_map_registered; + memh->imported = 1; + *memh_p = memh; return UCS_OK; } @@ -666,7 +679,14 @@ ucs_status_t ucp_mem_map(ucp_context_h context, const ucp_mem_map_params_t *para ucs_status_t ucp_mem_unmap(ucp_context_h context, ucp_mem_h memh) { UCP_THREAD_CS_ENTER(&context->mt_lock); - ucp_memh_put(context, memh, 1); + ucs_print("unreg memh %p, map md0x%lx, imported %d", + memh, memh->md_map, memh->imported); + if (memh->imported) { + ucp_memh_dereg(context, memh, memh->md_map); + ucs_free(memh); + } else { + ucp_memh_put(context, memh, 1); + } UCP_THREAD_CS_EXIT(&context->mt_lock); return UCS_OK; } diff --git a/src/ucp/core/ucp_mm.h b/src/ucp/core/ucp_mm.h index aa4f6b48938..0d40db3da5c 100644 --- a/src/ucp/core/ucp_mm.h +++ b/src/ucp/core/ucp_mm.h @@ -32,6 +32,7 @@ typedef struct ucp_mem { ucp_md_index_t alloc_md_index; /* Index of MD used to allocated the memory */ ucp_md_map_t md_map; /* Which MDs have valid memory handles */ ucp_rsc_index_t peer_id; /* Peer id for shared memh */ + int imported; uct_mem_h uct[0]; /* Sparse memory handles array num_mds in size */ } ucp_mem_t; diff --git a/src/ucp/core/ucp_mm.inl b/src/ucp/core/ucp_mm.inl index 0c4443b1ca9..36c1f7a5134 100644 --- a/src/ucp/core/ucp_mm.inl +++ b/src/ucp/core/ucp_mm.inl @@ -38,6 +38,8 @@ ucp_memh_get(ucp_context_h context, void 
*address, size_t length, return status; } return UCS_OK; + } else { + (*memh_p)->imported = 0; } if (ucs_likely(context->rcache != NULL)) { diff --git a/src/ucp/core/ucp_rkey.c b/src/ucp/core/ucp_rkey.c index 1202d0e9dd9..a86561ec5cc 100644 --- a/src/ucp/core/ucp_rkey.c +++ b/src/ucp/core/ucp_rkey.c @@ -57,7 +57,7 @@ size_t ucp_rkey_packed_size(ucp_context_h context, ucp_md_map_t md_map, size += sizeof(uint8_t); /* Memory type */ /* Always include shared key info for now */ - size += sizeof(uint32_t); /* Peer id */ + size += sizeof(uint8_t); /* Peer id */ ucs_for_each_bit(md_index, md_map) { tl_rkey_size = context->tl_mds[md_index].attr.rkey_packed_size; @@ -232,7 +232,7 @@ ucs_status_t ucp_rkey_pack(ucp_context_h context, ucp_mem_h memh, /* always acquire context lock */ UCP_THREAD_CS_ENTER(&context->mt_lock); - ucs_trace("packing rkeys for buffer %p memh %p md_map 0x%"PRIx64, + ucs_print("packing rkeys for buffer %p memh %p md_map 0x%"PRIx64, ucp_memh_address(memh), memh, memh->md_map); if (ucp_memh_is_zero_length(memh)) { diff --git a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c index 45bd126c840..481fc78dc43 100644 --- a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c +++ b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c @@ -951,7 +951,8 @@ uct_ib_mlx5_devx_reg_crossing_key(uct_ib_md_t *ib_md, void *address, dv.pd.out = &dvpd; mlx5dv_init_obj(&dv, MLX5DV_OBJ_PD); - ucs_print("reg_crossin address=%p", address); + ucs_print("reg_crossin address=%p target mkey 0x%x gvmi id %d", + address, target_mkey, target_gvmi_id); mkc = UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); From b31b8544e59252541499999ed61ce62a044b9175 Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Mon, 21 Feb 2022 22:58:35 +0000 Subject: [PATCH 16/27] Fix UCT rcache Conflicts: test/apps/test_uct_xgvmi.c --- src/uct/ib/base/ib_md.c | 13 +- src/uct/ib/base/ib_md.h | 1 + src/uct/ib/mlx5/dv/ib_mlx5dv_md.c | 86 +++++++++--- test/apps/test_uct_xgvmi.c | 224 ++++++++++++++++++++++++++++++ 
4 files changed, 304 insertions(+), 20 deletions(-) create mode 100644 test/apps/test_uct_xgvmi.c diff --git a/src/uct/ib/base/ib_md.c b/src/uct/ib/base/ib_md.c index 9a24942ce95..c9314080eef 100644 --- a/src/uct/ib/base/ib_md.c +++ b/src/uct/ib/base/ib_md.c @@ -863,6 +863,7 @@ uct_ib_md_mem_reg_shared(uct_md_h uct_md, uct_md_mem_reg_shared_params_t *params return status; } + ib_memh->flags |= UCT_IB_MEM_FLAG_NO_RCACHE; *memh_p = ib_memh; return UCS_OK; } @@ -886,6 +887,7 @@ uct_ib_md_import_shared_rkey(uct_md_h uct_md, return status; } + ib_memh->flags |= UCT_IB_MEM_FLAG_NO_RCACHE; *memh_p = ib_memh; return UCS_OK; } @@ -1066,11 +1068,16 @@ static ucs_status_t uct_ib_mem_rcache_dereg(uct_md_h uct_md, const uct_md_mem_dereg_params_t *params) { - uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t); + uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t); + uct_ib_mem_t *ib_memh = (uct_ib_mem_t*)params->memh; uct_ib_rcache_region_t *region; UCT_MD_MEM_DEREG_CHECK_PARAMS(params, 1); + if (ib_memh->flags & UCT_IB_MEM_FLAG_NO_RCACHE) { + return uct_ib_mem_dereg(uct_md, params); + } + region = uct_ib_rcache_region_from_memh(params->memh); if (UCT_MD_MEM_DEREG_FIELD_VALUE(params, flags, FIELD_FLAGS, 0) & UCT_MD_MEM_DEREG_FLAG_INVALIDATE) { @@ -1088,8 +1095,8 @@ static uct_md_ops_t uct_ib_md_rcache_ops = { .query = uct_ib_md_query, .mem_reg = uct_ib_mem_rcache_reg, .mem_dereg = uct_ib_mem_rcache_dereg, - .mem_reg_shared = uct_ib_md_mem_reg_shared, - .import_shared_rkey = uct_ib_md_import_shared_rkey, + .mem_reg_shared = uct_ib_md_mem_reg_shared, + .import_shared_rkey = uct_ib_md_import_shared_rkey, .mem_advise = uct_ib_mem_advise, .mkey_pack = uct_ib_mkey_pack, .is_sockaddr_accessible = ucs_empty_function_return_zero_int, diff --git a/src/uct/ib/base/ib_md.h b/src/uct/ib/base/ib_md.h index 6fd7c109027..a69f2f9512f 100644 --- a/src/uct/ib/base/ib_md.h +++ b/src/uct/ib/base/ib_md.h @@ -54,6 +54,7 @@ enum { UCT_IB_MEM_FLAG_RELAXED_ORDERING = UCS_BIT(4), /**< The memory 
region will issue PCIe writes with relaxed order attribute */ + UCT_IB_MEM_FLAG_NO_RCACHE = UCS_BIT(5) }; enum { diff --git a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c index 481fc78dc43..57822d346a9 100644 --- a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c +++ b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c @@ -26,10 +26,19 @@ typedef union uct_ib_mlx5_mr { uct_ib_mlx5_ksm_data_t *ksm_data; } uct_ib_mlx5_mr_t; +typedef enum { + UCT_IB_MLX5_MEM_REG, + UCT_IB_MLX5_MEM_CROSSED, + UCT_IB_MLX5_MEM_CROSSING +} uct_ib_mlx5_mem_type_t; + typedef struct uct_ib_mlx5_mem { uct_ib_mem_t super; + uct_ib_mlx5_mem_type_t type; #if HAVE_DEVX struct mlx5dv_devx_obj *atomic_dvmr; + struct mlx5dv_devx_umem *umem; + struct mlx5dv_devx_obj *cross_mr; #endif uct_ib_mlx5_mr_t mrs[]; } uct_ib_mlx5_mem_t; @@ -43,6 +52,9 @@ static ucs_status_t uct_ib_mlx5_reg_key(uct_ib_md_t *md, void *address, { uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + memh->type = UCT_IB_MLX5_MEM_REG; + + ucs_print("reg key %p type %d", memh, mr_type); return uct_ib_reg_key_impl(md, address, length, access_flags, ib_memh, &memh->mrs[mr_type].super, mr_type, silent); } @@ -52,8 +64,41 @@ static ucs_status_t uct_ib_mlx5_dereg_key(uct_ib_md_t *md, uct_ib_mr_type_t mr_type) { uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + int ret; + + ucs_print("de-reg key %p type %d", memh, mr_type); + switch (memh->type) { + case UCT_IB_MLX5_MEM_REG: + return uct_ib_dereg_mr(memh->mrs[mr_type].super.ib); + case UCT_IB_MLX5_MEM_CROSSED: + if (mr_type != UCT_IB_MR_DEFAULT) { + return UCS_OK; + } + ret = mlx5dv_devx_obj_destroy(memh->cross_mr); + if (ret < 0) { + ucs_warn("mlx5dv_devx_obj_destroy(crossmr) failed: %m"); + return UCS_ERR_IO_ERROR; + } - return uct_ib_dereg_mr(memh->mrs[mr_type].super.ib); + ret = mlx5dv_devx_umem_dereg(memh->umem); + if (ret < 0) { + ucs_warn("mlx5dv_devx_umem_dereg(crossmr) failed: %m"); + return UCS_ERR_IO_ERROR; + } + return UCS_OK; + case 
UCT_IB_MLX5_MEM_CROSSING: + if (mr_type != UCT_IB_MR_DEFAULT) { + return UCS_OK; + } + ret = mlx5dv_devx_obj_destroy(memh->cross_mr); + if (ret < 0) { + ucs_warn("mlx5dv_devx_obj_destroy(crossmr) failed: %m"); + return UCS_ERR_IO_ERROR; + } + return UCS_OK; + default: + return UCS_ERR_INVALID_PARAM; + } } static ucs_status_t uct_ib_mlx5_reg_atomic_key(uct_ib_md_t *ibmd, @@ -870,16 +915,19 @@ uct_ib_mlx5_devx_reg_crossed_key(uct_ib_md_t *ib_md, void *address, uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); char in[UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_in)] = {0}; char out[UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_out)] = {0}; - struct mlx5dv_devx_umem *mem; - struct mlx5dv_devx_obj *mr; void *mkc; ucs_status_t status; + ucs_print("reg key %p crossed", memh); + + memh->type = UCT_IB_MLX5_MEM_CROSSED; + ucs_print("umr_reg crosses address=%p length=%zu", address, length); - mem = mlx5dv_devx_umem_reg(md->super.dev.ibv_context, address, length, - IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | - IBV_ACCESS_REMOTE_WRITE); - if (mem == NULL) { + memh->umem = mlx5dv_devx_umem_reg(md->super.dev.ibv_context, address, length, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE); + if (memh->umem == NULL) { ucs_error("mlx5dv_devx_umem_reg() failed: %m"); status = UCS_ERR_NO_MEMORY; goto err_out; @@ -890,7 +938,7 @@ uct_ib_mlx5_devx_reg_crossed_key(uct_ib_md_t *ib_md, void *address, UCT_IB_MLX5DV_SET(create_mkey_in, in, opcode, UCT_IB_MLX5_CMD_OP_CREATE_MKEY); UCT_IB_MLX5DV_SET(create_mkey_in, in, pg_access, 1); UCT_IB_MLX5DV_SET(create_mkey_in, in, mkey_umem_valid, 1); - UCT_IB_MLX5DV_SET(create_mkey_in, in, mkey_umem_id, mem->umem_id); + UCT_IB_MLX5DV_SET(create_mkey_in, in, mkey_umem_id, memh->umem->umem_id); UCT_IB_MLX5DV_SET(mkc, mkc, access_mode_1_0, UCT_IB_MLX5_MKC_ACCESS_MODE_MTT); UCT_IB_MLX5DV_SET(mkc, mkc, a, 1); UCT_IB_MLX5DV_SET(mkc, mkc, rw, 1); @@ -907,9 +955,9 @@ uct_ib_mlx5_devx_reg_crossed_key(uct_ib_md_t *ib_md, 
void *address, UCT_IB_MLX5DV_SET64(mkc, mkc, start_addr, (intptr_t)address); UCT_IB_MLX5DV_SET64(mkc, mkc, len, length); - mr = mlx5dv_devx_obj_create(md->super.dev.ibv_context, in, sizeof(in), out, - sizeof(out)); - if (mr == NULL) { + memh->cross_mr = mlx5dv_devx_obj_create(md->super.dev.ibv_context, in, + sizeof(in), out, sizeof(out)); + if (memh->cross_mr == NULL) { ucs_error("mlx5dv_devx_obj_create() failed, syndrome %x: %m", UCT_IB_MLX5DV_GET(create_mkey_out, out, syndrome)); status = UCS_ERR_UNSUPPORTED; @@ -926,7 +974,7 @@ uct_ib_mlx5_devx_reg_crossed_key(uct_ib_md_t *ib_md, void *address, status = UCS_OK; err_free: - mlx5dv_devx_umem_dereg(mem); + mlx5dv_devx_umem_dereg(memh->umem); err_out: return status; @@ -943,10 +991,12 @@ uct_ib_mlx5_devx_reg_crossing_key(uct_ib_md_t *ib_md, void *address, char out[UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_out)] = {0}; struct mlx5dv_pd dvpd = {}; struct mlx5dv_obj dv = {}; - struct mlx5dv_devx_obj *mr; void *mkc; ucs_status_t status; + ucs_print("reg key %p crossing", memh); + memh->type = UCT_IB_MLX5_MEM_CROSSING; + dv.pd.in = md->super.pd; dv.pd.out = &dvpd; mlx5dv_init_obj(&dv, MLX5DV_OBJ_PD); @@ -973,10 +1023,10 @@ uct_ib_mlx5_devx_reg_crossing_key(uct_ib_md_t *ib_md, void *address, UCT_IB_MLX5DV_SET64(mkc, mkc, start_addr, (intptr_t)0); UCT_IB_MLX5DV_SET(mkc, mkc, length64, 1); - mr = mlx5dv_devx_obj_create(md->super.dev.ibv_context, in, sizeof(in), out, - sizeof(out)); - if (mr == NULL) { - ucs_fatal("mlx5dv_devx_obj_create() failed, syndrome %x: %m", + memh->cross_mr = mlx5dv_devx_obj_create(md->super.dev.ibv_context, in, + sizeof(in), out, sizeof(out)); + if (memh->cross_mr == NULL) { + ucs_error("mlx5dv_devx_obj_create() failed, syndrome %x: %m", UCT_IB_MLX5DV_GET(create_mkey_out, out, syndrome)); status = UCS_ERR_UNSUPPORTED; goto err_out; @@ -1218,6 +1268,8 @@ static uct_ib_md_ops_t uct_ib_mlx5_md_ops = { .dereg_multithreaded = (uct_ib_md_dereg_multithreaded_func_t)ucs_empty_function_return_unsupported, 
.mem_prefetch = uct_ib_mlx5_mem_prefetch, .get_atomic_mr_id = (uct_ib_md_get_atomic_mr_id_func_t)ucs_empty_function_return_unsupported, + .reg_crossed_key = uct_ib_mlx5_devx_reg_crossed_key, + .reg_crossing_key = uct_ib_mlx5_devx_reg_crossing_key, }; UCT_IB_MD_OPS(uct_ib_mlx5_md_ops, 1); diff --git a/test/apps/test_uct_xgvmi.c b/test/apps/test_uct_xgvmi.c new file mode 100644 index 00000000000..dc64bf40d83 --- /dev/null +++ b/test/apps/test_uct_xgvmi.c @@ -0,0 +1,224 @@ +/** + * Copyright (C) 2022 NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#include +#include +#include +#include +#include +#include +#include + + +#define CHKERR_ACTION(_cond, _msg, _action) \ + do { \ + if (_cond) { \ + fprintf(stderr, "Failed to %s\n", _msg); \ + _action; \ + } \ + } while (0) + + +#define CHKERR_JUMP(_cond, _msg, _label) CHKERR_ACTION(_cond, _msg, goto _label) + +typedef struct { + char *server_name; + uint16_t server_port; + const char *md_name; + int gvmi_id; + size_t size; + size_t align; + uint32_t mkey; +} cmd_args_t; + + +/* Device and transport to be used are determined by minimum latency */ +static ucs_status_t open_md(const cmd_args_t *cmd_args, uct_md_h *md_p, + uct_component_h *component_p) +{ + uct_component_h *components; + unsigned num_components; + unsigned cmpt_index; + uct_component_attr_t component_attr; + unsigned md_index; + uct_md_config_t *md_config; + ucs_status_t status; + uct_md_h md; + + status = uct_query_components(&components, &num_components); + CHKERR_JUMP(UCS_OK != status, "query for components", error_ret); + + for (cmpt_index = 0; cmpt_index < num_components; ++cmpt_index) { + component_attr.field_mask = UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT; + status = uct_component_query(components[cmpt_index], &component_attr); + CHKERR_JUMP(UCS_OK != status, "query component attributes", + release_component_list); + + component_attr.field_mask = UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES; + 
component_attr.md_resources = alloca(sizeof(*component_attr.md_resources) * + component_attr.md_resource_count); + status = uct_component_query(components[cmpt_index], &component_attr); + CHKERR_JUMP(UCS_OK != status, "query for memory domain resources", + release_component_list); + + /* Iterate through memory domain resources */ + for (md_index = 0; md_index < component_attr.md_resource_count; ++md_index) { + status = uct_md_config_read(components[cmpt_index], NULL, NULL, + &md_config); + CHKERR_JUMP(UCS_OK != status, "read MD config", + release_component_list); + + if (strcmp(component_attr.md_resources[md_index].md_name, + cmd_args->md_name)) { + continue; + } + + status = uct_md_open(components[cmpt_index], + component_attr.md_resources[md_index].md_name, + md_config, &md); + uct_config_release(md_config); + + CHKERR_JUMP(UCS_OK != status, "open memory domains", + release_component_list); + + *md_p = md; + *component_p = components[cmpt_index]; + return UCS_OK; + } + } + + status = UCS_ERR_NO_DEVICE; + +release_component_list: + uct_release_component_list(components); +error_ret: + return status; +} + +void do_export(uct_md_h md, uct_component_h component, + const cmd_args_t *cmd_args) +{ + uct_md_mem_reg_shared_params_t reg_shared_params; + uct_rkey_bundle_t rkey_bundle; + uint8_t rkey_buf[1024]; + ucs_status_t status; + uct_mem_h memh; + void *ptr; + int ret; + + ret = posix_memalign(&ptr, cmd_args->align, cmd_args->size); + CHKERR_JUMP(0 != ret, "allocate memory", error_ret); + + reg_shared_params.address = ptr; + reg_shared_params.length = cmd_args->size; + reg_shared_params.dest_gvmi = cmd_args->gvmi_id; + + status = uct_md_mem_reg_shared(md, ®_shared_params, &memh); + CHKERR_JUMP(UCS_OK != status, "uct_md_mem_reg_shared", error_ret); + + status = uct_md_mkey_pack(md, memh, rkey_buf); + CHKERR_JUMP(UCS_OK != status, "uct_md_mkey_pack", error_ret); + + status = uct_rkey_unpack(component, rkey_buf, &rkey_bundle); + CHKERR_JUMP(UCS_OK != status, 
"uct_rkey_unpack", error_ret); + + printf("shared ptr %p len %zu rkey 0x%x towards gvmi %d\n", ptr, + cmd_args->size, (uint32_t)rkey_bundle.rkey, cmd_args->gvmi_id); + printf("press any key to continue\n"); + getchar(); + + status = uct_rkey_release(component, &rkey_bundle); + CHKERR_JUMP(UCS_OK != status, "uct_rkey_release", error_ret); + + status = uct_md_mem_dereg(md, memh); + CHKERR_JUMP(UCS_OK != status, "uct_md_mem_dereg", error_ret); + + free(ptr); + +error_ret: + ; +} + +void do_import(uct_md_h md, uct_component_h component, + const cmd_args_t *cmd_args) +{ + uct_md_import_shared_rkey_params_t import_params; + ucs_status_t status; + uct_mem_h memh; + + printf("unpacking mkey 0x%x on gvmi %d\n", cmd_args->mkey, + cmd_args->gvmi_id); + + import_params.rkey = cmd_args->mkey; + import_params.source_gvmi = cmd_args->gvmi_id; // TODO + + status = uct_md_import_shared_rkey(md, &import_params, &memh); + CHKERR_JUMP(UCS_OK != status, "uct_md_import_shared_rkey", error_ret); + + printf("imported shared rkey memh=%p\n", memh); + + status = uct_md_mem_dereg(md, memh); + CHKERR_JUMP(UCS_OK != status, "uct_md_mem_dereg", error_ret); + +error_ret: + ; +} + + +int main(int argc, char** argv) +{ + uct_component_h component; + ucs_status_t status; + uct_md_h md; + cmd_args_t args; + int c; + + args.md_name = "mlx5_0"; + args.gvmi_id = 0; + args.mkey = 0; + args.size = 1024 * 1024; + args.align = 65536; + + while ((c = getopt(argc, argv, "d:g:i:s:a:")) != -1) { + switch (c) { + case 'd': + args.md_name = optarg; + break; + case 'g': + args.gvmi_id = atoi(optarg); + break; + case 'i': + args.mkey = strtol(optarg, NULL, 0); + break; + case 's': + args.size = strtoll(optarg, NULL, 0); + break; + case 'a': + args.align = strtoll(optarg, NULL, 0); + break; + default: + printf("Usage: %s [-d ] [-g ] [-i ] [ -s size ] [ -a align ]\n", + argv[0]); + return -1; + } + } + + status = open_md(&args, &md, &component); + if (status != UCS_OK) { + printf("could not open md\n"); + return -2; 
+ } + + if (args.mkey) { + do_import(md, component, &args); + } else { + do_export(md, component, &args); + } + + uct_md_close(md); + + return 0; +} \ No newline at end of file From e255fe548caa8a22978f3c65ef7a6dd1be17594e Mon Sep 17 00:00:00 2001 From: Mikhail Brinskii Date: Tue, 22 Feb 2022 11:39:19 +0200 Subject: [PATCH 17/27] TEST: test fixes (revert to make send-recv) --- test/apps/Makefile.am | 9 +-------- test/gtest/ucp/test_ucp_am.cc | 12 ++++++------ 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/test/apps/Makefile.am b/test/apps/Makefile.am index e81283aeeda..1e787a8f3cf 100644 --- a/test/apps/Makefile.am +++ b/test/apps/Makefile.am @@ -21,8 +21,7 @@ noinst_PROGRAMS = \ test_link_map \ test_dlopen_cfg_print \ test_init_mt \ - test_memtrack_limit \ - test_uct_xgvmi + test_memtrack_limit objdir = $(shell sed -n -e 's/^objdir=\(.*\)$$/\1/p' $(LIBTOOL)) @@ -66,12 +65,6 @@ test_init_mt_CFLAGS = $(BASE_CFLAGS) $(OPENMP_CFLAGS) test_init_mt_LDADD = $(top_builddir)/src/ucp/libucp.la -test_uct_xgvmi_SOURCES = test_uct_xgvmi.c -test_uct_xgvmi_CPPFLAGS = $(BASE_CPPFLAGS) -test_uct_xgvmi_CFLAGS = $(BASE_CFLAGS) -test_uct_xgvmi_LDADD = $(top_builddir)/src/ucp/libuct.la - - if HAVE_CUDA noinst_PROGRAMS += test_cuda_hook_dynamic diff --git a/test/gtest/ucp/test_ucp_am.cc b/test/gtest/ucp/test_ucp_am.cc index cb2d037da1f..f9ca7080c83 100644 --- a/test/gtest/ucp/test_ucp_am.cc +++ b/test/gtest/ucp/test_ucp_am.cc @@ -41,8 +41,8 @@ class test_ucp_am_base : public ucp_test { modify_config("MAX_EAGER_LANES", "2"); ucp_test::init(); - // sender().connect(&receiver(), get_ep_params()); - // receiver().connect(&sender(), get_ep_params()); + sender().connect(&receiver(), get_ep_params()); + receiver().connect(&sender(), get_ep_params()); } protected: @@ -1314,7 +1314,7 @@ class test_ucp_am_nbx_rndv : public test_ucp_am_nbx { ASSERT_UCS_OK(ucp_rkey_pack(context, memh, &rkey_buf, &rkey_buf_size)); - ASSERT_UCS_OK(ucp_worker_rkey_unpack(sender().worker(), rkey_buf, + 
ASSERT_UCS_OK(ucp_ep_rkey_unpack(sender().ep(), rkey_buf, &mparams.rkey)); ucp_rkey_buffer_release(rkey_buf); @@ -1412,9 +1412,11 @@ class test_ucp_am_nbx_rndv : public test_ucp_am_nbx { ucp_request_param_t op_param; op_param.op_attr_mask = UCP_OP_ATTR_FIELD_MEMH | UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FIELD_USER_DATA | UCP_OP_ATTR_FLAG_NO_IMM_CMPL; op_param.memh = self->m_imp_memh; - op_param.cb.recv_am = am_data_recv_cb; + op_param.cb.recv_am = am_data_rndv_recv_cb; + op_param.user_data = self; ucs_status_ptr_t rptr = ucp_am_recv_data_nbx(self->receiver().worker(), data, address, length, &op_param); @@ -1537,7 +1539,6 @@ UCS_TEST_P(test_ucp_am_nbx_rndv, shared_mkey) void *address = alloc_memhs(sender().ucph(), length, &exp_memh, &imp_memh); ASSERT_TRUE(address != NULL); -#if 0 ucp_request_param_t param; param.op_attr_mask = UCP_OP_ATTR_FIELD_MEMH; param.memh = imp_memh; @@ -1546,7 +1547,6 @@ UCS_TEST_P(test_ucp_am_nbx_rndv, shared_mkey) EXPECT_EQ(m_status, request_wait(sptr)); EXPECT_TRUE(m_am_received); -#endif free_memhs(sender().ucph(), exp_memh, imp_memh); } From ae6857675d20c16fc9c174eb54c6c014a6656f67 Mon Sep 17 00:00:00 2001 From: Mikhail Brinskii Date: Tue, 22 Feb 2022 12:30:56 +0200 Subject: [PATCH 18/27] UCP: Add user memh prints --- src/ucp/core/ucp_request.inl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/ucp/core/ucp_request.inl b/src/ucp/core/ucp_request.inl index 7a49c862acc..e9af38d1d02 100644 --- a/src/ucp/core/ucp_request.inl +++ b/src/ucp/core/ucp_request.inl @@ -650,6 +650,9 @@ ucp_request_is_user_memh_valid(ucp_request_t *req, return 0; } + ucs_print("set user memh %p, gvmi %d imported %d", + param->memh, param->memh->peer_id, param->memh->imported); + ucs_assert(param->memh != NULL); /* For Coverity */ return 1; } @@ -664,8 +667,10 @@ ucp_send_request_set_user_memh(ucp_request_t *req, ucp_md_map_t md_map, req->send.length, req->send.datatype, (ucs_memory_type_t)req->send.mem_type, &status)) { + ucs_print("set send user 
memh failed %s", ucs_status_string(status)); return status; } + ucs_print("set send user memh ok mdmap 0x%lx", md_map); /* req->send.state.dt should not be used with protov2 */ ucs_assert(!req->send.ep->worker->context->config.ext.proto_enable); @@ -687,6 +692,7 @@ ucp_recv_request_set_user_memh(ucp_request_t *req, req->recv.mem_type, &status)) { return status; } + ucs_print("set recv user memh ok"); ucs_assert(!(req->flags & UCP_REQUEST_FLAG_USER_MEMH)); req->flags |= UCP_REQUEST_FLAG_USER_MEMH; From 00886282fb695fa6aa9df0e24376ebc1621ca2f0 Mon Sep 17 00:00:00 2001 From: Mikhail Brinskii Date: Tue, 22 Feb 2022 12:46:38 +0200 Subject: [PATCH 19/27] UCP: Fix import memh init --- src/ucp/core/ucp_mm.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/ucp/core/ucp_mm.c b/src/ucp/core/ucp_mm.c index 8c0be23b423..076ff330490 100644 --- a/src/ucp/core/ucp_mm.c +++ b/src/ucp/core/ucp_mm.c @@ -562,6 +562,12 @@ ucs_status_t ucp_memh_import(ucp_context_h context, ucp_rkey_h rkey, memh->md_map = md_map_registered; memh->imported = 1; + memh->mem_type = UCS_MEMORY_TYPE_HOST; + memh->super.super.start = (uintptr_t)address; + memh->super.super.end = (uintptr_t)address + length; + memh->alloc_md_index = UCP_NULL_RESOURCE; + memh->alloc_method = UCT_ALLOC_METHOD_LAST; + memh->peer_id = UCP_NULL_RESOURCE; *memh_p = memh; return UCS_OK; @@ -679,8 +685,8 @@ ucs_status_t ucp_mem_map(ucp_context_h context, const ucp_mem_map_params_t *para ucs_status_t ucp_mem_unmap(ucp_context_h context, ucp_mem_h memh) { UCP_THREAD_CS_ENTER(&context->mt_lock); - ucs_print("unreg memh %p, map md0x%lx, imported %d", - memh, memh->md_map, memh->imported); + ucs_print("unreg memh %p, map md0x%lx, imported %d gvmi %d", + memh, memh->md_map, memh->imported, memh->peer_id); if (memh->imported) { ucp_memh_dereg(context, memh, memh->md_map); ucs_free(memh); From 1f272f4ec2adf09925a0734c7699ced5fcc2a88b Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Tue, 22 Feb 2022 11:23:39 
+0000 Subject: [PATCH 20/27] WIP5: Cross gvmi alias key --- src/uct/ib/base/ib_md.c | 4 +- src/uct/ib/base/ib_md.h | 8 +- src/uct/ib/mlx5/dv/ib_mlx5_ifc.h | 135 +++++++++++-- src/uct/ib/mlx5/dv/ib_mlx5dv_md.c | 303 ++++++++++++++++++++++++++---- src/uct/ib/mlx5/ib_mlx5.h | 4 +- 5 files changed, 393 insertions(+), 61 deletions(-) diff --git a/src/uct/ib/base/ib_md.c b/src/uct/ib/base/ib_md.c index 4906d62dff9..565db6444b9 100644 --- a/src/uct/ib/base/ib_md.c +++ b/src/uct/ib/base/ib_md.c @@ -855,7 +855,7 @@ uct_ib_md_mem_reg_shared(uct_md_h uct_md, uct_md_mem_reg_shared_params_t *params ucs_status_t status; ib_memh = uct_ib_memh_alloc(md); - status = md->ops->reg_crossed_key(md, params->address, params->length, + status = md->ops->reg_shared_key(md, params->address, params->length, params->dest_gvmi, ib_memh); if (status != UCS_OK) { uct_ib_memh_free(ib_memh); @@ -877,7 +877,7 @@ uct_ib_md_import_shared_rkey(uct_md_h uct_md, ucs_status_t status; ib_memh = uct_ib_memh_alloc(md); - status = md->ops->reg_crossing_key(md, params->source_gvmi, + status = md->ops->import_shared_key(md, params->source_gvmi, uct_ib_md_direct_rkey(params->rkey), ib_memh); if (status != UCS_OK) { diff --git a/src/uct/ib/base/ib_md.h b/src/uct/ib/base/ib_md.h index 7242019e9bf..37aac38167c 100644 --- a/src/uct/ib/base/ib_md.h +++ b/src/uct/ib/base/ib_md.h @@ -339,7 +339,7 @@ typedef ucs_status_t (*uct_ib_md_get_atomic_mr_id_func_t)(uct_ib_md_t *md, * * @return UCS_OK on success or error code in case of failure. */ -typedef ucs_status_t (*uct_ib_md_reg_crossed_key_func_t)(uct_ib_md_t *ib_md, +typedef ucs_status_t (*uct_ib_md_reg_shared_key_func_t)(uct_ib_md_t *ib_md, void *address, size_t length, uint32_t allowed_gvmi_id, @@ -363,7 +363,7 @@ typedef ucs_status_t (*uct_ib_md_reg_crossed_key_func_t)(uct_ib_md_t *ib_md, * * @return UCS_OK on success or error code in case of failure. 
*/ -typedef ucs_status_t (*uct_ib_md_reg_crossing_key_func_t)(uct_ib_md_t *ib_md, +typedef ucs_status_t (*uct_ib_md_import_shared_key_func_t)(uct_ib_md_t *ib_md, uint32_t target_gvmi_id, uint32_t target_mkey, uct_ib_mem_t *ib_memh); @@ -379,8 +379,8 @@ typedef struct uct_ib_md_ops { uct_ib_md_dereg_multithreaded_func_t dereg_multithreaded; uct_ib_md_mem_prefetch_func_t mem_prefetch; uct_ib_md_get_atomic_mr_id_func_t get_atomic_mr_id; - uct_ib_md_reg_crossed_key_func_t reg_crossed_key; - uct_ib_md_reg_crossing_key_func_t reg_crossing_key; + uct_ib_md_reg_shared_key_func_t reg_shared_key; + uct_ib_md_import_shared_key_func_t import_shared_key; } uct_ib_md_ops_t; diff --git a/src/uct/ib/mlx5/dv/ib_mlx5_ifc.h b/src/uct/ib/mlx5/dv/ib_mlx5_ifc.h index d290ee525dd..ac6144f8075 100644 --- a/src/uct/ib/mlx5/dv/ib_mlx5_ifc.h +++ b/src/uct/ib/mlx5/dv/ib_mlx5_ifc.h @@ -80,7 +80,8 @@ enum { UCT_IB_MLX5_CMD_OP_QUERY_LAG = 0x842, UCT_IB_MLX5_CMD_OP_CREATE_GENERAL_OBJECT = 0xa00, UCT_IB_MLX5_CMD_OP_MODIFY_GENERAL_OBJECT = 0xa01, - UCT_IB_MLX5_CMD_OP_QUERY_GENERAL_OBJECT = 0xa02 + UCT_IB_MLX5_CMD_OP_QUERY_GENERAL_OBJECT = 0xa02, + UCT_IB_MLX5_CMD_OP_ALLOW_OTHER_VHCA_ACCESS = 0xb16 }; enum { @@ -96,7 +97,19 @@ enum { }; struct uct_ib_mlx5_cmd_hca_cap_bits { - uint8_t reserved_at_0[0x30]; + uint8_t access_other_hca_roce[0x1]; + uint8_t reserved_at_1[0x1e]; + uint8_t vhca_resource_manager[0x1]; + + uint8_t hca_cap_2[0x1]; + uint8_t create_lag_when_not_master_up[0x1]; + uint8_t dtor[0x1]; + uint8_t event_on_vhca_state_teardown_request[0x1]; + uint8_t event_on_vhca_state_in_use[0x1]; + uint8_t event_on_vhca_state_active[0x1]; + uint8_t event_on_vhca_state_allocated[0x1]; + uint8_t event_on_vhca_state_invalid[0x1]; + uint8_t transpose_max_element_size[0x8]; uint8_t vhca_id[0x10]; uint8_t reserved_at_40[0x40]; @@ -463,13 +476,40 @@ struct uct_ib_mlx5_atomic_caps_bits { struct uct_ib_mlx5_cmd_hca_cap_2_bits { uint8_t reserved_at_0[0x80]; - uint8_t reserved_at_80[0x13]; - /* Log (base 2) of 
the minimum bulk granularity of - allocated RESERVED_QPN objects */ + uint8_t reserved_at_80[0x3]; + uint8_t max_num_prog_sample_field[0x5]; + uint8_t reserved_at_88[0x3]; + uint8_t log_max_num_reserved_qpn[0x5]; + uint8_t atomic_rate_pa[0x1]; + uint8_t introspection_mkey_access_allowed[0x1]; + uint8_t reserved_at_92[0x1]; uint8_t log_reserved_qpn_granularity[0x5]; - uint8_t reserved_at_98[0x8]; + uint8_t reserved_at_98[0x3]; + uint8_t log_reserved_qpn_max_alloc[0x5]; - uint8_t reserved_at_a0[0x760]; + uint8_t max_reformat_insert_size[0x8]; + uint8_t max_reformat_insert_offset[0x8]; + uint8_t max_reformat_remove_size[0x8]; + uint8_t max_reformat_remove_offset[0x8]; + + uint8_t multi_sl_qp[0x1]; + uint8_t non_tunnel_reformat[0x1]; + uint8_t reserved_at_122[0x1]; + uint8_t log_min_stride_wqe_sz[0x5]; + uint8_t reserved_at_128[0x3]; + uint8_t log_conn_track_granularity[0x5]; + uint8_t reserved_at_130[0x3]; + uint8_t log_conn_track_max_alloc[0x5]; + uint8_t reserved_at_138[0x3]; + uint8_t log_max_conn_track_offload[0x5]; + + uint8_t cross_vhca_object_to_object_supported[0x20]; + + uint8_t allowed_object_for_other_vhca_access[0x40]; + + uint8_t introspection_mkey[0x20]; + + uint8_t reserved_at_220[0x6A0]; }; struct uct_ib_mlx5_odp_per_transport_service_cap_bits { @@ -1581,31 +1621,32 @@ enum { UCT_IB_MLX5_EVENT_TYPE_SRQ_LAST_WQE = 0x13 }; -struct uct_ib_mlx5_general_obj_out_cmd_hdr_bits { - uint8_t status[0x8]; - uint8_t reserved_at_8[0x18]; - - uint8_t syndrome[0x20]; - - uint8_t obj_id[0x20]; - - uint8_t reserved_at_60[0x20]; -}; - struct uct_ib_mlx5_general_obj_in_cmd_hdr_bits { uint8_t opcode[0x10]; uint8_t uid[0x10]; - uint8_t reserved_at_20[0x10]; + uint8_t vhca_tunnel_id[0x10]; uint8_t obj_type[0x10]; uint8_t obj_id[0x20]; - uint8_t reserved_at_60[0x3]; + uint8_t alias_object[0x1]; + uint8_t reserved_at_61[0x2]; uint8_t log_obj_range[0x5]; uint8_t reserved_at_68[0x18]; }; +struct uct_ib_mlx5_general_obj_out_cmd_hdr_bits { + uint8_t status[0x8]; + uint8_t 
reserved_at_8[0x18]; + + uint8_t syndrome[0x20]; + + uint8_t obj_id[0x20]; + + uint8_t reserved_at_60[0x20]; +}; + struct uct_ib_mlx5_reserved_qpn_bits { uint8_t reserved_at_0[0x80]; }; @@ -1617,6 +1658,60 @@ struct uct_ib_mlx5_create_reserved_qpn_in_bits { enum { UCT_IB_MLX5_OBJ_TYPE_RESERVED_QPN = 0x002C, + UCT_IB_MLX5_OBJ_TYPE_MKEY = 0xFF01, +}; + +struct uct_ib_mlx5_allow_other_vhca_access_in_bits { + uint8_t opcode[0x10]; + uint8_t uid[0x10]; + + uint8_t reserved_at_20[0x10]; + uint8_t op_mod[0x10]; + + uint8_t reserved_at_40[0x40]; + + uint8_t reserved_at_80[0x10]; + uint8_t object_type_to_be_accessed[0x10]; + + uint8_t object_id_to_be_accessed[0x20]; + + uint8_t reserved_at_a0[0x40]; + + uint8_t access_key[0x100]; +}; + +struct uct_ib_mlx5_allow_other_vhca_access_out_bits { + uint8_t status[0x8]; + uint8_t reserved_at_8[0x18]; + + uint8_t syndrome[0x20]; + + uint8_t reserved_at_40[0x40]; +}; + +struct uct_ib_mlx5_alias_context_bits { + uint8_t vhca_id_to_be_accessed[0x10]; + uint8_t reserved_at_10[0xd]; + uint8_t status[0x3]; + + uint8_t object_id_to_be_accessed[0x20]; + + uint8_t reserved_at_40[0x40]; + + uint8_t access_key[0x100]; + + uint8_t metadata_1[0x20]; + uint8_t metadata_2[0x60]; +}; + +struct uct_ib_mlx5_create_alias_obj_in_bits { + struct uct_ib_mlx5_general_obj_in_cmd_hdr_bits hdr; + struct uct_ib_mlx5_alias_context_bits alias_ctx; +}; + +struct uct_ib_mlx5_create_alias_obj_out_bits { + struct uct_ib_mlx5_general_obj_out_cmd_hdr_bits hdr; + struct uct_ib_mlx5_alias_context_bits alias_ctx; }; #endif diff --git a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c index 9a45f7103db..39eb316b0ff 100644 --- a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c +++ b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c @@ -28,8 +28,8 @@ typedef union uct_ib_mlx5_mr { typedef enum { UCT_IB_MLX5_MEM_REG, - UCT_IB_MLX5_MEM_CROSSED, - UCT_IB_MLX5_MEM_CROSSING + UCT_IB_MLX5_MEM_SHARED, + UCT_IB_MLX5_MEM_IMPORTED } uct_ib_mlx5_mem_type_t; typedef struct uct_ib_mlx5_mem { @@ 
-70,7 +70,7 @@ static ucs_status_t uct_ib_mlx5_dereg_key(uct_ib_md_t *md, switch (memh->type) { case UCT_IB_MLX5_MEM_REG: return uct_ib_dereg_mr(memh->mrs[mr_type].super.ib); - case UCT_IB_MLX5_MEM_CROSSED: + case UCT_IB_MLX5_MEM_SHARED: if (mr_type != UCT_IB_MR_DEFAULT) { return UCS_OK; } @@ -86,7 +86,7 @@ static ucs_status_t uct_ib_mlx5_dereg_key(uct_ib_md_t *md, return UCS_ERR_IO_ERROR; } return UCS_OK; - case UCT_IB_MLX5_MEM_CROSSING: + case UCT_IB_MLX5_MEM_IMPORTED: if (mr_type != UCT_IB_MR_DEFAULT) { return UCS_OK; } @@ -655,6 +655,47 @@ uct_ib_mlx5_devx_open_device(struct ibv_device *ibv_device) static uct_ib_md_ops_t uct_ib_mlx5_devx_md_ops; + +static int uct_ib_mlx5_is_xgvmi_alias_supported(struct ibv_context *ctx) +{ + char out[UCT_IB_MLX5DV_ST_SZ_BYTES(query_hca_cap_out)] = {}; + char in[UCT_IB_MLX5DV_ST_SZ_BYTES(query_hca_cap_in)] = {}; + uint64_t object_for_other_vhca; + uint32_t object_to_object; + void *cap; + int ret; + + cap = UCT_IB_MLX5DV_ADDR_OF(query_hca_cap_out, out, capability); + + /* query HCA CAP 2 */ + UCT_IB_MLX5DV_SET(query_hca_cap_in, in, opcode, + UCT_IB_MLX5_CMD_OP_QUERY_HCA_CAP); + UCT_IB_MLX5DV_SET(query_hca_cap_in, in, op_mod, + UCT_IB_MLX5_HCA_CAP_OPMOD_GET_CUR | + (UCT_IB_MLX5_CAP_2_GENERAL << 1)); + ret = mlx5dv_devx_general_cmd(ctx, in, sizeof(in), out, sizeof(out)); + if (ret != 0) { + if ((errno == EPERM) || (errno == EPROTONOSUPPORT) || + (errno == EOPNOTSUPP)) { + ucs_error("mlx5dv_devx_general_cmd(QUERY_HCA_CAP) failed: %m"); + } else { + ucs_error("mlx5dv_devx_general_cmd(QUERY_HCA_CAP) failed: %m"); + } + return 0; + } + + object_to_object = UCT_IB_MLX5DV_GET(cmd_hca_cap_2, cap, + cross_vhca_object_to_object_supported); + object_for_other_vhca = UCT_IB_MLX5DV_GET64( + cmd_hca_cap_2, cap, allowed_object_for_other_vhca_access); + + ucs_info("object_to_object=0x%x object_for_other_vhca=0x%lx", + object_to_object, object_for_other_vhca); + + return (object_to_object & UCS_BIT(8) /* Mkey */) && + (object_for_other_vhca & 
UCS_BIT(2) /* Mkey */); +} + static ucs_status_t uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device, const uct_ib_md_config_t *md_config, uct_ib_md_t **p_md) @@ -790,20 +831,7 @@ static ucs_status_t uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device, } md->super.vhca_id = UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, vhca_id); - // ucs_warn("%s: vhca_id is %d. at b4h: 0x%x; 0x%x", uct_ib_device_name(dev), - // UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, vhca_id), - // ntohl(*(uint32_t*)UCS_PTR_BYTE_OFFSET(cap, 0xB4)), - // ntohl(*(uint32_t*)UCS_PTR_BYTE_OFFSET(cap, 0xB4)) & (1<<13) - // ); - if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, - crossing_vhca_mkey)) { - ucs_info("%s: vhca_id=%d crossing_vhca_mkey is supported", - uct_ib_device_name(dev), md->super.vhca_id); - } else { - ucs_info("%s: vhca_id=%d crossing_vhca_mkey is not supported", - uct_ib_device_name(dev), md->super.vhca_id); - } md->super.extra_cap_flags |= UCT_MD_FLAG_SHARED_RKEY; status = uct_ib_mlx5_devx_check_odp(md, md_config, cap); @@ -859,6 +887,19 @@ static ucs_status_t uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device, md->super.ops = &uct_ib_mlx5_devx_md_ops; + if (uct_ib_mlx5_is_xgvmi_alias_supported(ctx)) { + md->flags |= UCT_IB_MLX5_MD_FLAG_CROSS_GVMI_ALIAS; + ucs_info("%s: vhca_id=%d cross gvmi alias mkey is supported", + uct_ib_device_name(dev), md->super.vhca_id); + } else if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, + crossing_vhca_mkey)) { + ucs_info("%s: vhca_id=%d crossing_vhca_mkey is supported", + uct_ib_device_name(dev), md->super.vhca_id); + } else { + ucs_info("%s: vhca_id=%d crossing_vhca_mkey is not supported", + uct_ib_device_name(dev), md->super.vhca_id); + } + uct_ib_mlx5_parse_relaxed_order(md, md_config); status = uct_ib_md_open_common(&md->super, ibv_device, md_config); if (status != UCS_OK) { @@ -907,11 +948,15 @@ static void uct_ib_mlx5_devx_md_cleanup(uct_ib_md_t *ibmd) } #define UCT_IB_CROSS_KEY_IDX 0xcc +#define UCT_IB_UMEM_ACCESS \ + (IBV_ACCESS_LOCAL_WRITE | 
IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE) +static const char *uct_ib_mkey_token = "SW Hackaton 2022"; +#define UCT_IB_TOKEN_SIZE (0x100 / 8) static ucs_status_t -uct_ib_mlx5_devx_reg_crossed_key(uct_ib_md_t *ib_md, void *address, - size_t length, uint32_t allowed_gvmi_id, - uct_ib_mem_t *ib_memh) +uct_ib_mlx5_devx_reg_shared_key_old(uct_ib_md_t *ib_md, void *address, + size_t length, uint32_t allowed_gvmi_id, + uct_ib_mem_t *ib_memh) { uct_ib_mlx5_md_t *md = ucs_derived_of(ib_md, uct_ib_mlx5_md_t); uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); @@ -920,15 +965,11 @@ uct_ib_mlx5_devx_reg_crossed_key(uct_ib_md_t *ib_md, void *address, void *mkc; ucs_status_t status; - ucs_print("reg key %p crossed", memh); - - memh->type = UCT_IB_MLX5_MEM_CROSSED; + memh->type = UCT_IB_MLX5_MEM_SHARED; ucs_print("umr_reg crosses address=%p length=%zu", address, length); - memh->umem = mlx5dv_devx_umem_reg(md->super.dev.ibv_context, address, length, - IBV_ACCESS_LOCAL_WRITE | - IBV_ACCESS_REMOTE_READ | - IBV_ACCESS_REMOTE_WRITE); + memh->umem = mlx5dv_devx_umem_reg(md->super.dev.ibv_context, address, + length, UCT_IB_UMEM_ACCESS); if (memh->umem == NULL) { ucs_error("mlx5dv_devx_umem_reg() failed: %m"); status = UCS_ERR_NO_MEMORY; @@ -983,8 +1024,10 @@ uct_ib_mlx5_devx_reg_crossed_key(uct_ib_md_t *ib_md, void *address, } static ucs_status_t -uct_ib_mlx5_devx_reg_crossing_key(uct_ib_md_t *ib_md, uint32_t target_gvmi_id, - uint32_t target_mkey, uct_ib_mem_t *ib_memh) +uct_ib_mlx5_devx_import_shared_key_old(uct_ib_md_t *ib_md, + uint32_t target_gvmi_id, + uint32_t target_mkey, + uct_ib_mem_t *ib_memh) { uct_ib_mlx5_md_t *md = ucs_derived_of(ib_md, uct_ib_mlx5_md_t); uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); @@ -996,7 +1039,7 @@ uct_ib_mlx5_devx_reg_crossing_key(uct_ib_md_t *ib_md, uint32_t target_gvmi_id, ucs_status_t status; ucs_print("reg key %p crossing", memh); - memh->type = UCT_IB_MLX5_MEM_CROSSING; + memh->type = 
UCT_IB_MLX5_MEM_IMPORTED; dv.pd.in = md->super.pd; dv.pd.out = &dvpd; @@ -1047,6 +1090,198 @@ uct_ib_mlx5_devx_reg_crossing_key(uct_ib_md_t *ib_md, uint32_t target_gvmi_id, return status; } +static ucs_status_t +uct_ib_mlx5_devx_reg_shared_key_alias(uct_ib_md_t *ib_md, void *address, + size_t length, uint32_t allowed_gvmi_id, + uct_ib_mem_t *ib_memh) +{ + uct_ib_mlx5_md_t *md = ucs_derived_of(ib_md, uct_ib_mlx5_md_t); + uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + + char in[UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_in)] = {0}; + char out[UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_out)] = {0}; + struct mlx5dv_pd dvpd = {0}; + struct mlx5dv_obj dv = {{0}}; + char ein[UCT_IB_MLX5DV_ST_SZ_BYTES(allow_other_vhca_access_in)] = {0}; + char eout[UCT_IB_MLX5DV_ST_SZ_BYTES(allow_other_vhca_access_out)] = {0}; + ucs_status_t status; + void *access_key; + void *mkc; + int rc; + + memh->type = UCT_IB_MLX5_MEM_SHARED; + + /* register umem */ + ucs_print("umr_reg crosses address=%p length=%zu", address, length); + memh->umem = mlx5dv_devx_umem_reg(md->super.dev.ibv_context, address, + length, UCT_IB_UMEM_ACCESS); + if (memh->umem == NULL) { + ucs_error("mlx5dv_devx_umem_reg() failed: %m"); + status = UCS_ERR_NO_MEMORY; + goto err_out; + } + + /* obtain pdn */ + dv.pd.in = md->super.pd; + dv.pd.out = &dvpd; + rc = mlx5dv_init_obj(&dv, MLX5DV_OBJ_PD); + if (rc) { + ucs_error("mlx5dv_init_obj() failed: %m"); + status = UCS_ERR_IO_ERROR; + goto err_free; + } + + /* create mkey */ + mkc = UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + UCT_IB_MLX5DV_SET(create_mkey_in, in, opcode, UCT_IB_MLX5_CMD_OP_CREATE_MKEY); + UCT_IB_MLX5DV_SET(create_mkey_in, in, translations_octword_actual_size, 1); + UCT_IB_MLX5DV_SET(create_mkey_in, in, pg_access, 1); + UCT_IB_MLX5DV_SET(create_mkey_in, in, mkey_umem_valid, 1); + UCT_IB_MLX5DV_SET(create_mkey_in, in, mkey_umem_id, memh->umem->umem_id); + UCT_IB_MLX5DV_SET(mkc, mkc, access_mode_1_0, 
UCT_IB_MLX5_MKC_ACCESS_MODE_MTT); + UCT_IB_MLX5DV_SET(mkc, mkc, a, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, rw, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, rr, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, lw, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, lr, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, crossing_target_mkey, 1); + UCT_IB_MLX5DV_SET(mkc, mkc, log_entity_size, 12); + UCT_IB_MLX5DV_SET(mkc, mkc, qpn, 0xffffff); + UCT_IB_MLX5DV_SET(mkc, mkc, pd, dvpd.pdn); + UCT_IB_MLX5DV_SET(mkc, mkc, mkey_7_0, UCT_IB_CROSS_KEY_IDX); + UCT_IB_MLX5DV_SET64(mkc, mkc, start_addr, (uintptr_t)address); + UCT_IB_MLX5DV_SET64(mkc, mkc, len, length); + + memh->cross_mr = mlx5dv_devx_obj_create(md->super.dev.ibv_context, in, + sizeof(in), out, sizeof(out)); + if (memh->cross_mr == NULL) { + ucs_error("mlx5dv_devx_obj_create() failed, syndrome %x: %m", + UCT_IB_MLX5DV_GET(create_mkey_out, out, syndrome)); + status = UCS_ERR_IO_ERROR; + goto err_free; + } + + memh->super.lkey = (UCT_IB_MLX5DV_GET(create_mkey_out, out, mkey_index) + << 8) | + UCT_IB_CROSS_KEY_IDX; + memh->super.rkey = memh->super.lkey; + + ucs_print("allowing access by token '%s'", uct_ib_mkey_token); + + UCT_IB_MLX5DV_SET(allow_other_vhca_access_in, ein, opcode, + UCT_IB_MLX5_CMD_OP_ALLOW_OTHER_VHCA_ACCESS); + UCT_IB_MLX5DV_SET(allow_other_vhca_access_in, ein, + object_type_to_be_accessed, UCT_IB_MLX5_OBJ_TYPE_MKEY); + UCT_IB_MLX5DV_SET(allow_other_vhca_access_in, ein, object_id_to_be_accessed, + memh->super.lkey >> 8); + access_key = UCT_IB_MLX5DV_ADDR_OF(allow_other_vhca_access_in, ein, access_key); + ucs_strncpy_zero(access_key, uct_ib_mkey_token, UCT_IB_TOKEN_SIZE); + + rc = mlx5dv_devx_general_cmd(md->super.dev.ibv_context, ein, sizeof(ein), + eout, sizeof(eout)); + if (rc) { + ucs_error("mlx5dv_devx_general_cmd() failed: %m"); + status = UCS_ERR_IO_ERROR; + goto err_destroy; + } + + return UCS_OK; + +err_destroy: + mlx5dv_devx_obj_destroy(memh->cross_mr); +err_free: + mlx5dv_devx_umem_dereg(memh->umem); +err_out: + return status; +} + +static ucs_status_t 
+uct_ib_mlx5_devx_import_shared_key_alias(uct_ib_md_t *ib_md, + uint32_t target_gvmi_id, + uint32_t target_mkey, + uct_ib_mem_t *ib_memh) +{ + uct_ib_mlx5_md_t *md = ucs_derived_of(ib_md, uct_ib_mlx5_md_t); + uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + char in[UCT_IB_MLX5DV_ST_SZ_BYTES(create_alias_obj_in)] = {0}; + char out[UCT_IB_MLX5DV_ST_SZ_BYTES(create_alias_obj_out)] = {0}; + struct mlx5dv_pd dvpd = {0}; + struct mlx5dv_obj dv = {{0}}; + void *hdr = UCT_IB_MLX5DV_ADDR_OF(create_alias_obj_in, in, hdr); + void *alias_ctx = UCT_IB_MLX5DV_ADDR_OF(create_alias_obj_in, in, alias_ctx); + ucs_status_t status; + void *access_key; + int rc; + + /* obtain pdn */ + dv.pd.in = md->super.pd; + dv.pd.out = &dvpd; + rc = mlx5dv_init_obj(&dv, MLX5DV_OBJ_PD); + if (rc) { + ucs_error("mlx5dv_init_obj() failed: %m"); + return UCS_ERR_IO_ERROR; + } + + /* create alias */ + UCT_IB_MLX5DV_SET(general_obj_in_cmd_hdr, hdr, opcode, + UCT_IB_MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + UCT_IB_MLX5DV_SET(general_obj_in_cmd_hdr, hdr, obj_type, UCT_IB_MLX5_OBJ_TYPE_MKEY); + UCT_IB_MLX5DV_SET(general_obj_in_cmd_hdr, hdr, alias_object, 1); + UCT_IB_MLX5DV_SET(alias_context, alias_ctx, vhca_id_to_be_accessed, + target_gvmi_id); + UCT_IB_MLX5DV_SET(alias_context, alias_ctx, object_id_to_be_accessed, + target_mkey >> 8); + UCT_IB_MLX5DV_SET(alias_context, alias_ctx, metadata_1, dvpd.pdn); + access_key = UCT_IB_MLX5DV_ADDR_OF(alias_context, alias_ctx, access_key); + ucs_strncpy_zero(access_key, uct_ib_mkey_token, UCT_IB_TOKEN_SIZE); + + memh->cross_mr = mlx5dv_devx_obj_create(md->super.dev.ibv_context, in, + sizeof(in), out, sizeof(out)); + if (memh->cross_mr == NULL) { + ucs_error("mlx5dv_devx_obj_create() failed, syndrome %x: %m", + UCT_IB_MLX5DV_GET(create_alias_obj_out, out, hdr.syndrome)); + status = UCT_IB_MLX5DV_GET(create_alias_obj_out, out, alias_ctx.status); + return status; + } + + memh->super.lkey = (UCT_IB_MLX5DV_GET(create_alias_obj_out, out, hdr.obj_id) + << 
8) | + UCT_IB_CROSS_KEY_IDX; + memh->super.rkey = memh->super.lkey; + return UCS_OK; +} + +static ucs_status_t +uct_ib_mlx5_devx_reg_shared_key(uct_ib_md_t *ib_md, void *address, + size_t length, uint32_t allowed_gvmi_id, + uct_ib_mem_t *ib_memh) +{ + uct_ib_mlx5_md_t *md = ucs_derived_of(ib_md, uct_ib_mlx5_md_t); + + return (md->flags & UCT_IB_MLX5_MD_FLAG_CROSS_GVMI_ALIAS) ? + uct_ib_mlx5_devx_reg_shared_key_alias(ib_md, address, length, + allowed_gvmi_id, + ib_memh) : + uct_ib_mlx5_devx_reg_shared_key_old(ib_md, address, length, + allowed_gvmi_id, + ib_memh); +} + +static ucs_status_t +uct_ib_mlx5_devx_import_shared_key(uct_ib_md_t *ib_md, uint32_t target_gvmi_id, + uint32_t target_mkey, uct_ib_mem_t *ib_memh) +{ + uct_ib_mlx5_md_t *md = ucs_derived_of(ib_md, uct_ib_mlx5_md_t); + + return (md->flags & UCT_IB_MLX5_MD_FLAG_CROSS_GVMI_ALIAS) ? + uct_ib_mlx5_devx_import_shared_key_alias(ib_md, + target_gvmi_id, + target_mkey, + ib_memh) : + uct_ib_mlx5_devx_import_shared_key_old(ib_md, target_gvmi_id, + target_mkey, ib_memh); +} + static uct_ib_md_ops_t uct_ib_mlx5_devx_md_ops = { .open = uct_ib_mlx5_devx_md_open, .cleanup = uct_ib_mlx5_devx_md_cleanup, @@ -1058,8 +1293,8 @@ static uct_ib_md_ops_t uct_ib_mlx5_devx_md_ops = { .dereg_multithreaded = uct_ib_mlx5_devx_dereg_multithreaded, .mem_prefetch = uct_ib_mlx5_mem_prefetch, .get_atomic_mr_id = uct_ib_mlx5_md_get_atomic_mr_id, - .reg_crossed_key = uct_ib_mlx5_devx_reg_crossed_key, - .reg_crossing_key = uct_ib_mlx5_devx_reg_crossing_key, + .reg_shared_key = uct_ib_mlx5_devx_reg_shared_key, + .import_shared_key = uct_ib_mlx5_devx_import_shared_key, }; UCT_IB_MD_OPS(uct_ib_mlx5_devx_md_ops, 2); @@ -1270,8 +1505,8 @@ static uct_ib_md_ops_t uct_ib_mlx5_md_ops = { .dereg_multithreaded = (uct_ib_md_dereg_multithreaded_func_t)ucs_empty_function_return_unsupported, .mem_prefetch = uct_ib_mlx5_mem_prefetch, .get_atomic_mr_id = (uct_ib_md_get_atomic_mr_id_func_t)ucs_empty_function_return_unsupported, - .reg_crossed_key = 
uct_ib_mlx5_devx_reg_crossed_key, - .reg_crossing_key = uct_ib_mlx5_devx_reg_crossing_key, + .reg_shared_key = uct_ib_mlx5_devx_reg_shared_key, + .import_shared_key = uct_ib_mlx5_devx_import_shared_key, }; UCT_IB_MD_OPS(uct_ib_mlx5_md_ops, 1); diff --git a/src/uct/ib/mlx5/ib_mlx5.h b/src/uct/ib/mlx5/ib_mlx5.h index 0cb4cb95c73..313136e850e 100644 --- a/src/uct/ib/mlx5/ib_mlx5.h +++ b/src/uct/ib/mlx5/ib_mlx5.h @@ -181,9 +181,11 @@ enum { UCT_IB_MLX5_MD_FLAG_CQE_V1 = UCS_BIT(8), /* Device supports first fragment indication for MP XRQ */ UCT_IB_MLX5_MD_FLAG_MP_XRQ_FIRST_MSG = UCS_BIT(9), + /* Device supports cross-gvmi alias */ + UCT_IB_MLX5_MD_FLAG_CROSS_GVMI_ALIAS = UCS_BIT(10), /* Object to be created by DevX */ - UCT_IB_MLX5_MD_FLAG_DEVX_OBJS_SHIFT = 10, + UCT_IB_MLX5_MD_FLAG_DEVX_OBJS_SHIFT = 11, UCT_IB_MLX5_MD_FLAG_DEVX_RC_QP = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(RCQP), UCT_IB_MLX5_MD_FLAG_DEVX_RC_SRQ = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(RCSRQ), UCT_IB_MLX5_MD_FLAG_DEVX_DCT = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(DCT), From 6985ccf0933a8d029238ea6121c250cc98f2ff25 Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Tue, 22 Feb 2022 13:33:15 +0200 Subject: [PATCH 21/27] WIP6: Fix error status --- src/uct/ib/mlx5/dv/ib_mlx5dv_md.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c index 39eb316b0ff..addfd295ac5 100644 --- a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c +++ b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c @@ -689,8 +689,8 @@ static int uct_ib_mlx5_is_xgvmi_alias_supported(struct ibv_context *ctx) object_for_other_vhca = UCT_IB_MLX5DV_GET64( cmd_hca_cap_2, cap, allowed_object_for_other_vhca_access); - ucs_info("object_to_object=0x%x object_for_other_vhca=0x%lx", - object_to_object, object_for_other_vhca); + ucs_print("object_to_object=0x%x object_for_other_vhca=0x%lx", + object_to_object, object_for_other_vhca); return (object_to_object & UCS_BIT(8) /* Mkey */) && (object_for_other_vhca & UCS_BIT(2) /* 
Mkey */); @@ -1209,7 +1209,6 @@ uct_ib_mlx5_devx_import_shared_key_alias(uct_ib_md_t *ib_md, struct mlx5dv_obj dv = {{0}}; void *hdr = UCT_IB_MLX5DV_ADDR_OF(create_alias_obj_in, in, hdr); void *alias_ctx = UCT_IB_MLX5DV_ADDR_OF(create_alias_obj_in, in, alias_ctx); - ucs_status_t status; void *access_key; int rc; @@ -1240,8 +1239,7 @@ uct_ib_mlx5_devx_import_shared_key_alias(uct_ib_md_t *ib_md, if (memh->cross_mr == NULL) { ucs_error("mlx5dv_devx_obj_create() failed, syndrome %x: %m", UCT_IB_MLX5DV_GET(create_alias_obj_out, out, hdr.syndrome)); - status = UCT_IB_MLX5DV_GET(create_alias_obj_out, out, alias_ctx.status); - return status; + return UCS_ERR_IO_ERROR; } memh->super.lkey = (UCT_IB_MLX5DV_GET(create_alias_obj_out, out, hdr.obj_id) From 9a2e2bc2af5ff3a5a118b8833e3857c9a246abde Mon Sep 17 00:00:00 2001 From: Mikhail Brinskii Date: Tue, 22 Feb 2022 15:14:12 +0200 Subject: [PATCH 22/27] UCT/UCP: Add SHARED rkey to MEM MD flags --- src/ucp/core/ucp_mm.c | 8 ++++---- src/uct/api/uct.h | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ucp/core/ucp_mm.c b/src/ucp/core/ucp_mm.c index 076ff330490..b8f83e11935 100644 --- a/src/ucp/core/ucp_mm.c +++ b/src/ucp/core/ucp_mm.c @@ -274,7 +274,7 @@ ucp_mem_map_params2uct_flags(const ucp_mem_map_params_t *params) } if (params->flags & UCP_MEM_MAP_SHARED) { - flags |= UCT_MD_FLAG_SHARED_RKEY; + flags |= UCT_MD_MEM_FLAG_SHARED_RKEY; } } @@ -316,7 +316,7 @@ static void ucp_memh_cleanup(ucp_context_h context, ucp_mem_h memh, size_t length, ucp_md_index_t md_index, unsigned uct_flags, ucs_status_t status) { - int shared_memh = uct_flags & UCT_MD_FLAG_SHARED_RKEY; + int shared_memh = uct_flags & UCT_MD_MEM_FLAG_SHARED_RKEY; ucs_log_level_t log_level; log_level = (uct_flags & UCT_MD_MEM_FLAG_HIDE_ERRORS) ? 
@@ -350,7 +350,7 @@ static ucs_status_t ucp_memh_register(ucp_context_h context, ucp_mem_h memh, ucs_for_each_bit(md_index, md_map) { md_attr = &context->tl_mds[md_index].attr; - if (uct_flags & UCT_MD_FLAG_SHARED_RKEY) { + if (uct_flags & UCT_MD_MEM_FLAG_SHARED_RKEY) { ucs_assert_always(memh->peer_id != UCP_NULL_RESOURCE); if (!(md_attr->cap.flags & UCT_MD_FLAG_SHARED_RKEY)) { @@ -385,7 +385,7 @@ static ucs_status_t ucp_memh_register(ucp_context_h context, ucp_mem_h memh, } ucs_print("Registered %s memh, mdmap 0x%lx (cur 0x%lx) gvmi %d", - ((uct_flags & UCT_MD_FLAG_SHARED_RKEY) ? "shared":""), + ((uct_flags & UCT_MD_MEM_FLAG_SHARED_RKEY) ? "shared":""), md_map_registered,memh->md_map, memh->peer_id); memh->md_map |= md_map_registered; diff --git a/src/uct/api/uct.h b/src/uct/api/uct.h index 509f0bba3cd..f10d2ab94a9 100644 --- a/src/uct/api/uct.h +++ b/src/uct/api/uct.h @@ -716,6 +716,7 @@ enum uct_md_mem_flags { In some cases registration failure is not an error (e. g. for merged memory regions). 
*/ + UCT_MD_MEM_FLAG_SHARED_RKEY = UCS_BIT(4), /**< register memory with a shared remote key */ /* memory access flags */ UCT_MD_MEM_ACCESS_REMOTE_PUT = UCS_BIT(5), /**< enable remote put access */ From 2bfce29b693a9527b15fb6fbeae5d8f88eb8bb17 Mon Sep 17 00:00:00 2001 From: Mikhail Brinskii Date: Tue, 22 Feb 2022 15:58:47 +0200 Subject: [PATCH 23/27] UCP: Fix mem_map for non-shared mem --- src/ucp/core/ucp_mm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ucp/core/ucp_mm.c b/src/ucp/core/ucp_mm.c index b8f83e11935..5f622c7d276 100644 --- a/src/ucp/core/ucp_mm.c +++ b/src/ucp/core/ucp_mm.c @@ -594,6 +594,7 @@ ucs_status_t ucp_mem_map(ucp_context_h context, const ucp_mem_map_params_t *para unsigned flags; void *address; ucp_rkey_h rkey; + ucp_rsc_index_t peer_id; /* always acquire context lock */ UCP_THREAD_CS_ENTER(&context->mt_lock); @@ -608,6 +609,7 @@ ucs_status_t ucp_mem_map(ucp_context_h context, const ucp_mem_map_params_t *para address = UCP_PARAM_VALUE(MEM_MAP, params, address, ADDRESS, NULL); flags = UCP_PARAM_VALUE(MEM_MAP, params, flags, FLAGS, 0); rkey = UCP_PARAM_VALUE(MEM_MAP, params, rkey, RKEY, NULL); + peer_id = UCP_PARAM_VALUE(MEM_MAP, params, peer_id, PEER_ID, UCP_NULL_RESOURCE); if ((flags & UCP_MEM_MAP_FIXED) && ((uintptr_t)address % ucs_get_page_size())) { @@ -669,12 +671,12 @@ ucs_status_t ucp_mem_map(ucp_context_h context, const ucp_mem_map_params_t *para if (ucp_mem_map_is_allocate(params)) { status = ucp_memh_alloc(context, address, params->length, memory_type, ucp_mem_map_params2uct_flags(params), - "user memory", rkey, params->peer_id, memh_p); + "user memory", rkey, peer_id, memh_p); } else { status = ucp_memh_get(context, address, params->length, memory_type, context->reg_md_map[memory_type], ucp_mem_map_params2uct_flags(params), - rkey, params->peer_id, memh_p); + rkey, peer_id, memh_p); } out: From b4877e00c273f505b7bfecb32a632f763c3357ce Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Tue, 22 Feb 2022 15:49:30 
+0200 Subject: [PATCH 24/27] WIP7: Limit page size in umem --- src/uct/ib/mlx5/dv/ib_mlx5dv_md.c | 49 ++++++++++++++++++++----------- test/gtest/ucp/test_ucp_am.cc | 5 +++- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c index addfd295ac5..c1c65c6bbdd 100644 --- a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c +++ b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c @@ -54,7 +54,7 @@ static ucs_status_t uct_ib_mlx5_reg_key(uct_ib_md_t *md, void *address, memh->type = UCT_IB_MLX5_MEM_REG; - ucs_print("reg key %p type %d", memh, mr_type); + // ucs_print("reg key %p type %d", memh, mr_type); return uct_ib_reg_key_impl(md, address, length, access_flags, ib_memh, &memh->mrs[mr_type].super, mr_type, silent); } @@ -66,7 +66,7 @@ static ucs_status_t uct_ib_mlx5_dereg_key(uct_ib_md_t *md, uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); int ret; - ucs_print("de-reg key %p type %d", memh, mr_type); + // ucs_print("de-reg key %p type %d", memh, mr_type); switch (memh->type) { case UCT_IB_MLX5_MEM_REG: return uct_ib_dereg_mr(memh->mrs[mr_type].super.ib); @@ -677,9 +677,9 @@ static int uct_ib_mlx5_is_xgvmi_alias_supported(struct ibv_context *ctx) if (ret != 0) { if ((errno == EPERM) || (errno == EPROTONOSUPPORT) || (errno == EOPNOTSUPP)) { - ucs_error("mlx5dv_devx_general_cmd(QUERY_HCA_CAP) failed: %m"); + ucs_info("mlx5dv_devx_general_cmd(QUERY_HCA_CAP) failed: %m"); } else { - ucs_error("mlx5dv_devx_general_cmd(QUERY_HCA_CAP) failed: %m"); + ucs_info("mlx5dv_devx_general_cmd(QUERY_HCA_CAP) failed: %m"); } return 0; } @@ -689,8 +689,8 @@ static int uct_ib_mlx5_is_xgvmi_alias_supported(struct ibv_context *ctx) object_for_other_vhca = UCT_IB_MLX5DV_GET64( cmd_hca_cap_2, cap, allowed_object_for_other_vhca_access); - ucs_print("object_to_object=0x%x object_for_other_vhca=0x%lx", - object_to_object, object_for_other_vhca); + // ucs_print("object_to_object=0x%x object_for_other_vhca=0x%lx", + // 
object_to_object, object_for_other_vhca); return (object_to_object & UCS_BIT(8) /* Mkey */) && (object_for_other_vhca & UCS_BIT(2) /* Mkey */); @@ -967,7 +967,7 @@ uct_ib_mlx5_devx_reg_shared_key_old(uct_ib_md_t *ib_md, void *address, memh->type = UCT_IB_MLX5_MEM_SHARED; - ucs_print("umr_reg crosses address=%p length=%zu", address, length); + // ucs_print("umr_reg crosses address=%p length=%zu", address, length); memh->umem = mlx5dv_devx_umem_reg(md->super.dev.ibv_context, address, length, UCT_IB_UMEM_ACCESS); if (memh->umem == NULL) { @@ -1012,7 +1012,7 @@ uct_ib_mlx5_devx_reg_shared_key_old(uct_ib_md_t *ib_md, void *address, UCT_IB_CROSS_KEY_IDX; memh->super.rkey = memh->super.lkey; - ucs_print("crossed mkey is %x", memh->super.lkey); + // ucs_print("crossed mkey is %x", memh->super.lkey); status = UCS_OK; @@ -1082,7 +1082,7 @@ uct_ib_mlx5_devx_import_shared_key_old(uct_ib_md_t *ib_md, UCT_IB_CROSS_KEY_IDX; memh->super.rkey = memh->super.lkey; - ucs_print("crossing mkey is %x", memh->super.lkey); + ucs_print("imported shared mkey %x", memh->super.lkey); status = UCS_OK; @@ -1104,6 +1104,7 @@ uct_ib_mlx5_devx_reg_shared_key_alias(uct_ib_md_t *ib_md, void *address, struct mlx5dv_obj dv = {{0}}; char ein[UCT_IB_MLX5DV_ST_SZ_BYTES(allow_other_vhca_access_in)] = {0}; char eout[UCT_IB_MLX5DV_ST_SZ_BYTES(allow_other_vhca_access_out)] = {0}; + struct mlx5dv_devx_umem_in umem_in; ucs_status_t status; void *access_key; void *mkc; @@ -1112,9 +1113,15 @@ uct_ib_mlx5_devx_reg_shared_key_alias(uct_ib_md_t *ib_md, void *address, memh->type = UCT_IB_MLX5_MEM_SHARED; /* register umem */ - ucs_print("umr_reg crosses address=%p length=%zu", address, length); - memh->umem = mlx5dv_devx_umem_reg(md->super.dev.ibv_context, address, - length, UCT_IB_UMEM_ACCESS); + ucs_print("uct_ib_mlx5_devx_reg_shared_key_alias(%s) address=%p length=%zu", + uct_ib_device_name(&md->super.dev), address, length); + + umem_in.addr = address; + umem_in.size = length; + umem_in.access = UCT_IB_UMEM_ACCESS; 
+ umem_in.pgsz_bitmap = UCS_BIT(12); + umem_in.comp_mask = 0; + memh->umem = mlx5dv_devx_umem_reg_ex(md->super.dev.ibv_context, &umem_in); if (memh->umem == NULL) { ucs_error("mlx5dv_devx_umem_reg() failed: %m"); status = UCS_ERR_NO_MEMORY; @@ -1138,6 +1145,7 @@ uct_ib_mlx5_devx_reg_shared_key_alias(uct_ib_md_t *ib_md, void *address, UCT_IB_MLX5DV_SET(create_mkey_in, in, pg_access, 1); UCT_IB_MLX5DV_SET(create_mkey_in, in, mkey_umem_valid, 1); UCT_IB_MLX5DV_SET(create_mkey_in, in, mkey_umem_id, memh->umem->umem_id); + UCT_IB_MLX5DV_SET64(create_mkey_in, in, mkey_umem_offset, 0); UCT_IB_MLX5DV_SET(mkc, mkc, access_mode_1_0, UCT_IB_MLX5_MKC_ACCESS_MODE_MTT); UCT_IB_MLX5DV_SET(mkc, mkc, a, 1); UCT_IB_MLX5DV_SET(mkc, mkc, rw, 1); @@ -1149,7 +1157,7 @@ uct_ib_mlx5_devx_reg_shared_key_alias(uct_ib_md_t *ib_md, void *address, UCT_IB_MLX5DV_SET(mkc, mkc, qpn, 0xffffff); UCT_IB_MLX5DV_SET(mkc, mkc, pd, dvpd.pdn); UCT_IB_MLX5DV_SET(mkc, mkc, mkey_7_0, UCT_IB_CROSS_KEY_IDX); - UCT_IB_MLX5DV_SET64(mkc, mkc, start_addr, (uintptr_t)address); + UCT_IB_MLX5DV_SET64(mkc, mkc, start_addr, (intptr_t)address); UCT_IB_MLX5DV_SET64(mkc, mkc, len, length); memh->cross_mr = mlx5dv_devx_obj_create(md->super.dev.ibv_context, in, @@ -1166,8 +1174,6 @@ uct_ib_mlx5_devx_reg_shared_key_alias(uct_ib_md_t *ib_md, void *address, UCT_IB_CROSS_KEY_IDX; memh->super.rkey = memh->super.lkey; - ucs_print("allowing access by token '%s'", uct_ib_mkey_token); - UCT_IB_MLX5DV_SET(allow_other_vhca_access_in, ein, opcode, UCT_IB_MLX5_CMD_OP_ALLOW_OTHER_VHCA_ACCESS); UCT_IB_MLX5DV_SET(allow_other_vhca_access_in, ein, @@ -1185,6 +1191,9 @@ uct_ib_mlx5_devx_reg_shared_key_alias(uct_ib_md_t *ib_md, void *address, goto err_destroy; } + ucs_print("allowed access to key 0x%x gvmi %d by token '%s'", + memh->super.lkey, md->super.vhca_id, uct_ib_mkey_token); + return UCS_OK; err_destroy: @@ -1224,7 +1233,8 @@ uct_ib_mlx5_devx_import_shared_key_alias(uct_ib_md_t *ib_md, /* create alias */ 
UCT_IB_MLX5DV_SET(general_obj_in_cmd_hdr, hdr, opcode, UCT_IB_MLX5_CMD_OP_CREATE_GENERAL_OBJECT); - UCT_IB_MLX5DV_SET(general_obj_in_cmd_hdr, hdr, obj_type, UCT_IB_MLX5_OBJ_TYPE_MKEY); + UCT_IB_MLX5DV_SET(general_obj_in_cmd_hdr, hdr, obj_type, + UCT_IB_MLX5_OBJ_TYPE_MKEY); UCT_IB_MLX5DV_SET(general_obj_in_cmd_hdr, hdr, alias_object, 1); UCT_IB_MLX5DV_SET(alias_context, alias_ctx, vhca_id_to_be_accessed, target_gvmi_id); @@ -1237,7 +1247,9 @@ uct_ib_mlx5_devx_import_shared_key_alias(uct_ib_md_t *ib_md, memh->cross_mr = mlx5dv_devx_obj_create(md->super.dev.ibv_context, in, sizeof(in), out, sizeof(out)); if (memh->cross_mr == NULL) { - ucs_error("mlx5dv_devx_obj_create() failed, syndrome %x: %m", + ucs_error("mlx5dv_devx_obj_create(shared_key_alias, tg_mkey=0x%x " + "tg_gvmi=%d) failed, syndrome %x: %m", + target_mkey, target_gvmi_id, UCT_IB_MLX5DV_GET(create_alias_obj_out, out, hdr.syndrome)); return UCS_ERR_IO_ERROR; } @@ -1246,6 +1258,9 @@ uct_ib_mlx5_devx_import_shared_key_alias(uct_ib_md_t *ib_md, << 8) | UCT_IB_CROSS_KEY_IDX; memh->super.rkey = memh->super.lkey; + + ucs_print("imported shared mkey %x", memh->super.lkey); + return UCS_OK; } diff --git a/test/gtest/ucp/test_ucp_am.cc b/test/gtest/ucp/test_ucp_am.cc index f9ca7080c83..25239548a69 100644 --- a/test/gtest/ucp/test_ucp_am.cc +++ b/test/gtest/ucp/test_ucp_am.cc @@ -1293,7 +1293,10 @@ class test_ucp_am_nbx_rndv : public test_ucp_am_nbx { UCP_MEM_MAP_PARAM_FIELD_FLAGS; mparams.address = NULL; mparams.length = length; - mparams.peer_id = 70; + mparams.peer_id = 0; + if (getenv("GVMI") != NULL) { + mparams.peer_id = atoi(getenv("GVMI")); + } mparams.flags = UCP_MEM_MAP_ALLOCATE | UCP_MEM_MAP_SHARED; ASSERT_UCS_OK(ucp_mem_map(sender().ucph(), &mparams, &memh)); From 475e6aa45356d016a9dd65ce966cc4973e0183df Mon Sep 17 00:00:00 2001 From: Mikhail Brinskii Date: Tue, 22 Feb 2022 16:32:25 +0200 Subject: [PATCH 25/27] UCP: Fix memh init for memmap reg --- src/ucp/core/ucp_mm.c | 1 + src/ucp/core/ucp_mm.inl | 2 
-- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ucp/core/ucp_mm.c b/src/ucp/core/ucp_mm.c index 5f622c7d276..a387d9c5c73 100644 --- a/src/ucp/core/ucp_mm.c +++ b/src/ucp/core/ucp_mm.c @@ -438,6 +438,7 @@ ucp_memh_get_slow(ucp_context_h context, void *address, size_t length, memh->mem_type = mem_type; memh->peer_id = peer_id; + memh->imported = 0; status = ucp_memh_register(context, memh, ~memh->md_map & reg_md_map, reg_address, reg_length, diff --git a/src/ucp/core/ucp_mm.inl b/src/ucp/core/ucp_mm.inl index 36c1f7a5134..0c4443b1ca9 100644 --- a/src/ucp/core/ucp_mm.inl +++ b/src/ucp/core/ucp_mm.inl @@ -38,8 +38,6 @@ ucp_memh_get(ucp_context_h context, void *address, size_t length, return status; } return UCS_OK; - } else { - (*memh_p)->imported = 0; } if (ucs_likely(context->rcache != NULL)) { From c3c646087f3eeb84f22e001f66a52b5c4b4e7bce Mon Sep 17 00:00:00 2001 From: dmitrygx Date: Tue, 22 Feb 2022 16:27:08 +0200 Subject: [PATCH 26/27] UCP/CORE: Don't dereg on MD which was used for allocation --- src/ucp/core/ucp_mm.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/ucp/core/ucp_mm.c b/src/ucp/core/ucp_mm.c index a387d9c5c73..84821cb19bb 100644 --- a/src/ucp/core/ucp_mm.c +++ b/src/ucp/core/ucp_mm.c @@ -298,6 +298,11 @@ void ucp_memh_dereg(ucp_context_h context, ucp_mem_h memh, ucp_md_map_t md_map) /* Unregister from all memory domains */ ucs_for_each_bit(md_index, md_map) { ucs_trace("de-registering memh[%d]=%p", md_index, memh->uct[md_index]); + + if (memh->alloc_md_index == md_index) { + continue; + } + ucs_assert(context->tl_mds[md_index].attr.cap.flags & UCT_MD_FLAG_REG); status = uct_md_mem_dereg(context->tl_mds[md_index].md, memh->uct[md_index]); From 7eb1dce9d8d9967559de5da6d292c18046e3a5b7 Mon Sep 17 00:00:00 2001 From: Mikhail Brinskii Date: Tue, 22 Feb 2022 19:02:49 +0200 Subject: [PATCH 27/27] Fix md_index for import --- src/ucp/core/ucp_mm.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git 
a/src/ucp/core/ucp_mm.c b/src/ucp/core/ucp_mm.c index 84821cb19bb..7d4daf352f7 100644 --- a/src/ucp/core/ucp_mm.c +++ b/src/ucp/core/ucp_mm.c @@ -527,6 +527,7 @@ ucs_status_t ucp_memh_import(ucp_context_h context, ucp_rkey_h rkey, uct_md_attr_t *md_attr; ucp_md_index_t md_index; ucs_status_t status; + int rkey_index; ucs_assert_always(rkey->peer_id != UCP_NULL_RESOURCE); @@ -540,8 +541,11 @@ ucs_status_t ucp_memh_import(ucp_context_h context, ucp_rkey_h rkey, ucs_for_each_bit(md_index, rkey->md_map) { md_attr = &context->tl_mds[md_index].attr; + // TODO map remote md_index to local md index + rkey_index = ucs_bitmap2idx(rkey->md_map, md_index); + ucs_assert_always(md_attr->cap.flags & UCT_MD_FLAG_SHARED_RKEY); - import_params.rkey = rkey->tl_rkey[md_index].rkey.rkey; + import_params.rkey = rkey->tl_rkey[rkey_index].rkey.rkey; import_params.source_gvmi = rkey->peer_id; ucs_print("registering address %p length %zu on md[%d]=%s gvmi %d rkey %lx",