66 * reserved.
77 * Copyright (c) 2020-2021 Cisco Systems, Inc. All rights reserved.
88 * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved.
9- * Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights
9+ * Copyright (c) 2021-2025 Amazon.com, Inc. or its affiliates. All rights
1010 * reserved.
1111 * Copyright (c) 2023 UT-Battelle, LLC. All rights reserved.
1212 * $COPYRIGHT$
4242extern opal_accelerator_base_module_t opal_accelerator ;
/*
 * Shared state for the common OFI component.  The fabric/domain handles and
 * their reference counts start out empty (NULL / 0); they are written by
 * opal_common_ofi_fi_fabric(), opal_common_ofi_fi_domain() and the matching
 * *_release() functions later in this file.
 */
4343opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL ,
4444 .prov_exclude = NULL ,
45- .output = -1 };
45+ .output = -1 ,
46+ .fabric = NULL ,
47+ .domain = NULL ,
48+ .fabric_ref_count = 0 ,
49+ .domain_ref_count = 0 };
/* NOTE(review): presumably the default for the provider exclude list; its use is not visible in this excerpt. */
4650static const char default_prov_exclude_list [] = "shm,sockets,tcp,udp,rstream,usnic,net" ;
/*
 * Held by the fabric/domain get and release functions in this file while they
 * read or update the handles and reference counts stored in opal_common_ofi.
 */
4751static opal_mutex_t opal_common_ofi_mutex = OPAL_MUTEX_STATIC_INIT ;
4852static int opal_common_ofi_verbose_level = 0 ;
@@ -1257,3 +1261,175 @@ OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *add
12571261 }
12581262 return ret ;
12591263}
1264+
1265+ /**
1266+ * Get or create fabric object
1267+ *
1268+ * Reuses existing fabric from fabric_attr->fabric if available,
1269+ * otherwise creates new fabric using fi_fabric().
1270+ *
1271+ * @param fabric_attr (IN) Fabric attributes
1272+ * @param fabric (OUT) Fabric object (new or existing)
1273+ * @param context (IN) Optional context
1274+ *
1275+ * @return OPAL_SUCCESS or OPAL error code
1276+ */
1277+ int opal_common_ofi_fi_fabric (struct fi_fabric_attr * fabric_attr ,
1278+ struct fid_fabric * * fabric ,
1279+ void * context )
1280+ {
1281+ int ret ;
1282+
1283+ if (!fabric_attr || !fabric ) {
1284+ return OPAL_ERR_BAD_PARAM ;
1285+ }
1286+
1287+ OPAL_THREAD_LOCK (& opal_common_ofi_mutex );
1288+
1289+ if (fabric_attr -> fabric ) {
1290+ * fabric = fabric_attr -> fabric ;
1291+ opal_common_ofi .fabric_ref_count ++ ;
1292+ opal_output_verbose (1 , opal_common_ofi .output , "Reusing existing fabric: %s" ,
1293+ fabric_attr -> name );
1294+ } else {
1295+ ret = fi_fabric (fabric_attr , fabric , context );
1296+ if (0 != ret ) {
1297+ opal_show_help ("help-mtl-ofi.txt" , "OFI call fail" , true,
1298+ "fi_fabric" ,
1299+ opal_process_info .nodename , __FILE__ , __LINE__ ,
1300+ fi_strerror (- ret ), - ret );
1301+ OPAL_THREAD_UNLOCK (& opal_common_ofi_mutex );
1302+ return OPAL_ERROR ;
1303+ }
1304+ opal_common_ofi .fabric = * fabric ;
1305+ opal_common_ofi .fabric_ref_count = 1 ;
1306+ }
1307+
1308+ OPAL_THREAD_UNLOCK (& opal_common_ofi_mutex );
1309+ return OPAL_SUCCESS ;
1310+ }
1311+
1312+ /**
1313+ * Get or create domain object
1314+ *
1315+ * Reuses existing domain from info->domain_attr->domain if available,
1316+ * otherwise creates new domain using fi_domain().
1317+ *
1318+ * @param fabric (IN) Fabric object
1319+ * @param info (IN) Provider info
1320+ * @param domain (OUT) Domain object (new or existing)
1321+ * @param context (IN) Optional context
1322+ *
1323+ * @return OPAL_SUCCESS or OPAL error code
1324+ */
1325+ int opal_common_ofi_fi_domain (struct fid_fabric * fabric , struct fi_info * info ,
1326+ struct fid_domain * * domain , void * context )
1327+ {
1328+ int ret ;
1329+
1330+ if (!info || !fabric || !domain || !info -> domain_attr ) {
1331+ return OPAL_ERR_BAD_PARAM ;
1332+ }
1333+
1334+ OPAL_THREAD_LOCK (& opal_common_ofi_mutex );
1335+
1336+ if (info -> domain_attr -> domain ) {
1337+ * domain = info -> domain_attr -> domain ;
1338+ opal_common_ofi .domain_ref_count ++ ;
1339+ opal_output_verbose (1 , opal_common_ofi .output , "Reusing existing domain: %s" ,
1340+ info -> domain_attr -> name );
1341+ } else {
1342+ ret = fi_domain (fabric , info , domain , context );
1343+ if (0 != ret ) {
1344+ opal_show_help ("help-mtl-ofi.txt" , "OFI call fail" , true,
1345+ "fi_domain" ,
1346+ opal_process_info .nodename , __FILE__ , __LINE__ ,
1347+ fi_strerror (- ret ), - ret );
1348+ OPAL_THREAD_UNLOCK (& opal_common_ofi_mutex );
1349+ return OPAL_ERROR ;
1350+ }
1351+ opal_common_ofi .domain = * domain ;
1352+ opal_common_ofi .domain_ref_count = 1 ;
1353+ }
1354+
1355+ OPAL_THREAD_UNLOCK (& opal_common_ofi_mutex );
1356+ return OPAL_SUCCESS ;
1357+ }
1358+
1359+ /**
1360+ * Release fabric reference
1361+ *
1362+ * Decrements fabric reference count and closes fabric if count reaches zero.
1363+ *
1364+ * @param fabric (IN) Fabric object to release
1365+ *
1366+ * @return OPAL_SUCCESS or OPAL error code
1367+ */
1368+ int opal_common_ofi_fabric_release (struct fid_fabric * fabric )
1369+ {
1370+ int ret = OPAL_SUCCESS ;
1371+
1372+ if (!fabric ) {
1373+ return OPAL_ERR_BAD_PARAM ;
1374+ }
1375+
1376+ OPAL_THREAD_LOCK (& opal_common_ofi_mutex );
1377+
1378+ if (fabric == opal_common_ofi .fabric && opal_common_ofi .fabric_ref_count > 0 ) {
1379+ opal_common_ofi .fabric_ref_count -- ;
1380+ if (opal_common_ofi .fabric_ref_count == 0 ) {
1381+ ret = fi_close (& fabric -> fid );
1382+ if (0 != ret ) {
1383+ ret = OPAL_ERROR ;
1384+ }
1385+ opal_common_ofi .fabric = NULL ;
1386+ }
1387+ } else {
1388+ ret = fi_close (& fabric -> fid );
1389+ if (0 != ret ) {
1390+ ret = OPAL_ERROR ;
1391+ }
1392+ }
1393+
1394+ OPAL_THREAD_UNLOCK (& opal_common_ofi_mutex );
1395+ return ret ;
1396+ }
1397+
1398+ /**
1399+ * Release domain reference
1400+ *
1401+ * Decrements domain reference count and closes domain if count reaches zero.
1402+ *
1403+ * @param domain (IN) Domain object to release
1404+ *
1405+ * @return OPAL_SUCCESS or OPAL error code
1406+ */
1407+ int opal_common_ofi_domain_release (struct fid_domain * domain )
1408+ {
1409+ int ret = OPAL_SUCCESS ;
1410+
1411+ if (!domain ) {
1412+ return OPAL_ERR_BAD_PARAM ;
1413+ }
1414+
1415+ OPAL_THREAD_LOCK (& opal_common_ofi_mutex );
1416+
1417+ if (domain == opal_common_ofi .domain && opal_common_ofi .domain_ref_count > 0 ) {
1418+ opal_common_ofi .domain_ref_count -- ;
1419+ if (opal_common_ofi .domain_ref_count == 0 ) {
1420+ ret = fi_close (& domain -> fid );
1421+ if (0 != ret ) {
1422+ ret = OPAL_ERROR ;
1423+ }
1424+ opal_common_ofi .domain = NULL ;
1425+ }
1426+ } else {
1427+ ret = fi_close (& domain -> fid );
1428+ if (0 != ret ) {
1429+ ret = OPAL_ERROR ;
1430+ }
1431+ }
1432+
1433+ OPAL_THREAD_UNLOCK (& opal_common_ofi_mutex );
1434+ return ret ;
1435+ }
0 commit comments