Skip to content

Commit 7280996

Browse files
fix: Update handles upon device reset
Outdated info that remains in several modules after a device reset results in a crash when module info is retrieved after the reset. Release and re-create the module handles during reset to avoid the crash. Related-To: NEO-9100 Signed-off-by: Bellekallu Rajkiran <[email protected]> Source: 84cdf47
1 parent 99eab71 commit 7280996

File tree

24 files changed

+281
-7
lines changed

24 files changed

+281
-7
lines changed

level_zero/sysman/source/linux/zes_os_sysman_imp.cpp

+12
Original file line number | Diff line number | Diff line change
@@ -302,6 +302,9 @@ ze_result_t LinuxSysmanImp::gpuProcessCleanup() {
302302
void LinuxSysmanImp::releaseSysmanDeviceResources() {
303303
getSysmanDeviceImp()->pEngineHandleContext->releaseEngines();
304304
getSysmanDeviceImp()->pRasHandleContext->releaseRasHandles();
305+
getSysmanDeviceImp()->pMemoryHandleContext->releaseMemoryHandles();
306+
getSysmanDeviceImp()->pTempHandleContext->releaseTemperatureHandles();
307+
getSysmanDeviceImp()->pPowerHandleContext->releasePowerHandles();
305308
if (!diagnosticsReset) {
306309
getSysmanDeviceImp()->pDiagnosticsHandleContext->releaseDiagnosticsHandles();
307310
}
@@ -335,6 +338,15 @@ ze_result_t LinuxSysmanImp::reInitSysmanDeviceResources() {
335338
if (getSysmanDeviceImp()->pFirmwareHandleContext->isFirmwareInitDone()) {
336339
getSysmanDeviceImp()->pFirmwareHandleContext->init();
337340
}
341+
if (getSysmanDeviceImp()->pMemoryHandleContext->isMemoryInitDone()) {
342+
getSysmanDeviceImp()->pMemoryHandleContext->init(getSubDeviceCount());
343+
}
344+
if (getSysmanDeviceImp()->pTempHandleContext->isTempInitDone()) {
345+
getSysmanDeviceImp()->pTempHandleContext->init(getSubDeviceCount());
346+
}
347+
if (getSysmanDeviceImp()->pPowerHandleContext->isPowerInitDone()) {
348+
getSysmanDeviceImp()->pPowerHandleContext->init(getSubDeviceCount());
349+
}
338350
return ZE_RESULT_SUCCESS;
339351
}
340352

level_zero/sysman/source/memory/sysman_memory.cpp

+8-1
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,13 @@
1414
namespace L0 {
1515
namespace Sysman {
1616

17-
MemoryHandleContext::~MemoryHandleContext() = default;
17+
MemoryHandleContext::~MemoryHandleContext() {
18+
releaseMemoryHandles();
19+
}
20+
21+
void MemoryHandleContext::releaseMemoryHandles() {
22+
handleList.clear();
23+
}
1824

1925
void MemoryHandleContext::createHandle(bool onSubdevice, uint32_t subDeviceId) {
2026
std::unique_ptr<Memory> pMemory = std::make_unique<MemoryImp>(pOsSysman, onSubdevice, subDeviceId);
@@ -39,6 +45,7 @@ ze_result_t MemoryHandleContext::init(uint32_t subDeviceCount) {
3945
ze_result_t MemoryHandleContext::memoryGet(uint32_t *pCount, zes_mem_handle_t *phMemory) {
4046
std::call_once(initMemoryOnce, [this]() {
4147
this->init(pOsSysman->getSubDeviceCount());
48+
this->memoryInitDone = true;
4249
});
4350
uint32_t handleListSize = static_cast<uint32_t>(handleList.size());
4451
uint32_t numToCopy = std::min(*pCount, handleListSize);

level_zero/sysman/source/memory/sysman_memory.h

+6
Original file line number | Diff line number | Diff line change
@@ -38,13 +38,19 @@ struct MemoryHandleContext {
3838
ze_result_t init(uint32_t subDeviceCount);
3939

4040
ze_result_t memoryGet(uint32_t *pCount, zes_mem_handle_t *phMemory);
41+
void releaseMemoryHandles();
4142

4243
OsSysman *pOsSysman = nullptr;
4344
std::vector<std::unique_ptr<Memory>> handleList = {};
4445

46+
bool isMemoryInitDone() {
47+
return memoryInitDone;
48+
}
49+
4550
private:
4651
void createHandle(bool onSubdevice, uint32_t subDeviceId);
4752
std::once_flag initMemoryOnce;
53+
bool memoryInitDone = false;
4854
};
4955

5056
} // namespace Sysman

level_zero/sysman/source/power/sysman_power.cpp

+7-1
Original file line number | Diff line number | Diff line change
@@ -15,10 +15,15 @@
1515
namespace L0 {
1616
namespace Sysman {
1717

18-
PowerHandleContext::~PowerHandleContext() {
18+
void PowerHandleContext::releasePowerHandles() {
1919
for (Power *pPower : handleList) {
2020
delete pPower;
2121
}
22+
handleList.clear();
23+
}
24+
25+
PowerHandleContext::~PowerHandleContext() {
26+
releasePowerHandles();
2227
}
2328

2429
void PowerHandleContext::createHandle(ze_bool_t isSubDevice, uint32_t subDeviceId) {
@@ -43,6 +48,7 @@ ze_result_t PowerHandleContext::init(uint32_t subDeviceCount) {
4348
void PowerHandleContext::initPower() {
4449
std::call_once(initPowerOnce, [this]() {
4550
this->init(pOsSysman->getSubDeviceCount());
51+
this->powerInitDone = true;
4652
});
4753
}
4854

level_zero/sysman/source/power/sysman_power.h

+7
Original file line number | Diff line number | Diff line change
@@ -42,13 +42,20 @@ struct PowerHandleContext {
4242
ze_result_t powerGet(uint32_t *pCount, zes_pwr_handle_t *phPower);
4343
ze_result_t powerGetCardDomain(zes_pwr_handle_t *phPower);
4444

45+
void releasePowerHandles();
46+
4547
OsSysman *pOsSysman = nullptr;
4648
std::vector<Power *> handleList = {};
4749

50+
bool isPowerInitDone() {
51+
return powerInitDone;
52+
}
53+
4854
private:
4955
void createHandle(ze_bool_t isSubDevice, uint32_t subDeviceId);
5056
std::once_flag initPowerOnce;
5157
void initPower();
58+
bool powerInitDone = false;
5259
};
5360

5461
} // namespace Sysman

level_zero/sysman/source/temperature/sysman_temperature.cpp

+8-1
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,13 @@
1313
namespace L0 {
1414
namespace Sysman {
1515

16-
TemperatureHandleContext::~TemperatureHandleContext() {}
16+
TemperatureHandleContext::~TemperatureHandleContext() {
17+
releaseTemperatureHandles();
18+
}
19+
20+
void TemperatureHandleContext::releaseTemperatureHandles() {
21+
handleList.clear();
22+
}
1723

1824
void TemperatureHandleContext::createHandle(bool onSubdevice, uint32_t subDeviceId, zes_temp_sensors_t type) {
1925
std::unique_ptr<Temperature> pTemperature = std::make_unique<TemperatureImp>(pOsSysman, onSubdevice, subDeviceId, type);
@@ -42,6 +48,7 @@ ze_result_t TemperatureHandleContext::init(uint32_t subDeviceCount) {
4248
ze_result_t TemperatureHandleContext::temperatureGet(uint32_t *pCount, zes_temp_handle_t *phTemperature) {
4349
std::call_once(initTemperatureOnce, [this]() {
4450
this->init(pOsSysman->getSubDeviceCount());
51+
this->tempInitDone = true;
4552
});
4653
uint32_t handleListSize = static_cast<uint32_t>(handleList.size());
4754
uint32_t numToCopy = std::min(*pCount, handleListSize);

level_zero/sysman/source/temperature/sysman_temperature.h

+6
Original file line number | Diff line number | Diff line change
@@ -39,13 +39,19 @@ struct TemperatureHandleContext {
3939
ze_result_t init(uint32_t subDeviceCount);
4040

4141
ze_result_t temperatureGet(uint32_t *pCount, zes_temp_handle_t *phTemperature);
42+
void releaseTemperatureHandles();
4243

4344
OsSysman *pOsSysman = nullptr;
4445
std::vector<std::unique_ptr<Temperature>> handleList = {};
4546

47+
bool isTempInitDone() {
48+
return tempInitDone;
49+
}
50+
4651
private:
4752
void createHandle(bool onSubdevice, uint32_t subDeviceId, zes_temp_sensors_t type);
4853
std::once_flag initTemperatureOnce;
54+
bool tempInitDone = false;
4955
};
5056

5157
} // namespace Sysman

level_zero/sysman/test/unit_tests/sources/memory/linux/test_sysman_memory_prelim.cpp

+36
Original file line number | Diff line number | Diff line change
@@ -403,6 +403,42 @@ TEST_F(SysmanDeviceMemoryFixture, GivenValidMemoryHandleWhenCallingZetSysmanMemo
403403
}
404404
}
405405

406+
TEST_F(SysmanDeviceMemoryFixture, GivenSysmanResourcesAreReleasedAndReInitializedWhenCallingZesSysmanMemoryGetStateThenVerifySysmanMemoryGetStateCallSucceeds) {
407+
pLinuxSysmanImp->releaseSysmanDeviceResources();
408+
EXPECT_EQ(ZE_RESULT_SUCCESS, pLinuxSysmanImp->reInitSysmanDeviceResources());
409+
410+
VariableBackup<std::map<uint32_t, L0::Sysman::PlatformMonitoringTech *>> pmtBackup(&pLinuxSysmanImp->mapOfSubDeviceIdToPmtObject);
411+
pLinuxSysmanImp->mapOfSubDeviceIdToPmtObject.clear();
412+
auto subdeviceId = 0u;
413+
auto subDeviceCount = pLinuxSysmanImp->getSubDeviceCount();
414+
do {
415+
ze_bool_t onSubdevice = subDeviceCount == 0 ? false : true;
416+
auto pPmt = new MockMemoryPmt(pFsAccess.get(), onSubdevice,
417+
subdeviceId);
418+
pLinuxSysmanImp->mapOfSubDeviceIdToPmtObject.emplace(subdeviceId, pPmt);
419+
} while (++subdeviceId < subDeviceCount);
420+
421+
VariableBackup<L0::Sysman::FirmwareUtil *> backup(&pLinuxSysmanImp->pFwUtilInterface);
422+
pLinuxSysmanImp->pFwUtilInterface = new MockFwUtilInterface();
423+
424+
auto handles = getMemoryHandles(memoryHandleComponentCount);
425+
426+
for (auto handle : handles) {
427+
zes_mem_state_t state;
428+
429+
ze_result_t result = zesMemoryGetState(handle, &state);
430+
431+
EXPECT_EQ(result, ZE_RESULT_SUCCESS);
432+
EXPECT_EQ(state.health, ZES_MEM_HEALTH_OK);
433+
EXPECT_EQ(state.size, NEO::probedSizeRegionOne);
434+
EXPECT_EQ(state.free, NEO::unallocatedSizeRegionOne);
435+
}
436+
437+
pLinuxSysmanImp->releasePmtObject();
438+
delete pLinuxSysmanImp->pFwUtilInterface;
439+
pLinuxSysmanImp->pFwUtilInterface = nullptr;
440+
}
441+
406442
TEST_F(SysmanDeviceMemoryFixture, GivenValidMemoryHandleWhenCallingzesSysmanMemoryGetBandwidthWhenPmtObjectIsNullThenFailureRetuned) {
407443
for (auto &subDeviceIdToPmtEntry : pLinuxSysmanImp->mapOfSubDeviceIdToPmtObject) {
408444
if (subDeviceIdToPmtEntry.second != nullptr) {

level_zero/sysman/test/unit_tests/sources/power/linux/test_zes_power_prelim.cpp

+17
Original file line number | Diff line number | Diff line change
@@ -204,6 +204,23 @@ TEST_F(SysmanDevicePowerFixture, GivenValidPowerHandleWhenGettingPowerEnergyCoun
204204
}
205205
}
206206

207+
TEST_F(SysmanDevicePowerFixture, GivenValidPowerHandleAndHandleCountZeroWhenCallingReInitThenValidCountIsReturnedAndVerifyzesDeviceEnumPowerHandleSucceeds) {
208+
uint32_t count = 0;
209+
EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumPowerDomains(device->toHandle(), &count, NULL));
210+
EXPECT_EQ(count, powerHandleComponentCount);
211+
212+
for (auto handle : pSysmanDeviceImp->pPowerHandleContext->handleList) {
213+
delete handle;
214+
}
215+
pSysmanDeviceImp->pPowerHandleContext->handleList.clear();
216+
217+
pLinuxSysmanImp->reInitSysmanDeviceResources();
218+
219+
count = 0;
220+
EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumPowerDomains(device->toHandle(), &count, NULL));
221+
EXPECT_EQ(count, powerHandleComponentCount);
222+
}
223+
207224
TEST_F(SysmanDevicePowerFixture, GivenSetPowerLimitsWhenGettingPowerLimitsWhenHwmonInterfaceExistThenLimitsSetEarlierAreRetrieved) {
208225
auto handles = getPowerHandles(powerHandleComponentCount);
209226
for (auto handle : handles) {

level_zero/sysman/test/unit_tests/sources/power/windows/test_zes_sysman_power.cpp

+10
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,16 @@ TEST_F(SysmanDevicePowerFixture, GivenComponentCountZeroWhenEnumeratingPowerDoma
5757
EXPECT_EQ(count, powerHandleComponentCount);
5858
}
5959

60+
TEST_F(SysmanDevicePowerFixture, GivenPowerDomainsAreEnumeratedWhenCallingIsPowerInitCompletedThenVerifyPowerInitializationIsCompleted) {
61+
init(true);
62+
63+
uint32_t count = 0;
64+
EXPECT_EQ(zesDeviceEnumPowerDomains(pSysmanDevice->toHandle(), &count, nullptr), ZE_RESULT_SUCCESS);
65+
EXPECT_EQ(count, powerHandleComponentCount);
66+
67+
EXPECT_EQ(true, pSysmanDeviceImp->pPowerHandleContext->isPowerInitDone());
68+
}
69+
6070
TEST_F(SysmanDevicePowerFixture, GivenInvalidComponentCountWhenEnumeratingPowerDomainThenValidCountIsReturnedAndVerifySysmanPowerGetCallSucceeds) {
6171
init(true);
6272

level_zero/sysman/test/unit_tests/sources/temperature/linux/test_zes_temperature.cpp

+14
Original file line number | Diff line number | Diff line change
@@ -196,6 +196,20 @@ class SysmanDeviceTemperatureFixture : public SysmanDeviceFixture {
196196
}
197197
};
198198

199+
HWTEST2_F(SysmanDeviceTemperatureFixture, GivenValidPowerHandleAndHandleCountZeroWhenCallingReInitThenValidCountIsReturnedAndVerifyzesDeviceEnumPowerHandleSucceeds, IsPVC) {
200+
uint32_t count = 0;
201+
EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumTemperatureSensors(device->toHandle(), &count, NULL));
202+
EXPECT_EQ(count, handleComponentCountForSingleTileDevice);
203+
204+
pSysmanDeviceImp->pTempHandleContext->handleList.clear();
205+
206+
pLinuxSysmanImp->reInitSysmanDeviceResources();
207+
208+
count = 0;
209+
EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceEnumTemperatureSensors(device->toHandle(), &count, NULL));
210+
EXPECT_EQ(count, handleComponentCountForSingleTileDevice);
211+
}
212+
199213
HWTEST2_F(SysmanDeviceTemperatureFixture, GivenValidTempHandleWhenGettingGPUAndGlobalTemperatureThenValidTemperatureReadingsRetrieved, IsDG1) {
200214
auto handles = getTempHandles(handleComponentCountForNoSubDevices);
201215
for (auto &handle : handles) {

level_zero/sysman/test/unit_tests/sources/temperature/windows/test_zes_temperature.cpp

+8
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,14 @@ TEST_F(SysmanDeviceTemperatureFixture, GivenComponentCountZeroWhenEnumeratingTem
4848
EXPECT_EQ(count, temperatureHandleComponentCount);
4949
}
5050

51+
TEST_F(SysmanDeviceTemperatureFixture, GivenTempDomainsAreEnumeratedWhenCallingIsTempInitCompletedThenVerifyTempInitializationIsCompleted) {
52+
uint32_t count = 0;
53+
EXPECT_EQ(zesDeviceEnumTemperatureSensors(pSysmanDevice->toHandle(), &count, nullptr), ZE_RESULT_SUCCESS);
54+
EXPECT_EQ(count, temperatureHandleComponentCount);
55+
56+
EXPECT_EQ(true, pSysmanDeviceImp->pTempHandleContext->isTempInitDone());
57+
}
58+
5159
TEST_F(SysmanDeviceTemperatureFixture, GivenInvalidComponentCountWhenEnumeratingTemperatureSensorsThenValidCountIsReturnedAndVerifySysmanPowerGetCallSucceeds) {
5260
uint32_t count = 0;
5361
EXPECT_EQ(zesDeviceEnumTemperatureSensors(pSysmanDevice->toHandle(), &count, nullptr), ZE_RESULT_SUCCESS);

level_zero/tools/source/sysman/linux/os_sysman_imp.cpp

+12
Original file line number | Diff line number | Diff line change
@@ -293,6 +293,9 @@ ze_result_t LinuxSysmanImp::gpuProcessCleanup() {
293293
void LinuxSysmanImp::releaseSysmanDeviceResources() {
294294
getSysmanDeviceImp()->pEngineHandleContext->releaseEngines();
295295
getSysmanDeviceImp()->pRasHandleContext->releaseRasHandles();
296+
getSysmanDeviceImp()->pMemoryHandleContext->releaseMemoryHandles();
297+
getSysmanDeviceImp()->pTempHandleContext->releaseTemperatureHandles();
298+
getSysmanDeviceImp()->pPowerHandleContext->releasePowerHandles();
296299
if (!diagnosticsReset) {
297300
getSysmanDeviceImp()->pDiagnosticsHandleContext->releaseDiagnosticsHandles();
298301
}
@@ -341,6 +344,15 @@ void LinuxSysmanImp::reInitSysmanDeviceResources() {
341344
if (getSysmanDeviceImp()->pFirmwareHandleContext->isFirmwareInitDone()) {
342345
getSysmanDeviceImp()->pFirmwareHandleContext->init();
343346
}
347+
if (getSysmanDeviceImp()->pMemoryHandleContext->isMemoryInitDone()) {
348+
getSysmanDeviceImp()->pMemoryHandleContext->init(getSysmanDeviceImp()->deviceHandles);
349+
}
350+
if (getSysmanDeviceImp()->pTempHandleContext->isTempInitDone()) {
351+
getSysmanDeviceImp()->pTempHandleContext->init(getSysmanDeviceImp()->deviceHandles);
352+
}
353+
if (getSysmanDeviceImp()->pPowerHandleContext->isPowerInitDone()) {
354+
getSysmanDeviceImp()->pPowerHandleContext->init(getSysmanDeviceImp()->deviceHandles, getCoreDeviceHandle());
355+
}
344356
}
345357

346358
ze_result_t LinuxSysmanImp::initDevice() {

level_zero/tools/source/sysman/memory/memory.cpp

+8-1
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,13 @@
1414

1515
namespace L0 {
1616

17-
MemoryHandleContext::~MemoryHandleContext() = default;
17+
MemoryHandleContext::~MemoryHandleContext() {
18+
releaseMemoryHandles();
19+
}
20+
21+
void MemoryHandleContext::releaseMemoryHandles() {
22+
handleList.clear();
23+
}
1824

1925
void MemoryHandleContext::createHandle(ze_device_handle_t deviceHandle) {
2026
std::unique_ptr<Memory> pMemory = std::make_unique<MemoryImp>(pOsSysman, deviceHandle);
@@ -33,6 +39,7 @@ ze_result_t MemoryHandleContext::init(std::vector<ze_device_handle_t> &deviceHan
3339
ze_result_t MemoryHandleContext::memoryGet(uint32_t *pCount, zes_mem_handle_t *phMemory) {
3440
std::call_once(initMemoryOnce, [this]() {
3541
this->init(pOsSysman->getDeviceHandles());
42+
this->memoryInitDone = true;
3643
});
3744
uint32_t handleListSize = static_cast<uint32_t>(handleList.size());
3845
uint32_t numToCopy = std::min(*pCount, handleListSize);

level_zero/tools/source/sysman/memory/memory.h

+6
Original file line number | Diff line number | Diff line change
@@ -36,14 +36,20 @@ struct MemoryHandleContext {
3636
ze_result_t init(std::vector<ze_device_handle_t> &deviceHandles);
3737

3838
ze_result_t memoryGet(uint32_t *pCount, zes_mem_handle_t *phMemory);
39+
void releaseMemoryHandles();
3940

4041
OsSysman *pOsSysman = nullptr;
4142
bool isLmemSupported = false;
4243
std::vector<std::unique_ptr<Memory>> handleList = {};
4344

45+
bool isMemoryInitDone() {
46+
return memoryInitDone;
47+
}
48+
4449
private:
4550
void createHandle(ze_device_handle_t deviceHandle);
4651
std::once_flag initMemoryOnce;
52+
bool memoryInitDone = false;
4753
};
4854

4955
} // namespace L0

level_zero/tools/source/sysman/power/power.cpp

+8-2
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2020-2022 Intel Corporation
2+
* Copyright (C) 2020-2023 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -14,10 +14,15 @@
1414

1515
namespace L0 {
1616

17-
PowerHandleContext::~PowerHandleContext() {
17+
void PowerHandleContext::releasePowerHandles() {
1818
for (Power *pPower : handleList) {
1919
delete pPower;
2020
}
21+
handleList.clear();
22+
}
23+
24+
PowerHandleContext::~PowerHandleContext() {
25+
releasePowerHandles();
2126
}
2227

2328
void PowerHandleContext::createHandle(ze_device_handle_t deviceHandle) {
@@ -43,6 +48,7 @@ ze_result_t PowerHandleContext::init(std::vector<ze_device_handle_t> &deviceHand
4348
void PowerHandleContext::initPower() {
4449
std::call_once(initPowerOnce, [this]() {
4550
this->init(pOsSysman->getDeviceHandles(), pOsSysman->getCoreDeviceHandle());
51+
this->powerInitDone = true;
4652
});
4753
}
4854

0 commit comments

Comments
 (0)