Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

1. Support GPU indexes in -i option #864

Merged
merged 1 commit into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion babel.so/src/action.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,8 @@ int mem_action::get_all_selected_gpus(void) {

// iterate over all available & compatible AMD GPUs
amd_gpus_found = fetch_gpu_list(hip_num_gpu_devices, mem_gpus_device_index,
property_device, property_device_id, property_device_all);
property_device, property_device_id, property_device_all,
property_device_index, property_device_index_all);
if (amd_gpus_found) {
if (do_mem_stress_test(mem_gpus_device_index))
return 0;
Expand Down
4 changes: 2 additions & 2 deletions docs/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
-g --listGpus List all the GPUs available in the machine that RVS supports and
has visibility of.

-i --indexes Comma separated list of GPU ids to run test on. This overrides
the device values specified for every actions in the
-i --indexes Comma-separated list of GPU ids/indexes to run the test on. This overrides
the device/device_index values specified for every action in the
configuration file, including the ‘all’ value.

-j --json Generate output file in JSON format.
Expand Down
4 changes: 2 additions & 2 deletions docs/ug1main.md
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,8 @@ The range is 0-5 with 5 being the highest verbose level.
that RVS supports and has visibility.
</td></tr>

<tr><td>-i</td><td>--indexes</td><td>Comma separated list of GPU ids to run test on.
This overrides the device values specified for every actions in the
<tr><td>-i</td><td>--indexes</td><td>Comma-separated list of GPU ids/indexes to run the test on.
This overrides the device/device_index values specified for every action in the
configuration file, including the ‘all’ value.
</td></tr>

Expand Down
259 changes: 147 additions & 112 deletions gpup.so/src/action.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -342,159 +342,194 @@ int gpup_action::property_io_links_get_value(uint16_t gpu_id) {
* @return run result
*/
int gpup_action::run(void) {
std::string msg;
int sts = 0;
rvs::action_result_t action_result;

// get the action name
if (property_get(RVS_CONF_NAME_KEY, &action_name)) {
msg = "Action name missing";
rvs::lp::Err(msg, MODULE_NAME_CAPS);
std::string msg;
int sts = 0;
rvs::action_result_t action_result;

// Action callback
action_result.state = rvs::actionstate::ACTION_COMPLETED;
action_result.status = rvs::actionstatus::ACTION_FAILED;
action_result.output = msg;
action_callback(&action_result);
// get the action name
if (property_get(RVS_CONF_NAME_KEY, &action_name)) {
msg = "Action name missing";
rvs::lp::Err(msg, MODULE_NAME_CAPS);

return -1;
}
// Action callback
action_result.state = rvs::actionstate::ACTION_COMPLETED;
action_result.status = rvs::actionstatus::ACTION_FAILED;
action_result.output = msg;
action_callback(&action_result);

return -1;
}

// get <device> property value (a list of gpu id)
if (int sts = property_get_device()) {
switch (sts) {
// get <device> property value (a list of gpu id)
if (int sts = property_get_device()) {
switch (sts) {
case 1:
msg = "Invalid 'device' key value.";
break;
case 2:
msg = "Missing 'device' key.";
break;
}
rvs::lp::Err(msg, MODULE_NAME_CAPS, action_name);
}
rvs::lp::Err(msg, MODULE_NAME_CAPS, action_name);

// Action callback
action_result.state = rvs::actionstate::ACTION_COMPLETED;
action_result.status = rvs::actionstatus::ACTION_FAILED;
action_result.output = msg;
action_callback(&action_result);
// Action callback
action_result.state = rvs::actionstate::ACTION_COMPLETED;
action_result.status = rvs::actionstatus::ACTION_FAILED;
action_result.output = msg;
action_callback(&action_result);

return -1;
return -1;
}

if (int sts = property_get_device_index()) {
switch (sts) {
case 1:
msg = "Invalid 'device_index' key value.";
break;
case 2:
msg = "Missing 'device_index' key.";
break;
}

// get the <deviceid> property value if provided
if (property_get_int<uint16_t>(RVS_CONF_DEVICEID_KEY,
&property_device_id, 0u)) {
msg = "Invalid 'deviceid' key value.";
rvs::lp::Err(msg, MODULE_NAME_CAPS, action_name);
property_device_index_all = true;
rvs::lp::Log(msg, rvs::loginfo);
}

// Action callback
action_result.state = rvs::actionstate::ACTION_COMPLETED;
action_result.status = rvs::actionstatus::ACTION_FAILED;
action_result.output = msg;
action_callback(&action_result);
if (property_device_index.size() || property_device.size()) {
property_device_all = false;
property_device_index_all = false;
}

return -1;
}
// get the <deviceid> property value if provided
if (property_get_int<uint16_t>(RVS_CONF_DEVICEID_KEY,
&property_device_id, 0u)) {
msg = "Invalid 'deviceid' key value.";
rvs::lp::Err(msg, MODULE_NAME_CAPS, action_name);

// extract properties and io_links properties names
property_split(JSON_PROP_NODE_NAME);
property_split(JSON_IO_LINK_PROP_NODE_NAME);
// Action callback
action_result.state = rvs::actionstate::ACTION_COMPLETED;
action_result.status = rvs::actionstatus::ACTION_FAILED;
action_result.output = msg;
action_callback(&action_result);

bjson = false; // already initialized in the default constructor
return -1;
}

// check for -j flag (json logging)
if (has_property("cli.-j")) {
bjson = true;
}
// extract properties and io_links properties names
property_split(JSON_PROP_NODE_NAME);
property_split(JSON_IO_LINK_PROP_NODE_NAME);

// get all AMD GPUs
vector<uint16_t> gpu;
gpu_get_all_gpu_id(&gpu);
bool b_gpu_found = false;
if (bjson){
if (rvs::lp::JsonActionStartNodeCreate(MODULE_NAME, action_name.c_str())){
rvs::lp::Err("json start create failed", MODULE_NAME_CAPS, action_name);
return 1;
}
bjson = false; // already initialized in the default constructor

}
// check for -j flag (json logging)
if (has_property("cli.-j")) {
bjson = true;
}

// iterate over AMD GPUs
for (auto it = gpu.begin(); it !=gpu.end(); ++it) {
// filter by gpu_id if needed
if (property_device_id > 0) {
uint16_t dev_id;
if (!rvs::gpulist::gpu2device(*it, &dev_id)) {
if (dev_id != property_device_id) {
continue;
}
} else {
msg = "Device ID not found for GPU " + std::to_string(*it);
rvs::lp::Err(msg, MODULE_NAME, action_name);
// get all AMD GPUs
vector<uint16_t> gpu_id;
vector<uint16_t> gpu_idx;

gpu_get_all_gpu_id(&gpu_id);
gpu_get_all_gpu_idx(&gpu_idx);

// Action callback
action_result.state = rvs::actionstate::ACTION_COMPLETED;
action_result.status = rvs::actionstatus::ACTION_FAILED;
action_result.output = msg;
action_callback(&action_result);
bool b_gpu_found = false;

return -1;
if (bjson){
if (rvs::lp::JsonActionStartNodeCreate(MODULE_NAME, action_name.c_str())){
rvs::lp::Err("json start create failed", MODULE_NAME_CAPS, action_name);
return 1;
}
}

// iterate over AMD GPUs
for (size_t i = 0; i < gpu_id.size(); i++) {

// filter by gpu_id if needed
if (property_device_id > 0) {
uint16_t dev_id;
if (!rvs::gpulist::gpu2device(gpu_id[i], &dev_id)) {
if (dev_id != property_device_id) {
continue;
}
} else {
msg = "Device ID not found for GPU " + std::to_string(gpu_id[i]);
rvs::lp::Err(msg, MODULE_NAME, action_name);

// Action callback
action_result.state = rvs::actionstate::ACTION_COMPLETED;
action_result.status = rvs::actionstatus::ACTION_FAILED;
action_result.output = msg;
action_callback(&action_result);

return -1;
}
}

// filter by device if needed
if (!property_device_all) {
if (std::find(property_device.begin(), property_device.end(), *it) ==
// filter by device if needed
if (!property_device_all && property_device.size()) {
if (std::find(property_device.begin(), property_device.end(), gpu_id[i]) ==
property_device.end()) {
continue;
}
continue;
}
b_gpu_found = true;
}

if (bjson){
json_root_node = json_node_create(std::string(module_name),
action_name.c_str(), rvs::logresults);
if(json_root_node)
rvs::lp::AddString(json_root_node, RVS_JSON_LOG_GPU_ID_KEY, std::to_string(*it));
}
// properties values
sts = property_get_value(*it);
sts = property_io_links_get_value(*it);

if (bjson) { // json logging stuff
RVSTRACE_
rvs::lp::AddString(json_root_node,"pass" , (sts == 0) ? "true" : "false");
rvs::lp::LogRecordFlush(json_root_node);
json_root_node = nullptr;
// filter by device if needed
if (!property_device_index_all && property_device_index.size()) {
if (std::find(property_device_index.cbegin(), property_device_index.cend(), gpu_idx[i]) ==
property_device_index.cend()) {
continue;
}
}

if (sts) {
RVSTRACE_
}
} // for all gpu_id
if(bjson){
rvs::lp::JsonActionEndNodeCreate();
b_gpu_found = true;

if (bjson){
json_root_node = json_node_create(std::string(module_name),
action_name.c_str(), rvs::logresults);
if(json_root_node)
rvs::lp::AddString(json_root_node, RVS_JSON_LOG_GPU_ID_KEY, std::to_string(gpu_id[i]));
}
if (!b_gpu_found) {
msg = "No device matches criteria from configuration. ";
rvs::lp::Err(msg, MODULE_NAME, action_name);
// properties values
sts = property_get_value(gpu_id[i]);
sts = property_io_links_get_value(gpu_id[i]);

// Action callback
action_result.state = rvs::actionstate::ACTION_COMPLETED;
action_result.status = rvs::actionstatus::ACTION_FAILED;
action_result.output = msg;
action_callback(&action_result);
if (bjson) { // json logging stuff
RVSTRACE_
rvs::lp::AddString(json_root_node,"pass" , (sts == 0) ? "true" : "false");
rvs::lp::LogRecordFlush(json_root_node);
json_root_node = nullptr;
}

return -1;
if (sts) {
RVSTRACE_
}
} // for all gpu_id

if(bjson){
rvs::lp::JsonActionEndNodeCreate();
}
if (!b_gpu_found) {
msg = "No device matches criteria from configuration. ";
rvs::lp::Err(msg, MODULE_NAME, action_name);

// Action callback
action_result.state = rvs::actionstate::ACTION_COMPLETED;
action_result.status = (!sts) ? rvs::actionstatus::ACTION_SUCCESS : rvs::actionstatus::ACTION_FAILED;
action_result.output = "GPUP Module action " + action_name + " completed";
action_result.status = rvs::actionstatus::ACTION_FAILED;
action_result.output = msg;
action_callback(&action_result);

return sts;
return -1;
}

// Action callback
action_result.state = rvs::actionstate::ACTION_COMPLETED;
action_result.status = (!sts) ? rvs::actionstatus::ACTION_SUCCESS : rvs::actionstatus::ACTION_FAILED;
action_result.output = "GPUP Module action " + action_name + " completed";
action_callback(&action_result);

return sts;
}

/*
Expand Down
4 changes: 3 additions & 1 deletion gst.so/src/action.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -587,8 +587,10 @@ int gst_action::get_all_selected_gpus(void) {
hip_num_gpu_devices = get_num_amd_gpu_devices();
if (hip_num_gpu_devices < 1)
return hip_num_gpu_devices;

amd_gpus_found = fetch_gpu_list(hip_num_gpu_devices, gst_gpus_device_index,
property_device, property_device_id, property_device_all);
property_device, property_device_id, property_device_all,
property_device_index, property_device_index_all);
// iterate over all available & compatible AMD GPUs

if (amd_gpus_found) {
Expand Down
5 changes: 3 additions & 2 deletions iet.so/src/action.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -659,12 +659,13 @@ int iet_action::get_all_selected_gpus(void) {
hipGetDeviceCount(&hip_num_gpu_devices);
if (hip_num_gpu_devices < 1)
return hip_num_gpu_devices;

rsmi_init(0);
// find compatible GPUs to run edp tests
amd_gpus_found = fetch_gpu_list(hip_num_gpu_devices, iet_gpus_device_index,
property_device, property_device_id, property_device_all, true); // MCM checks
property_device, property_device_id, property_device_all,
property_device_index, property_device_index_all, true); // MCM checks
if(!amd_gpus_found){

msg = "No devices match criteria from the test configuation.";
rvs::lp::Err(msg, MODULE_NAME_CAPS, action_name);
rsmi_shut_down();
Expand Down
Loading