
Commit 2e38bf5

add TensorRT configuration to OrtProviderOptions (microsoft#6979)
* add TensorRT configurations in provider options
* Update ort_test_session.cc
* Update tensorrt_execution_provider.cc
* Update onnxruntime_pybind_state.cc
* Update main.cc
1 parent 783acb1 commit 2e38bf5

File tree

9 files changed: +221 -23 lines

include/onnxruntime/core/session/onnxruntime_c_api.h

Lines changed: 9 additions & 3 deletions
@@ -289,9 +289,15 @@ typedef struct OrtROCMProviderOptions {
 /// Options for the TensorRT provider that are passed to SessionOptionsAppendExecutionProvider_TensorRT
 /// </summary>
 typedef struct OrtTensorRTProviderOptions {
-  int device_id;
-  int has_user_compute_stream;
-  void* user_compute_stream;
+  int device_id;                                // CUDA device id.
+  int has_user_compute_stream;                  // indicator of a user-specified CUDA compute stream.
+  void* user_compute_stream;                    // user-specified CUDA compute stream.
+  int has_trt_options;                          // override environment variables with the following TensorRT settings at runtime.
+  size_t trt_max_workspace_size;                // maximum workspace size for TensorRT.
+  int trt_fp16_enable;                          // enable TensorRT FP16 precision. Default 0 = false, nonzero = true.
+  int trt_int8_enable;                          // enable TensorRT INT8 precision. Default 0 = false, nonzero = true.
+  const char* trt_int8_calibration_table_name;  // TensorRT INT8 calibration table name.
+  int trt_int8_use_native_calibration_table;    // use the native TensorRT-generated calibration table. Default 0 = false, nonzero = true.
 } OrtTensorRTProviderOptions;

 /// <summary>
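For reference, a minimal sketch of how a caller might fill the extended struct and hand it to a session through the C++ wrapper Ort::SessionOptions::AppendExecutionProvider_TensorRT (the same call the perftest change below uses). The model path is a placeholder:

// A minimal sketch, assuming onnxruntime_cxx_api.h from this tree is on the include path.
#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "trt_options_example");
  Ort::SessionOptions session_options;

  OrtTensorRTProviderOptions trt_options{};      // zero-initialize: device 0, no user stream
  trt_options.has_trt_options = 1;               // take the fields below instead of env vars
  trt_options.trt_max_workspace_size = 1 << 30;  // 1 GiB workspace
  trt_options.trt_fp16_enable = 1;
  session_options.AppendExecutionProvider_TensorRT(trt_options);

  Ort::Session session(env, "model.onnx", session_options);  // "model.onnx" is a placeholder
  return 0;
}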

onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc

Lines changed: 35 additions & 15 deletions
@@ -404,30 +404,50 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
     min_subgraph_size_ = std::stoi(min_subgraph_size_env);
   }

-  const std::string max_workspace_size_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kMaxWorkspaceSize);
-  if (!max_workspace_size_env.empty()) {
-    max_workspace_size_ = std::stoull(max_workspace_size_env);
+  if (info.has_trt_options) {
+    max_workspace_size_ = info.max_workspace_size;
+  } else {
+    const std::string max_workspace_size_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kMaxWorkspaceSize);
+    if (!max_workspace_size_env.empty()) {
+      max_workspace_size_ = std::stoull(max_workspace_size_env);
+    }
   }

-  const std::string fp16_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kFP16Enable);
-  if (!fp16_enable_env.empty()) {
-    fp16_enable_ = (std::stoi(fp16_enable_env) == 0 ? false : true);
+  if (info.has_trt_options) {
+    fp16_enable_ = info.fp16_enable;
+  } else {
+    const std::string fp16_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kFP16Enable);
+    if (!fp16_enable_env.empty()) {
+      fp16_enable_ = (std::stoi(fp16_enable_env) == 0 ? false : true);
+    }
   }

-  const std::string int8_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kINT8Enable);
-  if (!int8_enable_env.empty()) {
-    int8_enable_ = (std::stoi(int8_enable_env) == 0 ? false : true);
+  if (info.has_trt_options) {
+    int8_enable_ = info.int8_enable;
+  } else {
+    const std::string int8_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kINT8Enable);
+    if (!int8_enable_env.empty()) {
+      int8_enable_ = (std::stoi(int8_enable_env) == 0 ? false : true);
+    }
   }

   if (int8_enable_) {
-    const std::string int8_calibration_cache_name_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kINT8CalibrationTableName);
-    if (!int8_calibration_cache_name_env.empty()) {
-      int8_calibration_cache_name_ = int8_calibration_cache_name_env;
+    if (info.has_trt_options) {
+      int8_calibration_cache_name_ = info.int8_calibration_table_name;
+    } else {
+      const std::string int8_calibration_cache_name_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kINT8CalibrationTableName);
+      if (!int8_calibration_cache_name_env.empty()) {
+        int8_calibration_cache_name_ = int8_calibration_cache_name_env;
+      }
     }

-    const std::string int8_use_native_tensorrt_calibration_table_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kINT8UseNativeTensorrtCalibrationTable);
-    if (!int8_use_native_tensorrt_calibration_table_env.empty()) {
-      int8_use_native_tensorrt_calibration_table_ = (std::stoi(int8_use_native_tensorrt_calibration_table_env) == 0 ? false : true);
+    if (info.has_trt_options) {
+      int8_use_native_tensorrt_calibration_table_ = info.int8_use_native_calibration_table;
+    } else {
+      const std::string int8_use_native_tensorrt_calibration_table_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kINT8UseNativeTensorrtCalibrationTable);
+      if (!int8_use_native_tensorrt_calibration_table_env.empty()) {
+        int8_use_native_tensorrt_calibration_table_ = (std::stoi(int8_use_native_tensorrt_calibration_table_env) == 0 ? false : true);
+      }
     }
   }
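The pattern is identical for every field: when info.has_trt_options is set, the struct value wins outright; otherwise the provider keeps honoring the environment variables named by tensorrt_env_vars. A sketch of the two resulting configuration paths (the ORT_TENSORRT_* variable names and the POSIX setenv call are assumptions here, shown only for illustration):

#include <cstdlib>                // setenv (POSIX; assumption)
#include <onnxruntime_c_api.h>    // OrtTensorRTProviderOptions

// Path 1: leave has_trt_options at 0 and configure through the environment.
void configure_via_env() {
  setenv("ORT_TENSORRT_MAX_WORKSPACE_SIZE", "1073741824", 1);
  setenv("ORT_TENSORRT_FP16_ENABLE", "1", 1);
}

// Path 2: set has_trt_options; the struct fields then override the
// corresponding environment variables for this provider instance.
void configure_via_struct(OrtTensorRTProviderOptions& opts) {
  opts.has_trt_options = 1;
  opts.trt_max_workspace_size = 1073741824;  // same 1 GiB as above
  opts.trt_fp16_enable = 1;
}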

onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h

Lines changed: 6 additions & 0 deletions
@@ -71,6 +71,12 @@ struct TensorrtExecutionProviderInfo {
   int device_id{0};
   bool has_user_compute_stream{false};
   void* user_compute_stream{nullptr};
+  bool has_trt_options{false};
+  size_t max_workspace_size{1 << 30};
+  bool fp16_enable{false};
+  bool int8_enable{false};
+  std::string int8_calibration_table_name{""};
+  bool int8_use_native_calibration_table{false};
 };

 // Information to construct kernel function state.
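These in-tree defaults match the initializers used for the public struct elsewhere in this commit: 1 << 30 bytes (1 GiB) of TensorRT workspace, FP16 and INT8 off, no calibration table.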

onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc

Lines changed: 6 additions & 0 deletions
@@ -49,6 +49,12 @@ struct Tensorrt_Provider : Provider {
   info.device_id = options.device_id;
   info.has_user_compute_stream = options.has_user_compute_stream;
   info.user_compute_stream = options.user_compute_stream;
+  info.has_trt_options = options.has_trt_options;
+  info.max_workspace_size = options.trt_max_workspace_size;
+  info.fp16_enable = options.trt_fp16_enable;
+  info.int8_enable = options.trt_int8_enable;
+  info.int8_calibration_table_name = options.trt_int8_calibration_table_name == nullptr ? "" : options.trt_int8_calibration_table_name;
+  info.int8_use_native_calibration_table = options.trt_int8_use_native_calibration_table;
   return std::make_shared<TensorrtProviderFactory>(info);
 }
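Note the nullptr guard on trt_int8_calibration_table_name: a C caller can legally leave the pointer unset, so the factory maps nullptr to an empty std::string before the provider sees it. The remaining fields are plain value copies from the public C struct into the internal TensorrtExecutionProviderInfo.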

onnxruntime/python/onnxruntime_pybind_state.cc

Lines changed: 55 additions & 1 deletion
@@ -490,7 +490,61 @@ static void RegisterExecutionProviders(InferenceSession* sess, const std::vector
                                            sess->GetSessionOptions().enable_cpu_mem_arena));
     } else if (type == kTensorrtExecutionProvider) {
 #ifdef USE_TENSORRT
-      OrtTensorRTProviderOptions params{0, 0, nullptr};
+      OrtTensorRTProviderOptions params{0, 0, nullptr, 0, 1 << 30, 0, 0, nullptr, 0};
+      std::string trt_int8_calibration_table_name;
+      auto it = provider_options_map.find(type);
+      if (it != provider_options_map.end()) {
+        for (auto option : it->second) {
+          if (option.first == "has_trt_options") {
+            if (option.second == "True" || option.second == "true") {
+              params.has_trt_options = true;
+            } else if (option.second == "False" || option.second == "false") {
+              params.has_trt_options = false;
+            } else {
+              ORT_THROW("[ERROR] [TensorRT] The value for the key 'has_trt_options' should be a boolean, i.e. 'True' or 'False'. Default value is 'False'.\n");
+            }
+          } else if (option.first == "trt_max_workspace_size") {
+            if (!option.second.empty()) {
+              params.trt_max_workspace_size = std::stoull(option.second);
+            } else {
+              ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_max_workspace_size' should be a number in bytes, e.g. '1073741824'.\n");
+            }
+          } else if (option.first == "trt_fp16_enable") {
+            if (option.second == "True" || option.second == "true") {
+              params.trt_fp16_enable = true;
+            } else if (option.second == "False" || option.second == "false") {
+              params.trt_fp16_enable = false;
+            } else {
+              ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_fp16_enable' should be a boolean, i.e. 'True' or 'False'. Default value is 'False'.\n");
+            }
+          } else if (option.first == "trt_int8_enable") {
+            if (option.second == "True" || option.second == "true") {
+              params.trt_int8_enable = true;
+            } else if (option.second == "False" || option.second == "false") {
+              params.trt_int8_enable = false;
+            } else {
+              ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_int8_enable' should be a boolean, i.e. 'True' or 'False'. Default value is 'False'.\n");
+            }
+          } else if (option.first == "trt_int8_calibration_table_name") {
+            if (!option.second.empty()) {
+              trt_int8_calibration_table_name = option.second;
+              params.trt_int8_calibration_table_name = trt_int8_calibration_table_name.c_str();
+            } else {
+              ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_int8_calibration_table_name' should be a file name, e.g. 'cal_table'.\n");
+            }
+          } else if (option.first == "trt_int8_use_native_calibration_table") {
+            if (option.second == "True" || option.second == "true") {
+              params.trt_int8_use_native_calibration_table = true;
+            } else if (option.second == "False" || option.second == "false") {
+              params.trt_int8_use_native_calibration_table = false;
+            } else {
+              ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_int8_use_native_calibration_table' should be a boolean, i.e. 'True' or 'False'. Default value is 'False'.\n");
+            }
+          } else {
+            ORT_THROW("Invalid TensorRT EP option: ", option.first);
+          }
+        }
+      }
       RegisterExecutionProvider(sess, *onnxruntime::CreateExecutionProviderFactory_Tensorrt(&params));
 #endif
     } else if (type == kMIGraphXExecutionProvider) {
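Because these options arrive from Python as strings in provider_options_map, booleans are matched textually against 'True'/'true' and 'False'/'false' rather than parsed as integers. On the Python side, a call along the lines of sess.set_providers(['TensorrtExecutionProvider'], [{'has_trt_options': 'True', 'trt_fp16_enable': 'True'}]) would feed this path; the exact Python entry point is not part of this diff, so treat that call shape as an assumption. Also note the std::string local trt_int8_calibration_table_name: it exists to keep the c_str() pointer alive until RegisterExecutionProvider runs.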

onnxruntime/test/onnx/main.cc

Lines changed: 7 additions & 1 deletion
@@ -312,7 +312,13 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
   OrtTensorRTProviderOptions tensorrt_options{
       0,
       0,
-      nullptr};
+      nullptr,
+      0,
+      1 << 30,
+      0,
+      0,
+      nullptr,
+      0};

   OrtCUDAProviderOptions cuda_options{
       0,
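This aggregate initializer is positional, so the six appended values must track the new field order in onnxruntime_c_api.h exactly: has_trt_options = 0, trt_max_workspace_size = 1 << 30, trt_fp16_enable = 0, trt_int8_enable = 0, trt_int8_calibration_table_name = nullptr, trt_int8_use_native_calibration_table = 0.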

onnxruntime/test/perftest/command_args_parser.cc

Lines changed: 8 additions & 0 deletions
@@ -62,6 +62,14 @@ namespace perftest {
       "\t [OpenVINO only] [num_of_threads]: Overrides the accelerator hardware type and precision with these values at runtime.\n"
       "\t [Usage]: -e <provider_name> -i '<key1>|<value1> <key2>|<value2>'\n\n"
       "\t [Example] [For OpenVINO EP] -e openvino -i 'device_type|CPU_FP32 enable_vpu_fast_compile|true num_of_threads|5'\n"
+      "\t [TensorRT only] [has_trt_options]: Overrides TensorRT environment variables (if any) with the following settings at runtime.\n"
+      "\t [TensorRT only] [trt_max_workspace_size]: Set the TensorRT maximum workspace size in bytes.\n"
+      "\t [TensorRT only] [trt_fp16_enable]: Enable TensorRT FP16 precision.\n"
+      "\t [TensorRT only] [trt_int8_enable]: Enable TensorRT INT8 precision.\n"
+      "\t [TensorRT only] [trt_int8_calibration_table_name]: Specify the INT8 calibration table name.\n"
+      "\t [TensorRT only] [trt_int8_use_native_calibration_table]: Use the native TensorRT-generated calibration table.\n"
+      "\t [Usage]: -e <provider_name> -i '<key1>|<value1> <key2>|<value2>'\n\n"
+      "\t [Example] [For TensorRT EP] -e tensorrt -i 'has_trt_options|true trt_fp16_enable|true trt_int8_enable|true trt_int8_calibration_table_name|calibration.flatbuffers trt_int8_use_native_calibration_table|false'\n"
       "\t-h: help\n");
 }
 #ifdef _WIN32
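With these flags wired in, a perftest run can exercise the new code path directly, e.g. ./onnxruntime_perf_test -e tensorrt -i 'has_trt_options|true trt_fp16_enable|true' model.onnx (the binary name and model path shown are placeholders).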

onnxruntime/test/perftest/ort_test_session.cc

Lines changed: 94 additions & 2 deletions
@@ -62,8 +62,100 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
 #endif
   } else if (provider_name == onnxruntime::kTensorrtExecutionProvider) {
 #ifdef USE_TENSORRT
-    Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Tensorrt(session_options, 0));
-    Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
+    bool has_trt_options = false;
+    size_t trt_max_workspace_size = 1 << 30;
+    bool trt_fp16_enable = false;
+    bool trt_int8_enable = false;
+    std::string trt_int8_calibration_table_name = "";
+    bool trt_int8_use_native_calibration_table = false;
+
+#ifdef _MSC_VER
+    std::string ov_string = ToMBString(performance_test_config.run_config.ep_runtime_config_string);
+#else
+    std::string ov_string = performance_test_config.run_config.ep_runtime_config_string;
+#endif
+    std::istringstream ss(ov_string);
+    std::string token;
+    while (ss >> token) {
+      if (token == "") {
+        continue;
+      }
+      auto pos = token.find("|");
+      if (pos == std::string::npos || pos == 0 || pos == token.length()) {
+        ORT_THROW("[ERROR] [TensorRT] Use a '|' to separate the key and value for the run-time option you are trying to use.\n");
+      }
+
+      auto key = token.substr(0, pos);
+      auto value = token.substr(pos + 1);
+      if (key == "has_trt_options") {
+        if (value == "true" || value == "True") {
+          has_trt_options = true;
+        } else if (value == "false" || value == "False") {
+          has_trt_options = false;
+        } else {
+          ORT_THROW("[ERROR] [TensorRT] The value for the key 'has_trt_options' should be a boolean, i.e. true or false. Default value is false.\n");
+        }
+      } else if (key == "trt_max_workspace_size") {
+        if (!value.empty()) {
+          trt_max_workspace_size = std::stoull(value);
+        } else {
+          ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_max_workspace_size' should be a number in bytes.\n");
+        }
+      } else if (key == "trt_fp16_enable") {
+        if (value == "true" || value == "True") {
+          trt_fp16_enable = true;
+        } else if (value == "false" || value == "False") {
+          trt_fp16_enable = false;
+        } else {
+          ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_fp16_enable' should be a boolean, i.e. true or false. Default value is false.\n");
+        }
+      } else if (key == "trt_int8_enable") {
+        if (value == "true" || value == "True") {
+          trt_int8_enable = true;
+        } else if (value == "false" || value == "False") {
+          trt_int8_enable = false;
+        } else {
+          ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_int8_enable' should be a boolean, i.e. true or false. Default value is false.\n");
+        }
+      } else if (key == "trt_int8_calibration_table_name") {
+        if (!value.empty()) {
+          trt_int8_calibration_table_name = value;
+        } else {
+          ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_int8_calibration_table_name' should be a non-empty string.\n");
+        }
+      } else if (key == "trt_int8_use_native_calibration_table") {
+        if (value == "true" || value == "True") {
+          trt_int8_use_native_calibration_table = true;
+        } else if (value == "false" || value == "False") {
+          trt_int8_use_native_calibration_table = false;
+        } else {
+          ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_int8_use_native_calibration_table' should be a boolean, i.e. true or false. Default value is false.\n");
+        }
+      } else {
+        ORT_THROW("[ERROR] [TensorRT] Unknown option key. Choose from the following run-time options available for TensorRT: ['has_trt_options', 'trt_max_workspace_size', 'trt_fp16_enable', 'trt_int8_enable', 'trt_int8_calibration_table_name', 'trt_int8_use_native_calibration_table'].\n");
+      }
+    }
+    OrtTensorRTProviderOptions tensorrt_options;
+    tensorrt_options.device_id = 0;
+    tensorrt_options.has_user_compute_stream = 0;
+    tensorrt_options.user_compute_stream = nullptr;
+    tensorrt_options.has_trt_options = has_trt_options;
+    tensorrt_options.trt_max_workspace_size = trt_max_workspace_size;
+    tensorrt_options.trt_fp16_enable = trt_fp16_enable;
+    tensorrt_options.trt_int8_enable = trt_int8_enable;
+    tensorrt_options.trt_int8_calibration_table_name = trt_int8_calibration_table_name.c_str();
+    tensorrt_options.trt_int8_use_native_calibration_table = trt_int8_use_native_calibration_table;
+    session_options.AppendExecutionProvider_TensorRT(tensorrt_options);
+
+    OrtCUDAProviderOptions cuda_options{
+        0,
+        static_cast<OrtCudnnConvAlgoSearch>(performance_test_config.run_config.cudnn_conv_algo),
+        std::numeric_limits<size_t>::max(),
+        0,
+        !performance_test_config.run_config.do_cuda_copy_in_separate_stream,
+        0,
+        nullptr};
+    session_options.AppendExecutionProvider_CUDA(cuda_options);
 #else
     ORT_THROW("TensorRT is not supported in this build\n");
 #endif
onnxruntime/test/util/default_providers.cc

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ std::unique_ptr<IExecutionProvider> DefaultCpuExecutionProvider(bool enable_aren

 std::unique_ptr<IExecutionProvider> DefaultTensorrtExecutionProvider() {
 #ifdef USE_TENSORRT
-  OrtTensorRTProviderOptions params{0, 0, nullptr};
+  OrtTensorRTProviderOptions params{0, 0, nullptr, 0, 1 << 30, 0, 0, nullptr, 0};
   if (auto factory = CreateExecutionProviderFactory_Tensorrt(&params))
     return factory->CreateProvider();
 #endif
