Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Allow per-model metric configuration #405

Merged
merged 11 commits into from
Nov 6, 2024
3 changes: 2 additions & 1 deletion src/backend_model_instance.cc
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,8 @@ TritonModelInstance::TritonModelInstance(
model_->Server()->ResponseCacheEnabled();
MetricModelReporter::Create(
model_->ModelId(), model_->Version(), id, response_cache_enabled,
model_->IsDecoupled(), model_->Config().metric_tags(), &reporter_);
model_->IsDecoupled(), model_->Config().metric_tags(),
model_->Config().model_metrics(), &reporter_);
}
#endif // TRITON_ENABLE_METRICS
}
Expand Down
1 change: 1 addition & 0 deletions src/constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ constexpr char kInitialStateFolder[] = "initial_state";
// Metric names
constexpr char kPendingRequestMetric[] = "inf_pending_request_count";
constexpr char kModelLoadTimeMetric[] = "model_load_time";
constexpr char kFirstResponseHistogram[] = "first_response_histogram";

constexpr uint64_t NANOS_PER_SECOND = 1000000000;
constexpr uint64_t NANOS_PER_MILLIS = 1000000;
Expand Down
2 changes: 1 addition & 1 deletion src/ensemble_scheduler/ensemble_scheduler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1477,7 +1477,7 @@ EnsembleScheduler::EnsembleScheduler(
MetricModelReporter::Create(
model_id, 1 /* model_version */, METRIC_REPORTER_ID_CPU,
false /* response_cache_enabled */, is_decoupled, config.metric_tags(),
&metric_reporter_);
config.model_metrics(), &metric_reporter_);
}
#endif // TRITON_ENABLE_METRICS

Expand Down
2 changes: 1 addition & 1 deletion src/infer_response.cc
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ InferenceResponse::UpdateResponseMetrics() const
.count();
if (auto reporter = model_->MetricReporter()) {
reporter->ObserveHistogram(
"first_response_histogram",
kFirstResponseHistogram,
(now_ns - infer_start_ns_) / NANOS_PER_MILLIS);
}
}
Expand Down
39 changes: 32 additions & 7 deletions src/metric_model_reporter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ namespace triton { namespace core {
//
void
MetricReporterConfig::ParseConfig(
bool response_cache_enabled, bool is_decoupled)
bool response_cache_enabled, bool is_decoupled,
const inference::ModelMetrics& model_metrics)
{
// Global config only for now in config map
auto metrics_config_map = Metrics::ConfigMap();
Expand Down Expand Up @@ -74,6 +75,26 @@ MetricReporterConfig::ParseConfig(
// Set flag to signal to stats aggregator if caching is enabled or not
cache_enabled_ = response_cache_enabled;
is_decoupled_ = is_decoupled;

// Override default histogram options if set in model_metrics.
for (const auto& metric_control : model_metrics.metric_control()) {
const std::string& family_name =
metric_control.metric_identifier().family();

// If family name exists, override with new options.
if (metric_map_.find(family_name) != metric_map_.end()) {
// Copy protobuf RepeatedField to std::vector
const auto& buckets_proto = metric_control.histogram_options().buckets();
const prometheus::Histogram::BucketBoundaries buckets(
buckets_proto.begin(), buckets_proto.end());
histogram_options_[metric_map_.at(family_name)] = buckets;
} else {
// metric_control config may be extended to support backend metrics.
LOG_WARNING << "Metric family '" << family_name
<< "' in 'metric_identifier' is not a customizable metric in "
"Triton core.";
}
}
}

prometheus::Summary::Quantiles
Expand Down Expand Up @@ -120,6 +141,7 @@ MetricModelReporter::Create(
const ModelIdentifier& model_id, const int64_t model_version,
const int device, bool response_cache_enabled, bool is_decoupled,
const triton::common::MetricTagsMap& model_tags,
const inference::ModelMetrics& model_metrics,
std::shared_ptr<MetricModelReporter>* metric_model_reporter)
{
static std::mutex mtx;
Expand Down Expand Up @@ -148,21 +170,22 @@ MetricModelReporter::Create(

metric_model_reporter->reset(new MetricModelReporter(
model_id, model_version, device, response_cache_enabled, is_decoupled,
model_tags));
model_tags, model_metrics));
reporter_map.insert({hash_labels, *metric_model_reporter});
return Status::Success;
}

MetricModelReporter::MetricModelReporter(
const ModelIdentifier& model_id, const int64_t model_version,
const int device, bool response_cache_enabled, bool is_decoupled,
const triton::common::MetricTagsMap& model_tags)
const triton::common::MetricTagsMap& model_tags,
const inference::ModelMetrics& model_metrics)
{
std::map<std::string, std::string> labels;
GetMetricLabels(&labels, model_id, model_version, device, model_tags);

// Parse metrics config to control metric setup and behavior
config_.ParseConfig(response_cache_enabled, is_decoupled);
config_.ParseConfig(response_cache_enabled, is_decoupled, model_metrics);

// Initialize families and metrics
InitializeCounters(labels);
Expand Down Expand Up @@ -282,10 +305,11 @@ void
MetricModelReporter::InitializeHistograms(
const std::map<std::string, std::string>& labels)
{
// Update MetricReporterConfig::metric_map_ for new histograms.
// Only create response metrics if decoupled model to reduce metric output
if (config_.latency_histograms_enabled_) {
if (config_.is_decoupled_) {
histogram_families_["first_response_histogram"] =
histogram_families_[kFirstResponseHistogram] =
&Metrics::FamilyFirstResponseDuration();
}
}
Expand All @@ -294,8 +318,9 @@ MetricModelReporter::InitializeHistograms(
const auto& name = iter.first;
auto family_ptr = iter.second;
if (family_ptr) {
histograms_[name] = CreateMetric<prometheus::Histogram>(
*family_ptr, labels, config_.buckets_);
const auto& buckets = config_.histogram_options_[name];
histograms_[name] =
CreateMetric<prometheus::Histogram>(*family_ptr, labels, buckets);
}
}
}
Expand Down
25 changes: 20 additions & 5 deletions src/metric_model_reporter.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,9 @@ struct ModelIdentifier;
struct MetricReporterConfig {
#ifdef TRITON_ENABLE_METRICS
// Parses Metrics::ConfigMap and sets fields if specified
void ParseConfig(bool response_cache_enabled, bool is_decoupled);
void ParseConfig(
bool response_cache_enabled, bool is_decoupled,
const inference::ModelMetrics& model_metrics);
// Parses pairs of quantiles "quantile1:error1, quantile2:error2, ..."
// and overwrites quantiles_ field if successful.
prometheus::Summary::Quantiles ParseQuantiles(std::string options);
Expand All @@ -57,10 +59,12 @@ struct MetricReporterConfig {
bool latency_histograms_enabled_ = false;
// Create and use Summaries for per-model latency related metrics
bool latency_summaries_enabled_ = false;
// Buckets used for any histogram metrics. Each value represents
// a bucket boundary. For example, {100, 500, 2000, 5000} are latencies
// Default bucket boundaries used for each histogram metric. Each value
// represents a boundary. For example, {100, 500, 2000, 5000} are latencies
// in milliseconds in first_response_histogram.
prometheus::Histogram::BucketBoundaries buckets_ = {100, 500, 2000, 5000};
std::unordered_map<std::string, prometheus::Histogram::BucketBoundaries>
histogram_options_ = {{kFirstResponseHistogram, {100, 500, 2000, 5000}}};

// Quantiles used for any summary metrics. Each pair of values represents
// { quantile, error }. For example, {0.90, 0.01} means to compute the
// 90th percentile with 1% error on either side, so the approximate 90th
Expand All @@ -73,6 +77,14 @@ struct MetricReporterConfig {
bool cache_enabled_ = false;

bool is_decoupled_ = false;

private:
// Maps the metric family fullname to its lookup key. This field is required
// because the users are expected to configure metric configuration
// "ModelMetrics" with the full name displayed from metrics reporting while a
// different name is used internally. All new histograms must update the map.
const std::unordered_map<std::string, std::string> metric_map_ = {
{"nv_inference_first_response_histogram_ms", kFirstResponseHistogram}};
#endif // TRITON_ENABLE_METRICS
};

Expand All @@ -86,7 +98,9 @@ class MetricModelReporter {
const triton::core::ModelIdentifier& model_id,
const int64_t model_version, const int device,
bool response_cache_enabled, bool is_decoupled,
// FIXME: [DLIS-7497] Merge model_tags with model_metrics
const triton::common::MetricTagsMap& model_tags,
const inference::ModelMetrics& model_metrics,
std::shared_ptr<MetricModelReporter>* metric_model_reporter);

~MetricModelReporter();
Expand All @@ -112,7 +126,8 @@ class MetricModelReporter {
MetricModelReporter(
const ModelIdentifier& model_id, const int64_t model_version,
const int device, bool response_cache_enabled, bool is_decoupled,
const triton::common::MetricTagsMap& model_tags);
const triton::common::MetricTagsMap& model_tags,
const inference::ModelMetrics& model_metrics);

static void GetMetricLabels(
std::map<std::string, std::string>* labels,
Expand Down
14 changes: 8 additions & 6 deletions src/metrics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -109,12 +109,6 @@ Metrics::Metrics()
"execution per-model.")
.Register(*registry_)),

inf_first_response_histogram_ms_family_(
prometheus::BuildHistogram()
.Name("nv_inference_first_response_histogram_ms")
.Help("Duration from request to first response in milliseconds")
.Register(*registry_)),

model_load_time_family_(prometheus::BuildGauge()
.Name("nv_model_load_duration_secs")
.Help("Model load time in seconds")
Expand Down Expand Up @@ -155,6 +149,14 @@ Metrics::Metrics()
"microseconds")
.Register(*registry_)),

// Histograms
// New histograms must be added to MetricReporterConfig.metric_map_
inf_first_response_histogram_ms_family_(
prometheus::BuildHistogram()
.Name("nv_inference_first_response_histogram_ms")
.Help("Duration from request to first response in milliseconds")
.Register(*registry_)),

// Summaries
inf_request_summary_us_family_(
prometheus::BuildSummary()
Expand Down
6 changes: 4 additions & 2 deletions src/metrics.h
Original file line number Diff line number Diff line change
Expand Up @@ -312,8 +312,6 @@ class Metrics {
prometheus::Family<prometheus::Counter>&
inf_compute_output_duration_us_family_;
prometheus::Family<prometheus::Gauge>& inf_pending_request_count_family_;
prometheus::Family<prometheus::Histogram>&
inf_first_response_histogram_ms_family_;
prometheus::Family<prometheus::Gauge>& model_load_time_family_;

prometheus::Family<prometheus::Gauge>& pinned_memory_pool_total_family_;
Expand All @@ -330,6 +328,10 @@ class Metrics {
prometheus::Family<prometheus::Counter>& cache_num_misses_model_family_;
prometheus::Family<prometheus::Counter>& cache_miss_duration_us_model_family_;

// Histograms
prometheus::Family<prometheus::Histogram>&
inf_first_response_histogram_ms_family_;

// Summaries
prometheus::Family<prometheus::Summary>& inf_request_summary_us_family_;
prometheus::Family<prometheus::Summary>& inf_queue_summary_us_family_;
Expand Down
3 changes: 2 additions & 1 deletion src/model.cc
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,8 @@ Model::Init(const bool is_config_provided)
#ifdef TRITON_ENABLE_METRICS
MetricModelReporter::Create(
ModelId(), Version(), METRIC_REPORTER_ID_UTILITY, ResponseCacheEnabled(),
IsDecoupled(), Config().metric_tags(), &reporter_);
IsDecoupled(), Config().metric_tags(), Config().model_metrics(),
&reporter_);
#endif // TRITON_ENABLE_METRICS

return Status::Success;
Expand Down
46 changes: 45 additions & 1 deletion src/model_config_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,39 @@ ValidateNonLinearFormatIO(
return Status::Success;
}

// Helper function to validate that model_metrics contains all required data.
//
// Every 'metric_control' entry must provide:
//   - a 'metric_identifier' with a non-empty 'family', and
//   - a 'histogram_options' with a non-empty list of 'buckets' whose values
//     are in strictly ascending order.
//
// Returns Status::Success when every entry is well-formed (including when
// there are no entries at all); otherwise returns an INVALID_ARG status
// describing the first violation found.
Status
ValidateModelMetrics(const inference::ModelMetrics& model_metrics)
{
  for (const auto& metric_control : model_metrics.metric_control()) {
    if (!metric_control.has_metric_identifier()) {
      return Status(
          Status::Code::INVALID_ARG,
          "metric control must specify 'metric_identifier'");
    }

    if (metric_control.metric_identifier().family().empty()) {
      return Status(
          Status::Code::INVALID_ARG,
          "metric identifier must specify non-empty 'family'");
    }

    if (!metric_control.has_histogram_options()) {
      return Status(
          Status::Code::INVALID_ARG,
          "metric control must specify 'histogram_options'");
    }

    const auto& histogram_options = metric_control.histogram_options();
    const int bucket_count = histogram_options.buckets_size();
    if (bucket_count == 0) {
      return Status(
          Status::Code::INVALID_ARG,
          "histogram options must specify non-empty 'buckets'");
    }

    // The bucket values are later used verbatim as histogram bucket
    // boundaries, so reject lists that are not strictly ascending here, at
    // config-validation time, rather than handing them to the histogram.
    // The comparison is written as a negated '<' so that NaN values are
    // rejected as well.
    for (int idx = 1; idx < bucket_count; ++idx) {
      if (!(histogram_options.buckets(idx - 1) <
            histogram_options.buckets(idx))) {
        return Status(
            Status::Code::INVALID_ARG,
            "histogram options must specify 'buckets' in strictly ascending "
            "order");
      }
    }
  }

  return Status::Success;
}

} // namespace

Status
Expand Down Expand Up @@ -1591,7 +1624,7 @@ ValidateModelConfig(
}
}

// If ensemble scheduling is specified, validate it. Otherwise,
// If ensemble scheduling is specified, validate it. Otherwise,
// must validate platform and instance_group
if (config.has_ensemble_scheduling()) {
#ifdef TRITON_ENABLE_ENSEMBLE
Expand Down Expand Up @@ -1620,6 +1653,17 @@ ValidateModelConfig(
" cache.");
}

// If model_metrics is specified, validate it.
if (config.has_model_metrics()) {
#ifdef TRITON_ENABLE_METRICS
Status status = ValidateModelMetrics(config.model_metrics());
if (!status.IsOk()) {
return status;
}
#else
return Status(Status::Code::INVALID_ARG, "metrics not supported");
#endif // TRITON_ENABLE_METRICS
}
return Status::Success;
}

Expand Down
Loading