From 9b91e63852a0804da50caa91b6acdd0bb5a94d3d Mon Sep 17 00:00:00 2001 From: Aki-07 Date: Sat, 18 Oct 2025 14:49:25 +0530 Subject: [PATCH 1/4] feat(eval): extend metric models and service --- src/app/core/models/Eval.ts | 20 ++++++++++++++++++++ src/app/core/services/eval.service.ts | 8 ++++++++ 2 files changed, 28 insertions(+) diff --git a/src/app/core/models/Eval.ts b/src/app/core/models/Eval.ts index 8b910a34..cdc0ff4b 100644 --- a/src/app/core/models/Eval.ts +++ b/src/app/core/models/Eval.ts @@ -23,6 +23,26 @@ export declare interface EvalMetric { metricName: string; threshold: number; + criterion?: unknown; +} + +export declare interface MetricValueInfo { + defaultThreshold?: number; + minThreshold?: number; + maxThreshold?: number; + step?: number; +} + +export declare interface MetricInfo { + metricName: string; + description?: string; + metricValueInfo?: MetricValueInfo; +} + +export declare interface EvalMetricConfig extends EvalMetric { + description?: string; + metricValueInfo?: MetricValueInfo; + criterion?: unknown; } export const DEFAULT_EVAL_METRICS: EvalMetric[] = [ diff --git a/src/app/core/services/eval.service.ts b/src/app/core/services/eval.service.ts index 8dfdd873..0defe869 100644 --- a/src/app/core/services/eval.service.ts +++ b/src/app/core/services/eval.service.ts @@ -88,6 +88,14 @@ export class EvalService { }); } + listMetricsInfo(appName: string) { + if (this.apiServerDomain != undefined) { + const url = this.apiServerDomain + `/apps/${appName}/metrics-info`; + return this.http.get(url, {}); + } + return new Observable(); + } + listEvalResults(appName: string) { if (this.apiServerDomain != undefined) { const url = this.apiServerDomain + `/apps/${appName}/eval_results`; From bf6f2f6c709527b45815ca2e2b9a510bbddea913 Mon Sep 17 00:00:00 2001 From: Aki-07 Date: Sat, 18 Oct 2025 14:49:33 +0530 Subject: [PATCH 2/4] feat(eval-tab): support dynamic metric selection and results --- .../eval-tab/eval-tab.component.html | 71 ++++- 
.../eval-tab/eval-tab.component.scss | 72 +++++ .../components/eval-tab/eval-tab.component.ts | 264 ++++++++++++++++-- 3 files changed, 386 insertions(+), 21 deletions(-) diff --git a/src/app/components/eval-tab/eval-tab.component.html b/src/app/components/eval-tab/eval-tab.component.html index fff5aac6..e7d4753d 100644 --- a/src/app/components/eval-tab/eval-tab.component.html +++ b/src/app/components/eval-tab/eval-tab.component.html @@ -63,6 +63,36 @@
@if (!showEvalHistory()) {
+
+ + Evaluation metrics + + @for (metric of metricOptions; track metric.metricName) { + + {{ metric.metricName }} + @if (metric.description) { + — {{ metric.description }} + } + + } + + + @if (selectedMetricNames.length === 0) { +
+ Select at least one metric before running an evaluation. +
+ } @else { +
+ @for (metric of metricOptions; track metric.metricName) { + @if (metric.selected) { + + {{ metric.metricName }} · threshold: {{ metric.threshold }} + + } + } +
+ } +
history @@ -128,9 +158,15 @@
@if (getEvalMetrics(evalResult)) {
- @for (evalMetric of getEvalMetrics(evalResult); track evalMetric) { - {{ evalMetric.metricName }}: - {{ evalMetric.threshold }} + @for (evalMetric of getEvalMetrics(evalResult); track evalMetric.metricName) { + + {{ evalMetric.metricName }} · Threshold: {{ evalMetric.threshold }} + @if (evalMetric.score !== undefined) { + · Score: {{ evalMetric.score }} + } + @if (evalMetric.evalStatus) { + · Status: {{ evalMetric.evalStatus }} + } }
@@ -155,6 +191,34 @@
{{ evalResult.finalEvalStatus == 1 ? "PASS": "FAIL"}}
+ @if (evalResult.overallEvalMetricResults?.length) { +
+ @for (metricResult of evalResult.overallEvalMetricResults; track metricResult.metricName) { +
+ + {{ metricResult.metricName }} + + @if (metricResult.score !== undefined) { + + Score: {{ metricResult.score }} + + } + @if (metricResult.threshold !== undefined) { + + Threshold: {{ metricResult.threshold }} + + } +
+ } +
+ }
} @@ -184,4 +248,3 @@ } - diff --git a/src/app/components/eval-tab/eval-tab.component.scss b/src/app/components/eval-tab/eval-tab.component.scss index dadc466c..85c89a90 100644 --- a/src/app/components/eval-tab/eval-tab.component.scss +++ b/src/app/components/eval-tab/eval-tab.component.scss @@ -165,6 +165,45 @@ width: 100%; } +.metric-selection { + display: flex; + flex-direction: column; + margin-top: 12px; + gap: 8px; +} + +.metric-select-field { + width: 100%; +} + +.metric-option-description { + color: var(--eval-tab-metric-option-description-color, #9aa0a6); + font-size: 12px; + margin-left: 4px; +} + +.metric-selection-helper { + color: var(--eval-tab-metric-selection-helper-color, #9aa0a6); + font-size: 12px; +} + +.selected-metric-summary { + display: flex; + flex-wrap: wrap; + gap: 6px; + font-size: 12px; + color: var(--eval-tab-selected-metric-summary-color, #9aa0a6); +} + +.selected-metric-chip { + background-color: var( + --eval-tab-selected-metric-chip-background, + rgba(255, 255, 255, 0.08) + ); + border-radius: 12px; + padding: 4px 8px; +} + .evaluation-history-icon { cursor: pointer; margin-top: 4px; @@ -271,6 +310,39 @@ width: 100%; margin-top: 15px; } + + &__metric-results { + display: flex; + flex-wrap: wrap; + gap: 6px; + margin-top: 8px; + } + + &__metric-result { + display: inline-flex; + align-items: center; + gap: 6px; + border-radius: 12px; + padding: 4px 8px; + background-color: var( + --eval-tab-status-card-metric-result-background, + rgba(255, 255, 255, 0.08) + ); + font-size: 12px; + color: var(--eval-tab-status-card-metric-result-color, #e8eaed); + + &--pass { + color: var(--eval-tab-status-card-metric-result-pass-color, #1e8e3e); + } + + &--fail { + color: var(--eval-tab-status-card-metric-result-fail-color, #d93025); + } + + &--neutral { + color: var(--eval-tab-status-card-metric-result-neutral-color, #9aa0a6); + } + } } .eval-spinner { diff --git a/src/app/components/eval-tab/eval-tab.component.ts 
b/src/app/components/eval-tab/eval-tab.component.ts index fb5f0332..3a8880af 100644 --- a/src/app/components/eval-tab/eval-tab.component.ts +++ b/src/app/components/eval-tab/eval-tab.component.ts @@ -23,7 +23,7 @@ import { MatTableDataSource, MatTable, MatColumnDef, MatHeaderCellDef, MatHeader import {BehaviorSubject, of} from 'rxjs'; import {catchError} from 'rxjs/operators'; -import {DEFAULT_EVAL_METRICS, EvalMetric, EvalCase} from '../../core/models/Eval'; +import {DEFAULT_EVAL_METRICS, EvalMetric, EvalCase, EvalMetricConfig, MetricValueInfo} from '../../core/models/Eval'; import {Session} from '../../core/models/Session'; import {Invocation} from '../../core/models/Eval'; import {EvalService, EVAL_SERVICE} from '../../core/services/eval.service'; @@ -37,6 +37,10 @@ import { MatIcon } from '@angular/material/icon'; import { MatTooltip } from '@angular/material/tooltip'; import { NgClass } from '@angular/common'; import { MatProgressSpinner } from '@angular/material/progress-spinner'; +import { MatFormField } from '@angular/material/form-field'; +import { MatLabel } from '@angular/material/form-field'; +import { MatSelect } from '@angular/material/select'; +import { MatOption } from '@angular/material/core'; interface EvaluationResult { @@ -50,6 +54,10 @@ interface EvaluationResult { sessionDetails: any; } +interface MetricOption extends EvalMetricConfig { + selected: boolean; +} + interface UIEvaluationResult { isToggled: boolean; evaluationResults: EvaluationResult[]; @@ -96,6 +104,10 @@ interface AppEvaluationResult { MatRowDef, MatRow, MatProgressSpinner, + MatFormField, + MatLabel, + MatSelect, + MatOption, ], }) export class EvalTabComponent implements OnInit, OnChanges { @@ -128,6 +140,9 @@ export class EvalTabComponent implements OnInit, OnChanges { evalRunning = signal(false); evalMetrics: EvalMetric[] = DEFAULT_EVAL_METRICS; + metricOptions: MetricOption[] = []; + selectedMetricNames: string[] = + DEFAULT_EVAL_METRICS.map((metric) => metric.metricName); 
// Key: evalSetId // Value: EvaluationResult[] @@ -151,10 +166,203 @@ export class EvalTabComponent implements OnInit, OnChanges { }); } + private loadMetricsInfo() { + const appName = this.appName(); + + if (!appName) { + this.initializeMetricOptions([]); + return; + } + + this.evalService.listMetricsInfo(appName) + .pipe(catchError(() => of({metrics_info: []}))) + .subscribe((response: any) => { + const metricsInfo = response?.metrics_info ?? []; + this.initializeMetricOptions(metricsInfo); + }); + } + + private initializeMetricOptions(rawMetrics: any[]) { + const previousOptions = + new Map(this.metricOptions.map((metric) => [metric.metricName, metric])); + const selectedNamesBefore = new Set(this.selectedMetricNames); + + const metrics: MetricOption[] = rawMetrics + .map((rawMetric: any) => { + const metricName = + rawMetric?.metricName ?? + rawMetric?.metric_name ?? + rawMetric?.name ?? ''; + if (!metricName) { + return null; + } + + const previous = + previousOptions.get(metricName); + const metricValueInfo = + this.normalizeMetricValueInfo( + rawMetric?.metricValueInfo ?? + rawMetric?.metric_value_info); + const threshold = + previous?.threshold ?? + this.findThreshold(metricName) ?? + this.getDefaultThreshold( + metricValueInfo); + // Set.has() returns a boolean, never null/undefined, so a further `??` + // fallback could never run; defaults apply below if nothing is selected. + const selected = + previous?.selected ?? + selectedNamesBefore.has(metricName); + + return { + metricName, + description: + rawMetric?.description ?? + previous?.description, + metricValueInfo, + threshold: + threshold ?? + this.getDefaultThreshold( + metricValueInfo), + selected: !!selected, + criterion: + previous?.criterion ?? 
+ rawMetric?.criterion, + } as MetricOption; + }) + .filter((metric) => !!metric) as + MetricOption[]; + + if (metrics.length === 0) { + this.metricOptions = this.buildFallbackMetricOptions(); + } else { + this.metricOptions = metrics; + if (!this.metricOptions.some((metric) => metric.selected)) { + this.metricOptions.forEach((metric) => { + metric.selected = this.isDefaultMetric(metric.metricName); + }); + } + } + + this.selectedMetricNames = + this.metricOptions.filter((metric) => metric.selected) + .map((metric) => metric.metricName); + this.syncEvalMetricsFromOptions(); + this.changeDetectorRef.detectChanges(); + } + + private buildFallbackMetricOptions(): MetricOption[] { + return DEFAULT_EVAL_METRICS.map((metric) => ({ + metricName: metric.metricName, + threshold: metric.threshold, + selected: true, + description: '', + metricValueInfo: undefined, + })) as MetricOption[]; + } + + private normalizeMetricValueInfo(raw: any): MetricValueInfo|undefined { + if (!raw) { + return undefined; + } + + const toNumber = + (value: unknown|undefined): number|undefined => { + if (value === null || value === undefined) { + return undefined; + } + const parsed = Number(value); + return isNaN(parsed) ? undefined : parsed; + }; + + return { + defaultThreshold: toNumber( + raw.defaultThreshold ?? raw.default_threshold ?? raw.default_value ?? + raw.default), + minThreshold: toNumber( + raw.minThreshold ?? raw.min_threshold ?? raw.min_value ?? raw.min), + maxThreshold: toNumber( + raw.maxThreshold ?? raw.max_threshold ?? raw.max_value ?? raw.max), + step: toNumber(raw.step ?? raw.thresholdStep ?? raw.threshold_step), + }; + } + + private getDefaultThreshold(metricValueInfo: MetricValueInfo|undefined) { + return metricValueInfo?.defaultThreshold ?? 
1; + } + + private findThreshold(metricName: string): number|undefined { + const existing = + this.evalMetrics.find((metric) => metric.metricName === metricName); + if (existing) { + return existing.threshold; + } + const fallback = + DEFAULT_EVAL_METRICS.find((metric) => metric.metricName === metricName); + return fallback?.threshold; + } + + private isDefaultMetric(metricName: string): boolean { + return DEFAULT_EVAL_METRICS.some( + (metric) => metric.metricName === metricName); + } + + protected onMetricSelectionChange(selected: string[]) { + this.selectedMetricNames = selected; + const selectedSet = new Set(selected); + + this.metricOptions.forEach((metric) => { + metric.selected = selectedSet.has(metric.metricName); + if (metric.selected && + (metric.threshold === undefined || metric.threshold === null)) { + metric.threshold = this.getDefaultThreshold(metric.metricValueInfo); + } + }); + + this.syncEvalMetricsFromOptions(); + } + + private syncEvalMetricsFromOptions() { + const selectedMetrics = + this.metricOptions.filter((metric) => metric.selected); + + this.evalMetrics = selectedMetrics.map((metric) => { + return { + metricName: metric.metricName, + threshold: metric.threshold, + ...(metric.criterion ? 
{criterion: metric.criterion} : {}), + } as EvalMetric; + }); + } + + private cloneSelectedMetricOptions(): MetricOption[] { + return this.metricOptions.filter((metric) => metric.selected) + .map((metric) => { + return {...metric}; + }); + } + + private applyUpdatedMetricOptions(updatedMetrics: MetricOption[]) { + const updatedMap = + new Map(updatedMetrics.map((metric) => [metric.metricName, metric])); + + this.metricOptions.forEach((metric) => { + const updated = updatedMap.get(metric.metricName); + if (updated) { + metric.threshold = updated.threshold; + metric.criterion = updated.criterion; + } + }); + + this.syncEvalMetricsFromOptions(); + this.changeDetectorRef.detectChanges(); + } + ngOnChanges(changes: SimpleChanges): void { if (changes['appName']) { this.selectedEvalSet = ''; this.evalCases = []; + this.loadMetricsInfo(); this.getEvalSet(); this.getEvaluationResult(); } @@ -232,12 +440,16 @@ export class EvalTabComponent implements OnInit, OnChanges { } runEval() { - this.evalRunning.set(true); if (this.selection.selected.length == 0) { alert('No case selected!'); - this.evalRunning.set(false); return; } + if (this.evalMetrics.length === 0) { + alert('No metric selected!'); + return; + } + + this.evalRunning.set(true); this.evalService .runEval( this.appName(), @@ -345,6 +557,7 @@ export class EvalTabComponent implements OnInit, OnChanges { private addEvalFieldsToBotEvent( event: any, invocationResult: any, failedMetric: string, score: number, threshold: number) { + event.metricResults = invocationResult.evalMetricResults ?? 
[]; event.failedMetric = failedMetric; event.evalScore = score; event.evalThreshold = threshold; @@ -553,19 +766,23 @@ export class EvalTabComponent implements OnInit, OnChanges { alert('No case selected!'); return; } + if (this.metricOptions.filter((metric) => metric.selected).length === 0) { + alert('No metric selected!'); + return; + } const dialogRef = this.dialog.open(RunEvalConfigDialogComponent, { maxWidth: '90vw', maxHeight: '90vh', data: { - evalMetrics: this.evalMetrics, + metrics: this.cloneSelectedMetricOptions(), }, }); - dialogRef.afterClosed().subscribe((evalMetrics) => { - if (!!evalMetrics) { - this.evalMetrics = evalMetrics; - + dialogRef.afterClosed().subscribe((updatedMetrics: MetricOption[]| + null|undefined) => { + if (!!updatedMetrics && updatedMetrics.length > 0) { + this.applyUpdatedMetricOptions(updatedMetrics); this.runEval(); } }); @@ -574,28 +791,41 @@ export class EvalTabComponent implements OnInit, OnChanges { protected getEvalMetrics(evalResult: any|undefined) { if (!evalResult || !evalResult.evaluationResults || !evalResult.evaluationResults.evaluationResults) { - return this.evalMetrics; + return this.metricOptions.filter((metric) => metric.selected) + .map((metric) => ({ + metricName: metric.metricName, + threshold: metric.threshold, + score: undefined, + evalStatus: undefined, + })); } const results = evalResult.evaluationResults.evaluationResults; if (results.length === 0) { - return this.evalMetrics; + return this.metricOptions.filter((metric) => metric.selected) + .map((metric) => ({ + metricName: metric.metricName, + threshold: metric.threshold, + score: undefined, + evalStatus: undefined, + })); } if (typeof results[0].overallEvalMetricResults === 'undefined' || !results[0].overallEvalMetricResults || results[0].overallEvalMetricResults.length === 0) { - return this.evalMetrics; + return this.metricOptions.filter((metric) => metric.selected) + .map((metric) => ({ + metricName: metric.metricName, + threshold: 
metric.threshold, + score: undefined, + evalStatus: undefined, + })); } const overallEvalMetricResults = results[0].overallEvalMetricResults; - return overallEvalMetricResults.map((result: any) => { - return { - metricName: result.metricName, - threshold: result.threshold, - }; - }); + return overallEvalMetricResults; } } From a47d1147a5300d7f49df5c69ef705fc6e32662e3 Mon Sep 17 00:00:00 2001 From: Aki-07 Date: Sat, 18 Oct 2025 14:49:44 +0530 Subject: [PATCH 3/4] feat(eval-dialog): configure thresholds for selected metrics --- .../run-eval-config-dialog.component.html | 65 ++++--- .../run-eval-config-dialog.component.scss | 55 ++++-- .../run-eval-config-dialog.component.spec.ts | 51 +++--- .../run-eval-config-dialog.component.ts | 163 +++++++++++++----- 4 files changed, 230 insertions(+), 104 deletions(-) diff --git a/src/app/components/eval-tab/run-eval-config-dialog/run-eval-config-dialog.component.html b/src/app/components/eval-tab/run-eval-config-dialog/run-eval-config-dialog.component.html index eb8623b1..637b0b04 100644 --- a/src/app/components/eval-tab/run-eval-config-dialog/run-eval-config-dialog.component.html +++ b/src/app/components/eval-tab/run-eval-config-dialog/run-eval-config-dialog.component.html @@ -14,34 +14,47 @@ limitations under the License. -->
-

EVALUATION METRIC

+

Evaluation Metrics

-
-
Tool trajectory avg score:
-
- - - - - - {{ evalForm.controls['tool_trajectory_avg_score_threshold'].value }} - -
-
- -
-
Response match score:
-
- - - - - - {{ evalForm.controls['response_match_score_threshold'].value }} - -
-
+ @if (metrics.length === 0) { +
No metrics available for this app.
+ } @else { + @for (metric of metrics; track metric.metricName) { +
+
+
{{ metric.metricName }}
+ @if (metric.description) { +
{{ metric.description }}
+ } + @if (formatRangeDescription(metric) || formatStepDescription(metric)) { +
+ {{ formatRangeDescription(metric) }} + @if (formatStepDescription(metric)) { + · {{ formatStepDescription(metric) }} + } +
+ } +
+
+ + Threshold + + + @if (hasError(metric)) { +
{{ getErrorMessage(metric) }}
+ } +
+
+ } + }
diff --git a/src/app/components/eval-tab/run-eval-config-dialog/run-eval-config-dialog.component.scss b/src/app/components/eval-tab/run-eval-config-dialog/run-eval-config-dialog.component.scss index 8a8b3af5..8f6988d0 100644 --- a/src/app/components/eval-tab/run-eval-config-dialog/run-eval-config-dialog.component.scss +++ b/src/app/components/eval-tab/run-eval-config-dialog/run-eval-config-dialog.component.scss @@ -1,34 +1,55 @@ .dialog-container { border-radius: 12px; padding: 18px; - width:500px; + width: 520px; box-shadow: 0 8px 16px var(--run-eval-config-dialog-container-box-shadow-color); } -.threshold-slider { - --mdc-slider-active-track-color: var(--run-eval-config-dialog-threshold-slider-active-track-color); - --mdc-slider-inactive-track-color: var(--run-eval-config-dialog-threshold-slider-inactive-track-color); - --mdc-slider-handle-color: var(--run-eval-config-dialog-threshold-slider-handle-color); - --mdc-slider-ripple-color: var(--run-eval-config-dialog-threshold-slider-ripple-color); - width: 100px +.eval-form { + display: flex; + flex-direction: column; + gap: 16px; } .metric-row { display: flex; flex-direction: row; - align-items: center; + gap: 16px; + align-items: flex-start; +} + +.metric-details { + flex: 1; + display: flex; + flex-direction: column; + gap: 4px; +} + +.metric-title { + font-weight: 500; } -.metric-name { - width: 250px; +.metric-description { + font-size: 13px; + color: var(--run-eval-config-dialog-metric-description-color, #9aa0a6); } -.threshold-value { - margin-left: 20px; +.metric-hints { + font-size: 12px; + color: var(--run-eval-config-dialog-metric-hints-color, #9aa0a6); } -.mdc-slider__thumb--with-indicator { - background-color: var(--mdc-slider-handle-color, var(--run-eval-config-dialog-mdc-slider-thumb-background-color)); - border: none !important; - box-shadow: none !important; -} \ No newline at end of file +.metric-input { + width: 180px; +} + +.metric-error { + margin-top: 4px; + font-size: 12px; + color: 
var(--run-eval-config-dialog-metric-error-color, #d93025); +} + +.no-metrics { + font-size: 14px; + color: var(--run-eval-config-dialog-no-metrics-color, #9aa0a6); +} diff --git a/src/app/components/eval-tab/run-eval-config-dialog/run-eval-config-dialog.component.spec.ts b/src/app/components/eval-tab/run-eval-config-dialog/run-eval-config-dialog.component.spec.ts index 7a9bbbf7..062ddb03 100644 --- a/src/app/components/eval-tab/run-eval-config-dialog/run-eval-config-dialog.component.spec.ts +++ b/src/app/components/eval-tab/run-eval-config-dialog/run-eval-config-dialog.component.spec.ts @@ -22,8 +22,6 @@ import { MatDialogModule, MatDialogRef, } from '@angular/material/dialog'; -import {MatRadioModule} from '@angular/material/radio'; -import {MatSliderModule} from '@angular/material/slider'; import {NoopAnimationsModule} from '@angular/platform-browser/animations'; @@ -42,8 +40,6 @@ describe('RunEvalConfigDialogComponent', () => { imports: [ ReactiveFormsModule, MatDialogModule, - MatRadioModule, - MatSliderModule, NoopAnimationsModule, RunEvalConfigDialogComponent, ], @@ -52,14 +48,24 @@ describe('RunEvalConfigDialogComponent', () => { { provide: MAT_DIALOG_DATA, useValue: { - evalMetrics: [ + metrics: [ { metricName: 'tool_trajectory_avg_score', threshold: 1, + metricValueInfo: { + minThreshold: 0, + maxThreshold: 1, + step: 0.1, + }, }, { metricName: 'response_match_score', threshold: 0.7, + metricValueInfo: { + minThreshold: 0, + maxThreshold: 1, + step: 0.1, + }, }, ], }, @@ -93,23 +99,26 @@ describe('RunEvalConfigDialogComponent', () => { expect(dialogRefSpy.close).toHaveBeenCalledWith(null); }); - it('should update threshold value when slider changes (simulated)', () => { - const toolTrajectoryAvgScoreSlider = component.evalForm.get( - 'tool_trajectory_avg_score_threshold' - )!; - const responseMatchScoreSlider = component.evalForm.get( - 'response_match_score_threshold' - )!; + it('should close dialog with updated thresholds on start', () => { + const 
toolControl = + component.evalForm.get('tool_trajectory_avg_score_threshold')!; + const responseControl = + component.evalForm.get('response_match_score_threshold')!; - toolTrajectoryAvgScoreSlider.setValue(0.4); // Simulate slider value change - responseMatchScoreSlider.setValue(0.5); // Simulate slider value change - fixture.detectChanges(); + toolControl.setValue(0.4); + responseControl.setValue(0.5); - expect(toolTrajectoryAvgScoreSlider.value).toBe(0.4); - expect(responseMatchScoreSlider.value).toBe(0.5); - const thresholdValueDisplays = - fixture.nativeElement.querySelectorAll('.threshold-value'); - expect(thresholdValueDisplays[0].textContent).toContain('0.4'); - expect(thresholdValueDisplays[1].textContent).toContain('0.5'); + component.onStart(); + + expect(dialogRefSpy.close).toHaveBeenCalledWith([ + jasmine.objectContaining({ + metricName: 'tool_trajectory_avg_score', + threshold: 0.4, + }), + jasmine.objectContaining({ + metricName: 'response_match_score', + threshold: 0.5, + }), + ]); }); }); diff --git a/src/app/components/eval-tab/run-eval-config-dialog/run-eval-config-dialog.component.ts b/src/app/components/eval-tab/run-eval-config-dialog/run-eval-config-dialog.component.ts index 94ef2199..5619e35c 100644 --- a/src/app/components/eval-tab/run-eval-config-dialog/run-eval-config-dialog.component.ts +++ b/src/app/components/eval-tab/run-eval-config-dialog/run-eval-config-dialog.component.ts @@ -16,13 +16,16 @@ */ import {Component, Inject} from '@angular/core'; -import { FormBuilder, FormGroup, Validators, FormsModule, ReactiveFormsModule } from '@angular/forms'; +import {FormBuilder, FormGroup, Validators, FormsModule, ReactiveFormsModule} from '@angular/forms'; import { MAT_DIALOG_DATA, MatDialogRef, MatDialogTitle, MatDialogContent, MatDialogActions } from '@angular/material/dialog'; -import {EvalMetric} from '../../../core/models/Eval'; +import {EvalMetricConfig} from '../../../core/models/Eval'; import { CdkScrollable } from 
'@angular/cdk/scrolling'; -import { MatSlider, MatSliderThumb } from '@angular/material/slider'; import { MatButton } from '@angular/material/button'; +import { MatFormField } from '@angular/material/form-field'; +import { MatLabel } from '@angular/material/form-field'; +import { MatInput } from '@angular/material/input'; +import { NgIf, NgFor } from '@angular/common'; /** * @interface EvalConfigData @@ -30,7 +33,7 @@ import { MatButton } from '@angular/material/button'; * evaluation metrics. */ export interface EvalConfigData { - evalMetrics: EvalMetric[]; + metrics: EvalMetricConfig[]; } @Component({ @@ -43,17 +46,21 @@ export interface EvalConfigData { MatDialogContent, FormsModule, ReactiveFormsModule, - MatSlider, - MatSliderThumb, MatDialogActions, MatButton, + MatFormField, + MatLabel, + MatInput, + NgIf, + NgFor, ], }) export class RunEvalConfigDialogComponent { // FormGroup to manage the dialog's form controls evalForm: FormGroup; - evalMetrics: EvalMetric[] = []; + metrics: EvalMetricConfig[] = []; + private controlNameByMetric = new Map(); /** * @constructor @@ -68,48 +75,124 @@ export class RunEvalConfigDialogComponent { public dialogRef: MatDialogRef, private fb: FormBuilder, @Inject(MAT_DIALOG_DATA) public data: EvalConfigData) { - this.evalMetrics = this.data.evalMetrics; - - // Initialize the form with controls and validators - this.evalForm = this.fb.group({ - tool_trajectory_avg_score_threshold: [ - this.getEvalMetricThresholdFromData('tool_trajectory_avg_score'), - [Validators.required, Validators.min(0), Validators.max(1)] - ], - response_match_score_threshold: [ - this.getEvalMetricThresholdFromData('response_match_score'), - [Validators.required, Validators.min(0), Validators.max(1)] - ] - }); + this.metrics = this.data.metrics ?? []; + + this.evalForm = this.fb.group({}); + this.initializeForm(); + } + + protected getControlName(metricName: string): string { + return this.controlNameByMetric.get(metricName) ?? 
''; + } + + protected getMin(metric: EvalMetricConfig): number|undefined { + return metric.metricValueInfo?.minThreshold; } - private getEvalMetricThresholdFromData(metricName: string): number { - return this.evalMetrics.find((metric) => metric.metricName === metricName) - ?.threshold ?? - 0; + protected getMax(metric: EvalMetricConfig): number|undefined { + return metric.metricValueInfo?.maxThreshold; + } + + protected getStep(metric: EvalMetricConfig): number|undefined { + return metric.metricValueInfo?.step; + } + + private initializeForm() { + for (const metric of this.metrics) { + const controlName = this.createControlName(metric.metricName); + this.controlNameByMetric.set(metric.metricName, controlName); + + const validators = [Validators.required]; + const min = this.getMin(metric); + if (min !== undefined) { + validators.push(Validators.min(min)); + } + const max = this.getMax(metric); + if (max !== undefined) { + validators.push(Validators.max(max)); + } + + this.evalForm.addControl(controlName, this.fb.control( + metric.threshold, + validators)); + } + } + + private createControlName(metricName: string): string { + const sanitized = metricName.replace(/[^a-zA-Z0-9]/g, '_'); + return `${sanitized}_threshold`; } onStart(): void { if (this.evalForm.valid) { - const { - tool_trajectory_avg_score_threshold, - response_match_score_threshold - } = this.evalForm.value; - - for (const metric of this.evalMetrics) { - if (metric.metricName === 'tool_trajectory_avg_score') { - metric.threshold = tool_trajectory_avg_score_threshold; - } else if (metric.metricName === 'response_match_score') { - metric.threshold = response_match_score_threshold; - } - } + this.metrics = this.metrics.map((metric) => { + const controlName = this.getControlName(metric.metricName); + const value = this.evalForm.get(controlName)?.value; + return { + ...metric, + threshold: Number(value), + }; + }); + + this.dialogRef.close(this.metrics); + + return; + } + + this.evalForm.markAllAsTouched(); 
+ } + + protected hasError(metric: EvalMetricConfig): boolean { + const control = this.evalForm.get(this.getControlName(metric.metricName)); + return !!control && control.invalid && (control.dirty || control.touched); + } - this.dialogRef.close(this.evalMetrics); + protected getErrorMessage(metric: EvalMetricConfig): string { + const control = this.evalForm.get(this.getControlName(metric.metricName)); + if (!control || !control.errors) { + return ''; } + if (control.errors['min']) { + const min = this.getMin(metric); + return `Minimum threshold is ${min}`; + } + if (control.errors['max']) { + const max = this.getMax(metric); + return `Maximum threshold is ${max}`; + } + if (control.errors['required']) { + return 'Threshold is required'; + } + return 'Invalid threshold'; } - onCancel(): void { - this.dialogRef.close( - null); // Return null or undefined to indicate cancellation + protected formatRangeDescription(metric: EvalMetricConfig): string { + const min = this.getMin(metric); + const max = this.getMax(metric); + if (min === undefined && max === undefined) { + return ''; + } + if (min !== undefined && max !== undefined) { + return `Range ${min} – ${max}`; + } + if (min !== undefined) { + return `≥ ${min}`; + } + if (max !== undefined) { + return `≤ ${max}`; + } + return ''; } + + protected formatStepDescription(metric: EvalMetricConfig): string { + const step = this.getStep(metric); + if (step === undefined) { + return ''; + } + return `Step ${step}`; + } + + onCancel(): void { + this.dialogRef.close(null); + } } From c8889793e967e62840bd16c782f6456a5aa0729f Mon Sep 17 00:00:00 2001 From: Aki-07 Date: Sat, 18 Oct 2025 14:50:00 +0530 Subject: [PATCH 4/4] feat(chat-panel): surface per-metric evaluation results --- .../chat-panel/chat-panel.component.html | 24 ++++++++++++++ .../chat-panel/chat-panel.component.scss | 33 +++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/src/app/components/chat-panel/chat-panel.component.html 
b/src/app/components/chat-panel/chat-panel.component.html index 97664029..e968992e 100644 --- a/src/app/components/chat-panel/chat-panel.component.html +++ b/src/app/components/chat-panel/chat-panel.component.html @@ -209,6 +209,30 @@ }
} + @if (message.metricResults?.length) { +
+ @for (metricResult of message.metricResults; track metricResult.metricName) { + + {{ metricResult.metricName }} + @if (metricResult.score !== undefined) { + Score: {{ metricResult.score }} + } + @if (metricResult.threshold !== undefined) { + Threshold: {{ metricResult.threshold }} + } + + } +
+ } } @if (message.functionCall) {