Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(js/plugins/checks): checks evaluator plugin returns multiple scores #1370

Merged
merged 11 commits into from
Dec 11, 2024
Merged
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ js/testapps/firebase-functions-sample1/.firebase
js/testapps/firebase-functions-sample1/.firebaserc
js/testapps/firebase-functions-sample1/public/bundle.js
js/testapps/firebase-functions-sample1/public/config.js
.genkit
js/**/.genkit
samples/**/.genkit
go/**/.genkit
Expand Down
24 changes: 24 additions & 0 deletions js/plugins/checks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,32 @@ Create a JSON file with the data you want to test. Add as many test cases as you
### Run the evaluators

```bash
# Run all configured classifiers.
genkit eval:run test-dataset.json --evaluators=checks/all_metrics

# Run just the DANGEROUS_CONTENT classifier.
genkit eval:run test-dataset.json --evaluators=checks/dangerous_content

# Run just the HARASSMENT classifier.
genkit eval:run test-dataset.json --evaluators=checks/harassment

# Run just the HATE_SPEECH classifier.
genkit eval:run test-dataset.json --evaluators=checks/hate_speech

# Run just the MEDICAL_INFO classifier.
genkit eval:run test-dataset.json --evaluators=checks/medical_info

# Run just the OBSCENITY_AND_PROFANITY classifier.
genkit eval:run test-dataset.json --evaluators=checks/obscenity_and_profanity

# Run just the PII_SOLICITING_RECITING classifier.
genkit eval:run test-dataset.json --evaluators=checks/pii_soliciting_reciting

# Run just the SEXUALLY_EXPLICIT classifier.
genkit eval:run test-dataset.json --evaluators=checks/sexually_explicit

# Run just the VIOLENCE_AND_GORE classifier.
genkit eval:run test-dataset.json --evaluators=checks/violence_and_gore
```

```bash
Expand Down
50 changes: 34 additions & 16 deletions js/plugins/checks/src/evaluation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,22 @@ export function checksEvaluators(
}
);

// Individual evaluators, one per configured metric.
const evaluators = policy_configs.map((policy_config) => {
return createPolicyEvaluator(projectId, auth, ai, policy_config);
return createPolicyEvaluator(
projectId,
auth,
ai,
[policy_config],
policy_config.type as string
);
});

// Single evaluator instance with all configured policies.
evaluators.push(
createPolicyEvaluator(projectId, auth, ai, policy_configs, 'all_metrics')
);

return evaluators;
}

Expand All @@ -104,15 +116,14 @@ function createPolicyEvaluator(
projectId: string,
auth: GoogleAuth,
ai: Genkit,
policy_config: ChecksEvaluationMetricConfig
policy_config: ChecksEvaluationMetricConfig[],
HunterHeston marked this conversation as resolved.
Show resolved Hide resolved
name: string
): EvaluatorAction {
const policyType = policy_config.type as string;

return ai.defineEvaluator(
{
name: `checks/${policyType.toLowerCase()}`,
displayName: policyType,
definition: `Evaluates text against the Checks ${policyType} policy.`,
name: `checks/${name.toLowerCase()}`,
displayName: name,
definition: `Evaluates text against the Checks ${name} policy.`,
},
async (datapoint: BaseEvalDataPoint) => {
const partialRequest = {
Expand All @@ -121,10 +132,12 @@ function createPolicyEvaluator(
content: datapoint.output as string,
},
},
policies: {
policy_type: policy_config.type,
threshold: policy_config.threshold,
},
policies: policy_config.map((config) => {
return {
policy_type: config.type,
threshold: config.threshold,
};
}),
};

const response = await checksEvalInstance(
Expand All @@ -134,13 +147,18 @@ function createPolicyEvaluator(
ResponseSchema
);

return {
evaluation: {
score: response.policyResults[0].score,
const evaluationResults = response.policyResults.map((result) => {
return {
id: result.policyType,
score: result.score,
details: {
reasoning: response.policyResults[0].violationResult,
reasoning: `Status ${result.violationResult}`,
},
},
};
});

return {
evaluation: evaluationResults,
testCaseId: datapoint.testCaseId,
};
}
Expand Down
Loading