ServiceNow · pcsid · Dec 19, 2025 · nhhoang96 · Dec 24, 2025
diff --git a/evaluate.py b/evaluate.py
@@ -44,7 +44,7 @@ def main(cfg_path='config.yaml'):
         raise
 
     # 4. Load models and initialize central request controller
-    central_request_controller, model_configs = register_models_with_controller(run_config.get("models", []), run_config.get("judge_properties", {}))
+    central_request_controller, model_configs = register_models_with_controller(run_config.get("models", []), run_config.get("judge_settings", {}))
 
     # 5. Expand task-metric pairs
     task_payload = expand_task_metric_pairs(run_config, task_configs, task_ancestry)

diff --git a/sample_config.yaml b/sample_config.yaml
@@ -25,7 +25,7 @@ filter:
   num_samples: 100 # number of samples to run(remove for all)
   length_filter: [0.0, 30.0] #optional - filters for only audio samples in this length(seconds) - only supported for general and callhome preprocessors
 
-judge_properties:
+judge_settings:
   judge_concurrency: 8 #judge call(optional)
   judge_model: "gpt-4o-mini" #optional
   judge_type: "openai" # mandatory (vllm or openai)

diff --git a/tasks/phonetics/phonemes/voxangeles_articulation_manner.yaml b/tasks/phonetics/phonemes/voxangeles_articulation_manner.yaml
@@ -0,0 +1,21 @@
+task_name: voxangeles_articulation_manner
+dataset_path: DynamicSuperb/PhonologicalFeatureClassification_VoxAngeles-MannerOfArticulation  # presumably a Hub repo id in namespace/name form — verify against the dataset loader
+modality: audio
+language: en  # NOTE(review): presumably the instruction language, not necessarily the language of the audio — confirm
+split: test
+preprocessor: GeneralPreprocessor
+postprocessor: GeneralPostprocessor
+audio_column: audio  # the three *_column keys presumably name fields of each dataset row — confirm against the preprocessor
+target_column: label
+instruction_column: instruction
+user_prompt: "You are an audio model that is an expert at identifying phonemes and articulation manner. You always give your best attempt at trying to answer the question. The short audio clip is given to you after the following instructions. Always attempt to answer."
+
+long_audio_processing_logic: truncate  # NOTE(review): presumably clips over-length audio instead of chunking it — confirm
+
+generation_kwargs:
+  temperature: 0.0001  # near-zero rather than exactly 0; presumably for near-deterministic decoding — confirm
+  max_completion_tokens: 64  # presumably caps the length of the model's answer — confirm
+
+metrics:
+  - metric: llm_judge_binary  # both metrics presumably rely on the judge configured under judge_settings — confirm
+  - metric: detailed_judge_prompt
diff --git a/tasks/phonetics/phonemes/voxangeles_phoneme_classification.yaml b/tasks/phonetics/phonemes/voxangeles_phoneme_classification.yaml
@@ -0,0 +1,20 @@
+task_name: voxangeles_phoneme_classification
+dataset_path: DynamicSuperb/PhonologicalFeatureClassification_VoxAngeles-Phone  # presumably a Hub repo id in namespace/name form — verify against the dataset loader
+modality: audio
+language: en  # NOTE(review): presumably the instruction language, not necessarily the language of the audio — confirm
+split: test
+preprocessor: GeneralPreprocessor
+postprocessor: GeneralPostprocessor
+audio_column: audio  # the three *_column keys presumably name fields of each dataset row — confirm against the preprocessor
+target_column: label
+instruction_column: instruction
+user_prompt: "You are an audio model that is an expert at identifying phonemes. Listen to the given audio clip. Based on the following instructions, answer the question. Always attempt to answer."
+long_audio_processing_logic: truncate  # NOTE(review): presumably clips over-length audio instead of chunking it — confirm
+
+generation_kwargs:
+  temperature: 0.0001  # near-zero rather than exactly 0; presumably for near-deterministic decoding — confirm
+  max_completion_tokens: 64  # presumably caps the length of the model's answer — confirm
+
+metrics:
+  - metric: llm_judge_binary  # both metrics presumably rely on the judge configured under judge_settings — confirm
+  - metric: detailed_judge_prompt
diff --git a/tasks/phonetics/phonemes/voxangeles_phoneme_counting.yaml b/tasks/phonetics/phonemes/voxangeles_phoneme_counting.yaml
@@ -0,0 +1,20 @@
+task_name: voxangeles_phoneme_counting
+dataset_path: DynamicSuperb/PhoneSegmentCounting_VoxAngeles  # presumably a Hub repo id in namespace/name form — verify against the dataset loader
+modality: audio
+language: en  # NOTE(review): presumably the instruction language, not necessarily the language of the audio — confirm
+split: test
+preprocessor: GeneralPreprocessor
+postprocessor: GeneralPostprocessor
+audio_column: audio  # the three *_column keys presumably name fields of each dataset row — confirm against the preprocessor
+target_column: label
+instruction_column: instruction
+user_prompt: "You are an expert at counting phones in the context of phonemes, and always attempt to answer. You will be given an audio sample, listen carefully."
+long_audio_processing_logic: truncate  # NOTE(review): presumably clips over-length audio instead of chunking it — confirm
+
+generation_kwargs:
+  temperature: 0.0001  # near-zero rather than exactly 0; presumably for near-deterministic decoding — confirm
+  max_completion_tokens: 64  # presumably caps the length of the model's answer; a count should fit easily — confirm
+
+metrics:
+  - metric: llm_judge_binary  # both metrics presumably rely on the judge configured under judge_settings — confirm
+  - metric: detailed_judge_prompt
diff --git a/tasks/speech_disorder/voice_disorder/sep_28k.yaml b/tasks/speech_disorder/voice_disorder/sep_28k.yaml
@@ -0,0 +1,19 @@
+task_name: stuttering_detection  # NOTE(review): differs from the file name (sep_28k.yaml) — confirm which one task lookups use
+dataset_path: DynamicSuperb/StutteringDetection_SEP28k  # presumably a Hub repo id in namespace/name form — verify against the dataset loader
+modality: audio
+language: en
+split: test
+preprocessor: GeneralPreprocessor
+postprocessor: GeneralPostprocessor
+audio_column: audio  # the three *_column keys presumably name fields of each dataset row — confirm against the preprocessor
+target_column: label
+instruction_column: instruction  # no user_prompt key in this file; the per-row instruction is presumably the only prompt text — confirm
+long_audio_processing_logic: truncate  # NOTE(review): presumably clips over-length audio instead of chunking it — confirm
+
+generation_kwargs:
+  temperature: 0.0001  # near-zero rather than exactly 0; presumably for near-deterministic decoding — confirm
+  max_completion_tokens: 64  # presumably caps the length of the model's answer — confirm
+
+metrics:
+  - metric: llm_judge_binary  # both metrics presumably rely on the judge configured under judge_settings — confirm
+  - metric: detailed_judge_prompt
diff --git a/tasks/speech_enhancement/noise_detection/guassian_noise_detection.yaml b/tasks/speech_enhancement/noise_detection/guassian_noise_detection.yaml
@@ -0,0 +1,19 @@
+task_name: noise_detection  # NOTE(review): differs from the file name (guassian_noise_detection.yaml) — confirm which one task lookups use
+dataset_path: DynamicSuperb/NoiseDetection_LJSpeech_MUSAN-Gaussian  # namespace added to match the other DynamicSuperb task configs; a bare name would not point at the same Hub organisation
+modality: audio
+language: en
+split: test
+preprocessor: GeneralPreprocessor
+postprocessor: GeneralPostprocessor
+audio_column: audio  # the three *_column keys presumably name fields of each dataset row — confirm against the preprocessor
+target_column: label
+instruction_column: instruction  # no user_prompt key in this file; the per-row instruction is presumably the only prompt text — confirm
+long_audio_processing_logic: truncate  # NOTE(review): presumably clips over-length audio instead of chunking it — confirm
+
+generation_kwargs:
+  temperature: 0.0001  # near-zero rather than exactly 0; presumably for near-deterministic decoding — confirm
+  max_completion_tokens: 64  # presumably caps the length of the model's answer — confirm
+
+metrics:
+  - metric: llm_judge_binary  # both metrics presumably rely on the judge configured under judge_settings — confirm
+  - metric: detailed_judge_prompt
diff --git a/utils/util.py b/utils/util.py
@@ -85,12 +85,12 @@ def validate_config(config: dict, task_configs: dict[Path, list[dict]]) -> Dict:
                 raise ValueError("'filters' must be a dictionary")
             _validate_filter_values(config['filters'])
 
-        # Validate judge_properties as a dictionary
-        logger.info("---------Validating judge properties---------")
-        if 'judge_properties' in config:
-            if not isinstance(config['judge_properties'], dict):
-                raise ValueError("'judge_properties' must be a dictionary")
-            _validate_judge_properties(config['judge_properties'])
+        # Validate judge_settings as a dictionary
+        logger.info("---------Validating judge settings---------")
+        if 'judge_settings' in config:
+            if not isinstance(config['judge_settings'], dict):
+                raise ValueError("'judge_settings' must be a dictionary")
+            _validate_judge_settings(config['judge_settings'])
 
         # Delegate validation for complex sections
         logger.info("---------Validating models---------")
@@ -175,11 +175,11 @@ def _validate_filter_values(filters: Dict) -> None:
         raise ValueError("'language' must be a string")
 
 
-def _validate_judge_properties(judge_props: Dict) -> None:
-    """Validate the values in the judge_properties dictionary.
-    
+def _validate_judge_settings(judge_props: Dict) -> None:
+    """Validate the values in the judge_settings dictionary.
+
     Args:
-        judge_props: Dictionary of judge properties to validate
+        judge_props: Dictionary of judge settings to validate
 
     Raises:
         ValueError: If any judge property is invalid