Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def main(cfg_path='config.yaml'):
raise

# 4. Load models and initialize central request controller
central_request_controller, model_configs = register_models_with_controller(run_config.get("models", []), run_config.get("judge_properties", {}))
central_request_controller, model_configs = register_models_with_controller(run_config.get("models", []), run_config.get("judge_settings", {}))

# 5. Expand task-metric pairs
task_payload = expand_task_metric_pairs(run_config, task_configs, task_ancestry)
Expand Down
2 changes: 1 addition & 1 deletion sample_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ filter:
num_samples: 100 # number of samples to run (remove this key to run all samples)
length_filter: [0.0, 30.0] # optional - keeps only audio samples whose length (in seconds) is within this range - only supported for the general and callhome preprocessors

judge_properties:
judge_settings:
judge_concurrency: 8 # optional - concurrency for judge calls
judge_model: "gpt-4o-mini" #optional
judge_type: "openai" # mandatory (vllm or openai)
Expand Down
21 changes: 21 additions & 0 deletions tasks/phonetics/phonemes/voxangeles_articulation_manner.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
task_name: voxangeles_articulation_manner
dataset_path: DynamicSuperb/PhonologicalFeatureClassification_VoxAngeles-MannerOfArticulation
modality: audio
language: en
split: test
preprocessor: GeneralPreprocessor
postprocessor: GeneralPostprocessor
audio_column: audio
target_column: label
instruction_column: instruction
user_prompt: "You are an audio model that is an expert at identifying phonemes and articulation manner. You always give your best attempt at trying to answer the question. The short audio clip is given to you after the following instructions. Always attempt to answer."

long_audio_processing_logic: truncate

generation_kwargs:
temperature: 0.0001
max_completion_tokens: 64

metrics:
- metric: llm_judge_binary
- metric: detailed_judge_prompt
20 changes: 20 additions & 0 deletions tasks/phonetics/phonemes/voxangeles_phoneme_classification.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
task_name: voxangeles_phoneme_classification
dataset_path: DynamicSuperb/PhonologicalFeatureClassification_VoxAngeles-Phone
modality: audio
language: en
split: test
preprocessor: GeneralPreprocessor
postprocessor: GeneralPostprocessor
audio_column: audio
target_column: label
instruction_column: instruction
user_prompt: "You are an audio model that is an expert at identifying phonemes. Listen to the given audio clip. Based on the following instructions, answer the question. Always attempt to answer."
long_audio_processing_logic: truncate

generation_kwargs:
temperature: 0.0001
max_completion_tokens: 64

metrics:
- metric: llm_judge_binary
- metric: detailed_judge_prompt
20 changes: 20 additions & 0 deletions tasks/phonetics/phonemes/voxangeles_phoneme_counting.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
task_name: voxangeles_phoneme_counting
dataset_path: DynamicSuperb/PhoneSegmentCounting_VoxAngeles
modality: audio
language: en
split: test
preprocessor: GeneralPreprocessor
postprocessor: GeneralPostprocessor
audio_column: audio
target_column: label
instruction_column: instruction
user_prompt: "You are an expert at counting phones in the context of phonemes, and always attempt to answer. You will be given an audio sample, listen carefully."
long_audio_processing_logic: truncate

generation_kwargs:
temperature: 0.0001
max_completion_tokens: 64

metrics:
- metric: llm_judge_binary
- metric: detailed_judge_prompt
19 changes: 19 additions & 0 deletions tasks/speech_disorder/voice_disorder/sep_28k.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
task_name: stuttering_detection
dataset_path: DynamicSuperb/StutteringDetection_SEP28k
modality: audio
language: en
split: test
preprocessor: GeneralPreprocessor
postprocessor: GeneralPostprocessor
audio_column: audio
target_column: label
instruction_column: instruction
long_audio_processing_logic: truncate

generation_kwargs:
temperature: 0.0001
max_completion_tokens: 64

metrics:
- metric: llm_judge_binary
- metric: detailed_judge_prompt
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
task_name: noise_detection
dataset_path: NoiseDetection_LJSpeech_MUSAN-Gaussian
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

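Make sure the dataset_path is the correct path from HF (Hugging Face). The other task configs in this change use paths with a `DynamicSuperb/` prefix, but this one has no prefix.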

modality: audio
language: en
split: test
preprocessor: GeneralPreprocessor
postprocessor: GeneralPostprocessor
audio_column: audio
target_column: label
instruction_column: instruction
long_audio_processing_logic: truncate

generation_kwargs:
temperature: 0.0001
max_completion_tokens: 64

metrics:
- metric: llm_judge_binary
- metric: detailed_judge_prompt
20 changes: 10 additions & 10 deletions utils/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,12 @@ def validate_config(config: dict, task_configs: dict[Path, list[dict]]) -> Dict:
raise ValueError("'filters' must be a dictionary")
_validate_filter_values(config['filters'])

# Validate judge_properties as a dictionary
logger.info("---------Validating judge properties---------")
if 'judge_properties' in config:
if not isinstance(config['judge_properties'], dict):
raise ValueError("'judge_properties' must be a dictionary")
_validate_judge_properties(config['judge_properties'])
# Validate judge_settings as a dictionary
logger.info("---------Validating judge settings---------")
if 'judge_settings' in config:
if not isinstance(config['judge_settings'], dict):
raise ValueError("'judge_settings' must be a dictionary")
_validate_judge_settings(config['judge_settings'])

# Delegate validation for complex sections
logger.info("---------Validating models---------")
Expand Down Expand Up @@ -175,11 +175,11 @@ def _validate_filter_values(filters: Dict) -> None:
raise ValueError("'language' must be a string")


def _validate_judge_properties(judge_props: Dict) -> None:
"""Validate the values in the judge_properties dictionary.
def _validate_judge_settings(judge_props: Dict) -> None:
"""Validate the values in the judge_settings dictionary.

Args:
judge_props: Dictionary of judge properties to validate
judge_props: Dictionary of judge settings to validate

Raises:
ValueError: If any judge setting is invalid
Expand Down