diff --git a/evaluate.py b/evaluate.py index 8c4a0f7..73f5487 100644 --- a/evaluate.py +++ b/evaluate.py @@ -44,7 +44,7 @@ def main(cfg_path='config.yaml'): raise # 4. Load models and initialize central request controller - central_request_controller, model_configs = register_models_with_controller(run_config.get("models", []), run_config.get("judge_properties", {})) + central_request_controller, model_configs = register_models_with_controller(run_config.get("models", []), run_config.get("judge_settings", {})) # 5. Expand task-metric pairs task_payload = expand_task_metric_pairs(run_config, task_configs, task_ancestry) diff --git a/sample_config.yaml b/sample_config.yaml index e4c2cfe..16db45c 100644 --- a/sample_config.yaml +++ b/sample_config.yaml @@ -25,7 +25,7 @@ filter: num_samples: 100 # number of samples to run(remove for all) length_filter: [0.0, 30.0] #optional - filters for only audio samples in this length(seconds) - only supported for general and callhome preprocessors -judge_properties: +judge_settings: judge_concurrency: 8 #judge call(optional) judge_model: "gpt-4o-mini" #optional judge_type: "openai" # mandatory (vllm or openai) diff --git a/tasks/phonetics/phonemes/voxangeles_articulation_manner.yaml b/tasks/phonetics/phonemes/voxangeles_articulation_manner.yaml new file mode 100644 index 0000000..a3e83de --- /dev/null +++ b/tasks/phonetics/phonemes/voxangeles_articulation_manner.yaml @@ -0,0 +1,21 @@ +task_name: voxangeles_articulation_manner +dataset_path: DynamicSuperb/PhonologicalFeatureClassification_VoxAngeles-MannerOfArticulation +modality: audio +language: en +split: test +preprocessor: GeneralPreprocessor +postprocessor: GeneralPostprocessor +audio_column: audio +target_column: label +instruction_column: instruction +user_prompt: "You are an audio model that is an expert at identifying phonemes and articulation manner. You always give your best attempt at trying to answer the question. 
The short audio clip is given to you after the following instructions. Always attempt to answer." + +long_audio_processing_logic: truncate + +generation_kwargs: + temperature: 0.0001 + max_completion_tokens: 64 + +metrics: + - metric: llm_judge_binary + - metric: detailed_judge_prompt diff --git a/tasks/phonetics/phonemes/voxangeles_phoneme_classification.yaml b/tasks/phonetics/phonemes/voxangeles_phoneme_classification.yaml new file mode 100644 index 0000000..a4ddb74 --- /dev/null +++ b/tasks/phonetics/phonemes/voxangeles_phoneme_classification.yaml @@ -0,0 +1,20 @@ +task_name: voxangeles_phoneme_classification +dataset_path: DynamicSuperb/PhonologicalFeatureClassification_VoxAngeles-Phone +modality: audio +language: en +split: test +preprocessor: GeneralPreprocessor +postprocessor: GeneralPostprocessor +audio_column: audio +target_column: label +instruction_column: instruction +user_prompt: "You are an audio model that is an expert at identifying phonemes. Listen to the given audio clip. Based on the following instructions, answer the question. Always attempt to answer." +long_audio_processing_logic: truncate + +generation_kwargs: + temperature: 0.0001 + max_completion_tokens: 64 + +metrics: + - metric: llm_judge_binary + - metric: detailed_judge_prompt diff --git a/tasks/phonetics/phonemes/voxangeles_phoneme_counting.yaml b/tasks/phonetics/phonemes/voxangeles_phoneme_counting.yaml new file mode 100644 index 0000000..1309066 --- /dev/null +++ b/tasks/phonetics/phonemes/voxangeles_phoneme_counting.yaml @@ -0,0 +1,20 @@ +task_name: voxangeles_phoneme_counting +dataset_path: DynamicSuperb/PhoneSegmentCounting_VoxAngeles +modality: audio +language: en +split: test +preprocessor: GeneralPreprocessor +postprocessor: GeneralPostprocessor +audio_column: audio +target_column: label +instruction_column: instruction +user_prompt: "You are an expert at counting phones in the context of phonemes, and always attempt to answer. 
You will be given an audio sample, listen carefully." +long_audio_processing_logic: truncate + +generation_kwargs: + temperature: 0.0001 + max_completion_tokens: 64 + +metrics: + - metric: llm_judge_binary + - metric: detailed_judge_prompt diff --git a/tasks/speech_disorder/voice_disorder/sep_28k.yaml b/tasks/speech_disorder/voice_disorder/sep_28k.yaml new file mode 100644 index 0000000..f0fed09 --- /dev/null +++ b/tasks/speech_disorder/voice_disorder/sep_28k.yaml @@ -0,0 +1,19 @@ +task_name: stuttering_detection +dataset_path: DynamicSuperb/StutteringDetection_SEP28k +modality: audio +language: en +split: test +preprocessor: GeneralPreprocessor +postprocessor: GeneralPostprocessor +audio_column: audio +target_column: label +instruction_column: instruction +long_audio_processing_logic: truncate + +generation_kwargs: + temperature: 0.0001 + max_completion_tokens: 64 + +metrics: + - metric: llm_judge_binary + - metric: detailed_judge_prompt diff --git a/tasks/speech_enhancement/noise_detection/guassian_noise_detection.yaml b/tasks/speech_enhancement/noise_detection/guassian_noise_detection.yaml new file mode 100644 index 0000000..914833a --- /dev/null +++ b/tasks/speech_enhancement/noise_detection/guassian_noise_detection.yaml @@ -0,0 +1,19 @@ +task_name: noise_detection +dataset_path: DynamicSuperb/NoiseDetection_LJSpeech_MUSAN-Gaussian +modality: audio +language: en +split: test +preprocessor: GeneralPreprocessor +postprocessor: GeneralPostprocessor +audio_column: audio +target_column: label +instruction_column: instruction +long_audio_processing_logic: truncate + +generation_kwargs: + temperature: 0.0001 + max_completion_tokens: 64 + +metrics: + - metric: llm_judge_binary + - metric: detailed_judge_prompt diff --git a/utils/util.py b/utils/util.py index 4dbbe0d..ebdc398 100644 --- a/utils/util.py +++ b/utils/util.py @@ -85,12 +85,12 @@ def validate_config(config: dict, task_configs: dict[Path, list[dict]]) -> Dict: raise ValueError("'filters' must be a dictionary") 
_validate_filter_values(config['filters']) - # Validate judge_properties as a dictionary - logger.info("---------Validating judge properties---------") - if 'judge_properties' in config: - if not isinstance(config['judge_properties'], dict): - raise ValueError("'judge_properties' must be a dictionary") - _validate_judge_properties(config['judge_properties']) + # Validate judge_settings as a dictionary + logger.info("---------Validating judge settings---------") + if 'judge_settings' in config: + if not isinstance(config['judge_settings'], dict): + raise ValueError("'judge_settings' must be a dictionary") + _validate_judge_settings(config['judge_settings']) # Delegate validation for complex sections logger.info("---------Validating models---------") @@ -175,11 +175,11 @@ def _validate_filter_values(filters: Dict) -> None: raise ValueError("'language' must be a string") -def _validate_judge_properties(judge_props: Dict) -> None: - """Validate the values in the judge_properties dictionary. - +def _validate_judge_settings(judge_props: Dict) -> None: + """Validate the values in the judge_settings dictionary. + Args: - judge_props: Dictionary of judge properties to validate + judge_props: Dictionary of judge settings to validate Raises: ValueError: If any judge property is invalid