From de7a943e2047be16acba3d7799825b1cb72892d6 Mon Sep 17 00:00:00 2001 From: hoang Date: Sun, 21 Sep 2025 16:52:21 +0000 Subject: [PATCH 1/7] Adding MMAR and MMAU-Pro preprocessing --- preprocessors/mmar_preprocessor.py | 140 ++++++++++++++++++ .../accent_recognition/base.yaml | 2 +- .../spoken_language_reasoning/mmar/base.yaml | 22 +++ .../mmar/mmar_mix-music-speech.yaml | 3 + .../mmar/mmar_mix-sound.yaml | 3 + .../mmar/mmar_music.yaml | 3 + .../mmar/mmar_sound.yaml | 3 + .../mmar/mmar_speech.yaml | 3 + .../mmau-pro/base.yaml | 21 +++ .../mmau-pro_instruction_following.yaml | 3 + .../mmau-pro/mmau-pro_multi.yaml | 3 + .../mmau-pro/mmau-pro_music.yaml | 3 + .../mmau-pro/mmau-pro_music_speech.yaml | 3 + .../mmau-pro/mmau-pro_open.yaml | 3 + .../mmau-pro/mmau-pro_sound.yaml | 3 + .../mmau-pro/mmau-pro_sound_music.yaml | 3 + .../mmau-pro/mmau-pro_sound_music_speech.yaml | 3 + .../mmau-pro/mmau-pro_sound_speech.yaml | 3 + .../mmau-pro/mmau-pro_spatial_audio.yaml | 3 + .../mmau-pro/mmau-pro_speech.yaml | 3 + .../mmau-pro/mmau-pro_voice_chat.yaml | 3 + utils/data_utils.py | 36 +++++ utils/util.py | 53 +++++++ 23 files changed, 324 insertions(+), 1 deletion(-) create mode 100644 preprocessors/mmar_preprocessor.py create mode 100644 tasks/spoken_language_reasoning/mmar/base.yaml create mode 100644 tasks/spoken_language_reasoning/mmar/mmar_mix-music-speech.yaml create mode 100644 tasks/spoken_language_reasoning/mmar/mmar_mix-sound.yaml create mode 100644 tasks/spoken_language_reasoning/mmar/mmar_music.yaml create mode 100644 tasks/spoken_language_reasoning/mmar/mmar_sound.yaml create mode 100644 tasks/spoken_language_reasoning/mmar/mmar_speech.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/base.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_instruction_following.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_multi.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music_speech.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_open.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_music.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_music_speech.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_speech.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_spatial_audio.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_speech.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_voice_chat.yaml diff --git a/preprocessors/mmar_preprocessor.py b/preprocessors/mmar_preprocessor.py new file mode 100644 index 0000000..75a4509 --- /dev/null +++ b/preprocessors/mmar_preprocessor.py @@ -0,0 +1,140 @@ +"""General preprocessor module for AU-Harness framework. + +This module provides a general-purpose preprocessor for audio benchmarks +from AudioLLMs and other HuggingFace datasets, with support for various +modalities and filtering options. 
+""" + +import logging +from typing import Dict, List, Any + +import numpy as np +from tqdm import tqdm +from datasets import Dataset +from preprocessors.base import Preprocessor +from scipy.signal import resample +import soundfile as sf +from urllib.request import urlopen +import io +import os +from dotenv import load_dotenv +from pathlib import Path + + +logger = logging.getLogger(__name__) + +class MmarPreprocessor(Preprocessor): + """Preprocessor for standard Audio benchmarks where output references are ALWAYS expected.""" + + def process(self, dataset: Dataset, task_config: Dict[str, Any], + run_config: Dict[str, Any]) -> List[Dict[str, Any]]: + """Run pre-processing on standard/general Audio datasets. + + Args: + dataset: The task dataset to pre-process + task_config: Dictionary containing task configuration parameters + run_config: Dictionary containing run configuration parameters + + Returns: + List of dictionaries where each dictionary represents a pre-processed sample + """ + + # Load the local_data_dir saved in predefined .env file + load_dotenv() + local_data_dir = os.getenv("LOCAL_DATA_DIR") + dataset_name = task_config['dataset_path'].split('/')[-1].lower() + + # Extract common properties using base class method + category_name = task_config.get('category_name', 'speech') + audio_column_name = task_config.get('audio_column', None) + target_column_name = task_config.get('target_column', None) + choices_column_name = task_config.get('choice_column', None) + category_column_name = task_config.get('category_column', '') + sample_instruction_column_name = task_config.get('instruction_column', None) + user_query_column_name = task_config.get('textual_input_column', None) + + # Obtain task-specific prompt (if provided) + user_prompt = task_config.get('user_prompt', '') + + # Get dataset info + dataset_keys = list(dataset.features.keys()) + dataset_size = len(dataset) + self.log_dataset_info(dataset_keys, dataset_size) + + # Get dataset filters + length_filter, num_samples_filter = self.get_dataset_filters(run_config.get('filter', None), dataset_size) + + processed_data = [] + total_duration = 0 + sample_count = 0 + + for i, row in enumerate(tqdm(dataset, desc="Processing samples")): + instruction = user_prompt + if (row[category_column_name] != category_name): + continue + # Create record by accessing each feature by index + record = {k: row[k] for k in dataset_keys} + audio_path = record[audio_column_name] + if (isinstance(audio_path, list)): + audio_path = audio_path[0] + + # Map the audio path to a local audio path (sample: $HOME/mmau-pro/data/xyz.wav) + local_audio_path = os.path.join(local_data_dir, dataset_name, audio_path) + audio_array, samplerate = sf.read(local_audio_path) + + # Resample audio if it is not at the 16kHz sampling rate + target_sr = 16000 + if samplerate != target_sr: + num_samples = int(round(audio_array.shape[0] * target_sr / samplerate)) + audio_array = resample(audio_array, num_samples) + samplerate = target_sr + record['array'] = audio_array + record['sampling_rate'] = samplerate + + # Calculate audio duration in seconds + audio_duration = len(record["array"]) / record["sampling_rate"] + total_duration += audio_duration + + # Apply dataset filtering + if (length_filter): + if not self.check_audio_length(record["array"], record["sampling_rate"], length_filter): + continue + if (num_samples_filter): + if sample_count >= num_samples_filter: + break + + # This preprocessor requires a reference target. Otherwise, implement your own preprocessor.
+ if target_column_name and target_column_name in record: + record["model_target"] = record.get(target_column_name, None) + else: + raise ValueError("No valid target key found in record") + + # Add sample-specific instructions if they exist in the dataset + if sample_instruction_column_name and sample_instruction_column_name in record: + instruction += record.get(sample_instruction_column_name, "") + + # Append any user-specified prompt add-ons and choices + if choices_column_name and choices_column_name in record: + choices = record.get(choices_column_name, []) + if isinstance(choices, list): + choices_text = " ".join(choices) + else: + choices_text = str(choices) + instruction += "\n Choices: " + choices_text + + # Warning users if no instruction is provided. This can cause evaluated models to hallucinate. + if not instruction: + logger.warning("Instruction is empty for sample %d, add user_prompt for instruction insertion", i) + record["instruction"] = instruction.strip() + + metric_name = task_config.get('metrics') + if ('judge' in metric_name): + judge_type = metric_name.split('_')[-1] + record['judge_type'] = judge_type + else: + record['judge_type'] = 'detailed' + processed_data.append(record) + sample_count += 1 + + self.log_dataset_info(dataset_keys, dataset_size, sample_count, total_duration) + return processed_data diff --git a/tasks/paralinguistics/accent_recognition/base.yaml b/tasks/paralinguistics/accent_recognition/base.yaml index ab228e5..3ad7249 100644 --- a/tasks/paralinguistics/accent_recognition/base.yaml +++ b/tasks/paralinguistics/accent_recognition/base.yaml @@ -11,7 +11,7 @@ prompt: Please listen to the following audio clip and analyze the speaker's voic long_audio_processing_logic: truncate generation_kwargs: - temperature: 0.0001 + temperature: 0.5 max_completion_tokens: 64 metrics: diff --git a/tasks/spoken_language_reasoning/mmar/base.yaml b/tasks/spoken_language_reasoning/mmar/base.yaml new file mode 100644 index 0000000..7e429af --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/base.yaml @@ -0,0 +1,22 @@ +# Base configuration for VoiceBench IFEval tasks +dataset_path: BoJack/MMAR +language: en +split: test +preprocessor: MmarPreprocessor +postprocessor: GeneralPostprocessor +audio_column: audio_path +instruction_column: question +target_column: answer +choice_column: choices +category_column: modality + + +user_prompt: Listen to the audio carefully and answer the presented question accordingly. Follow the options that are given in the question. 
+long_audio_processing_logic: truncate + +generation_kwargs: + temperature: 0.2 + max_completion_tokens: 1024 + +metrics: + - metric: llm_judge_binary \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_mix-music-speech.yaml b/tasks/spoken_language_reasoning/mmar/mmar_mix-music-speech.yaml new file mode 100644 index 0000000..48c6dde --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_mix-music-speech.yaml @@ -0,0 +1,3 @@ +task_name: mmar_mix-music-speech +extends: ["./base.yaml#"] +category_name: mix-music-speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_mix-sound.yaml b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound.yaml new file mode 100644 index 0000000..c353622 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound.yaml @@ -0,0 +1,3 @@ +task_name: mmar_mix-sound +extends: ["./base.yaml#"] +category_name: mix-sound \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_music.yaml b/tasks/spoken_language_reasoning/mmar/mmar_music.yaml new file mode 100644 index 0000000..a348d95 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_music.yaml @@ -0,0 +1,3 @@ +task_name: mmar_music +extends: ["./base.yaml#"] +category_name: music \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_sound.yaml b/tasks/spoken_language_reasoning/mmar/mmar_sound.yaml new file mode 100644 index 0000000..e5fdbe4 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_sound.yaml @@ -0,0 +1,3 @@ +task_name: mmar_sound +extends: ["./base.yaml#"] +category_name: sound \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_speech.yaml b/tasks/spoken_language_reasoning/mmar/mmar_speech.yaml new file mode 100644 index 0000000..2bc7939 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_speech.yaml @@ -0,0 +1,3 @@ +task_name: mmar_speech +extends: ["./base.yaml#"] +category_name: speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/base.yaml b/tasks/spoken_language_reasoning/mmau-pro/base.yaml new file mode 100644 index 0000000..f884c1f --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/base.yaml @@ -0,0 +1,21 @@ +# Base configuration for VoiceBench IFEval tasks +dataset_path: gamma-lab-umd/MMAU-Pro +language: en +split: test +preprocessor: MmarPreprocessor +postprocessor: GeneralPostprocessor +audio_column: audio_path +instruction_column: question +target_column: answer +choice_column: choices +category_column: category + +user_prompt: Listen to the audio carefully and answer the presented question accordingly. Follow the options that are given in the question. 
+long_audio_processing_logic: truncate + +generation_kwargs: + temperature: 0.2 + max_completion_tokens: 1024 + +metrics: + - metric: llm_judge_binary \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_instruction_following.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_instruction_following.yaml new file mode 100644 index 0000000..e745120 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_instruction_following.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_sound_instruction_following +extends: ["./base.yaml#"] +category_name: instruction_following \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_multi.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_multi.yaml new file mode 100644 index 0000000..ebd55cb --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_multi.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_multi +extends: ["./base.yaml#"] +category_name: multi \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music.yaml new file mode 100644 index 0000000..8ec3ba1 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_music +extends: ["./base.yaml#"] +category_name: music \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music_speech.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music_speech.yaml new file mode 100644 index 0000000..09e3aab --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music_speech.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_music_speech +extends: ["./base.yaml#"] +category_name: music_speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_open.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_open.yaml new file mode 100644 index 0000000..81408df --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_open.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_open +extends: ["./base.yaml#"] +category_name: open \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound.yaml new file mode 100644 index 0000000..7fe62f3 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_sound +extends: ["./base.yaml#"] +category_name: sound \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_music.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_music.yaml new file mode 100644 index 0000000..9fb71c4 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_music.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_sound_music +extends: ["./base.yaml#"] +category_name: sound_music \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_music_speech.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_music_speech.yaml new file mode 100644 index 0000000..ea25c24 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_music_speech.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_sound_music_speech +extends: ["./base.yaml#"] +category_name: sound_music_speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_speech.yaml
b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_speech.yaml new file mode 100644 index 0000000..a44a71e --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_speech.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_sound_speech +extends: ["./base.yaml#"] +category_name: sound_speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_spatial_audio.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_spatial_audio.yaml new file mode 100644 index 0000000..2d6f79c --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_spatial_audio.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_spatial_audio +extends: ["./base.yaml#"] +category_name: spatial_audio \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_speech.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_speech.yaml new file mode 100644 index 0000000..416eead --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_speech.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_speech +extends: ["./base.yaml#"] +category_name: speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_voice_chat.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_voice_chat.yaml new file mode 100644 index 0000000..543a91c --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_voice_chat.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_voice_chat +extends: ["./base.yaml#"] +category_name: voice_chat \ No newline at end of file diff --git a/utils/data_utils.py b/utils/data_utils.py index 886a69a..0edc3e0 100644 --- a/utils/data_utils.py +++ b/utils/data_utils.py @@ -1,8 +1,12 @@ import os from pathlib import Path from datasets import load_dataset +from dotenv import load_dotenv from utils.util import get_class_from_module +from huggingface_hub import hf_hub_download, HfApi +from . 
import util import logging +import os logger = logging.getLogger(__name__) @@ -36,8 +40,13 @@ def load_dataset_with_args(dataset_path: str, split: str, subset: str, task_name if split is None: raise ValueError(f'Dataset split is missing for task {task_name}') + + # Load local environment file + load_dotenv() token=os.getenv("HF_TOKEN") + local_data_dir = os.getenv("LOCAL_DATA_DIR") + api = HfApi() # Load dataset try: @@ -46,6 +55,33 @@ def load_dataset_with_args(dataset_path: str, split: str, subset: str, task_name dataset_load_args["name"] = subset if token: dataset_load_args["token"] = token + + # Handle processing separately for MMAU-Pro and MMAR + if ('MMAU-Pro' in dataset_path or 'MMAR' in dataset_path): + data_name = dataset_path.split('/')[-1].lower() + private_local_path = os.path.join(local_data_dir, data_name) + if not os.path.exists(private_local_path): + os.makedirs(private_local_path, exist_ok=True) + + # Find all archive files + files_info = api.list_repo_files(repo_id=dataset_path, repo_type="dataset") + archive_files = [] + for file_info in files_info: + if (file_info.endswith('.zip') or file_info.endswith('.tar.gz')): + archive_files.append(file_info) + + # Download and extract all archive files into local_data_dir + for archive_file in archive_files: + archive_filename = archive_file.split('.')[0] # filename without extension + desired_audio_storage_path = os.path.join(private_local_path, archive_file) + if (not os.path.exists(desired_audio_storage_path)): + audio_data_dir = hf_hub_download( + repo_id=dataset_path, + filename=archive_file, + repo_type="dataset", + local_dir=private_local_path + ) + util.extract_archive(audio_data_dir, private_local_path) dataset = load_dataset(**dataset_load_args) except Exception as e: raise ValueError(e) diff --git a/utils/util.py b/utils/util.py index 4dbbe0d..4aa0d3e 100644 --- a/utils/util.py +++ b/utils/util.py @@ -7,6 +7,8 @@ import yaml from pathlib import Path from typing import Any, Dict +import tarfile +import zipfile from . import constants from utils.custom_logging import configure from utils.task_utils import _validate_task_metric_pairs, get_groups, get_tasks @@ -24,6 +26,57 @@ def get_class_from_module(module_prefix, module_name): logger.warning(f"Could not import {module_name} from {module_prefix}: {e}") return None +def extract_tar_gz(file_path, extract_path="."): + """ + Extracts a .tar.gz file to a specified path. + Args: + ---- + file_path: str: Path to the archive `.tar.gz` file. + extract_path: str: Directory to extract the contents to. + """ + try: + logger.info(f"Extracting tar.gz archive {file_path}") + with tarfile.open(file_path, "r:gz") as tar: + tar.extractall(path=extract_path) + logger.info(f"Successfully extracted {file_path} to {extract_path}") + except tarfile.ReadError as e: + logger.warning(f"Error reading tar.gz file: {e}") + except Exception as e: + logger.warning(f"An unexpected error occurred: {e}") + +def extract_zip(file_path, extract_path="."): + """ + Extracts a .zip file to a specified path. + Args: + ---- + file_path: str: Path to the archive `.zip` file. + extract_path: str: Directory to extract the contents to.
+ """ + try: + with zipfile.ZipFile(file_path, 'r') as zip_ref: + zip_ref.extractall(extract_path) + logger.info(f"Successfully extracted {file_path} to {extract_path}") + except zipfile.BadZipFile as e: + logger.warning(f"Error reading zip file: {e}") + except Exception as e: + logger.warning(f"An unexpected error occurred: {e}") + +def extract_archive(file_path, extract_path="."): + """ + Extracts either a .tar.gz or .zip file based on its extension. + + Args: + ---- + file_path: str: Path to the archive file. + extract_path: str: Directory to extract the contents to. + """ + if file_path.endswith(".tar.gz"): + extract_tar_gz(file_path, extract_path) + elif file_path.endswith(".zip"): + extract_zip(file_path, extract_path) + else: + logger.warning(f"Unsupported archive format for file: {file_path}") + def smart_round(val: float, precision: int = constants.ROUND_DIGITS) -> float: """Round off metrics to global precision value. From 97d4b1308be3a2d7024c4f1d45b9bf5de4cb8f9d Mon Sep 17 00:00:00 2001 From: hoang Date: Sun, 21 Sep 2025 16:54:55 +0000 Subject: [PATCH 2/7] Revert unrelated changes to MMAR --- tasks/paralinguistics/accent_recognition/base.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/paralinguistics/accent_recognition/base.yaml b/tasks/paralinguistics/accent_recognition/base.yaml index 3ad7249..ab228e5 100644 --- a/tasks/paralinguistics/accent_recognition/base.yaml +++ b/tasks/paralinguistics/accent_recognition/base.yaml @@ -11,7 +11,7 @@ prompt: Please listen to the following audio clip and analyze the speaker's voic long_audio_processing_logic: truncate generation_kwargs: - temperature: 0.5 + temperature: 0.0001 max_completion_tokens: 64 metrics: From b5d2579dc877d53a37d3feb18fa6f93a4105f612 Mon Sep 17 00:00:00 2001 From: hoang Date: Sun, 21 Sep 2025 17:07:40 +0000 Subject: [PATCH 3/7] Adding documentation for the added tasks --- tasks/spoken_language_reasoning/README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tasks/spoken_language_reasoning/README.md b/tasks/spoken_language_reasoning/README.md index e0b11cc..e8e8888 100644 --- a/tasks/spoken_language_reasoning/README.md +++ b/tasks/spoken_language_reasoning/README.md @@ -23,6 +23,11 @@ cd AU-Harness/ bash data/scripts/downnload_spider.sh ``` +## MMAR / MMAU-PRO +As MMAR and MMAU-PRO require loading audio files from local audio paths, make sure you set `LOCAL_DATA_DIR=/path/to/data/storage/location` in your OS environment (e.g. via a .env file) before running evaluation.
+ +Data preprocessing scripts will download the audio archives from the corresponding HF datasets and unzip them into `LOCAL_DATA_DIR/${DATASET_NAME}/`, where `DATASET_NAME=[mmar|mmau-pro]`. + ## 📊 Supported Datasets for Spoken Language Reasoning | Dataset Name | Task type | config | Description | License | @@ -30,4 +35,6 @@ | **MTBench** | Speech Instruction Following | [spoken_language_reasoning/mtbench](./mtbench/base.yaml)| Speech-based multi-turn complex instruction following dataset | Apache-2.0 | | **IFEVAL** | Speech Instruction Following | [spoken_language_reasoning/ifeval](./ifeval/base.yaml)| Speech-based complex instruction following dataset | Apache-2.0 | | **BFCL** | Speech Function Calling | [spoken_language_reasoning/bfcl](./bfcl/base.yaml)| Speech-based complex function calling dataset with audio input | Apache-2.0 | -| **SPEECH_TO_SQL** | Speech-to-Coding | [spoken_language_reasoning/speech_to_sql](./speech_to_sql/base.yaml)| Speech-based dataset involving following instructions to produce executable code | Apache-2.0 | \ No newline at end of file +| **SPEECH_TO_SQL** | Speech-to-Coding | [spoken_language_reasoning/speech_to_sql](./speech_to_sql/base.yaml)| Speech-based dataset involving following instructions to produce executable code | Apache-2.0 | +| **MMAR** | Audio Reasoning | [spoken_language_reasoning/mmar](./mmar/base.yaml)| Benchmark for evaluating deep reasoning capabilities of Audio-Language Models across multi-disciplinary tasks | CC-BY-NC-4.0 | +| **MMAU-PRO** | Audio Reasoning | [spoken_language_reasoning/mmau-pro](./mmau-pro/base.yaml)| Comprehensive benchmark for evaluating audio intelligence across perceptual and reasoning skills | CC-BY-NC-4.0 | \ No newline at end of file From 4e15e11da91327add1eeff3a37197ad4e2dddb07 Mon Sep 17 00:00:00 2001 From: hoang Date: Sun, 21 Sep 2025 17:31:07 +0000 Subject: [PATCH 4/7] Updating comment documentation --- tasks/spoken_language_reasoning/mmar/base.yaml | 2 +- tasks/spoken_language_reasoning/mmau-pro/base.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tasks/spoken_language_reasoning/mmar/base.yaml b/tasks/spoken_language_reasoning/mmar/base.yaml index 7e429af..f43855c 100644 --- a/tasks/spoken_language_reasoning/mmar/base.yaml +++ b/tasks/spoken_language_reasoning/mmar/base.yaml @@ -1,4 +1,4 @@ -# Base configuration for VoiceBench IFEval tasks +# Base configuration for MMAR tasks dataset_path: BoJack/MMAR language: en split: test diff --git a/tasks/spoken_language_reasoning/mmau-pro/base.yaml b/tasks/spoken_language_reasoning/mmau-pro/base.yaml index f884c1f..77e3a73 100644 --- a/tasks/spoken_language_reasoning/mmau-pro/base.yaml +++ b/tasks/spoken_language_reasoning/mmau-pro/base.yaml @@ -1,4 +1,4 @@ -# Base configuration for VoiceBench IFEval tasks +# Base configuration for MMAU-PRO tasks dataset_path: gamma-lab-umd/MMAU-Pro language: en split: test From 25f33ba0db05454aaa9376f4756ee2a8275173e3 Mon Sep 17 00:00:00 2001 From: hoang Date: Sun, 21 Sep 2025 17:36:47 +0000 Subject: [PATCH 5/7] Updating documentation clarification --- preprocessors/mmar_preprocessor.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/preprocessors/mmar_preprocessor.py b/preprocessors/mmar_preprocessor.py index 75a4509..cc2eee7 100644 --- a/preprocessors/mmar_preprocessor.py +++ b/preprocessors/mmar_preprocessor.py @@ -1,8 +1,9 @@ -"""General preprocessor module for AU-Harness framework.
+"""Reasoning-based preprocessor module for AU-Harness framework. -This module provides a general-purpose preprocessor for audio benchmarks -from AudioLLMs and other HuggingFace datasets, with support for various -modalities and filtering options. +This module provides a preprocessor for audio benchmarks +from AudioLLMs and other HuggingFace datasets, with a focus on MMAR/MMAU-PRO, +where audio files must be downloaded, unzipped and loaded from LOCAL_DATA_DIR +during preprocessing. LOCAL_DATA_DIR must be set in the environment (e.g. via a .env file). """ import logging From bd537dee85f089e07a038a455314d6e2a5a8279e Mon Sep 17 00:00:00 2001 From: hoang Date: Mon, 29 Sep 2025 17:51:11 +0000 Subject: [PATCH 6/7] Rectify the sub-task names and add dependency requirement package for dataset loading. --- requirements.txt | 1 + .../mmau-pro/mmau-pro_instruction_following.yaml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ba34d86..d2cd2a7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,6 +25,7 @@ pydantic==2.10.5 tenacity==9.1.2 tqdm==4.67.1 setuptools==80.9.0 +python-dotenv # Loading information from .env pillow==11.1.0 logger==1.4 diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_instruction_following.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_instruction_following.yaml index e745120..dac46ce 100644 --- a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_instruction_following.yaml +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_instruction_following.yaml @@ -1,3 +1,3 @@ -task_name: mmau-pro_sound_instruction_following +task_name: mmau-pro_instruction_following extends: ["./base.yaml#"] category_name: instruction_following \ No newline at end of file From a9c06acaee03c1b238ecd8c05b67baa8c0184118 Mon Sep 17 00:00:00 2001 From: hoang Date: Wed, 24 Dec 2025 08:50:28 +0000 Subject: [PATCH 7/7] Adding updated support for MMAR sub-modalities --- preprocessors/mmar_preprocessor.py | 1 + .../mmar/mmar_mix-sound-music-speech.yaml | 3 +++ tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music.yaml | 3 +++ .../spoken_language_reasoning/mmar/mmar_mix-sound-speech.yaml | 3 +++ tasks/spoken_language_reasoning/mmar/mmar_mix-sound.yaml | 3 --- 5 files changed, 10 insertions(+), 3 deletions(-) create mode 100644 tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music-speech.yaml create mode 100644 tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music.yaml create mode 100644 tasks/spoken_language_reasoning/mmar/mmar_mix-sound-speech.yaml delete mode 100644 tasks/spoken_language_reasoning/mmar/mmar_mix-sound.yaml diff --git a/preprocessors/mmar_preprocessor.py b/preprocessors/mmar_preprocessor.py index cc2eee7..637a0bf 100644 --- a/preprocessors/mmar_preprocessor.py +++ b/preprocessors/mmar_preprocessor.py @@ -117,6 +117,7 @@ def process(self, dataset: Dataset, task_config: Dict[str, Any], # Append any user-specified prompt add-ons and choices if choices_column_name and choices_column_name in record: choices = record.get(choices_column_name, []) + instruction += "\nSelect one option from the provided choices as the final answer:" if isinstance(choices, list): choices_text = " ".join(choices) else: diff --git a/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music-speech.yaml b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music-speech.yaml new file mode 100644 index 0000000..047ef7b --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music-speech.yaml @@ -0,0 +1,3 @@
+task_name: mmar_mix-sound-music-speech +extends: ["./base.yaml#"] +category_name: mix-sound-music-speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music.yaml b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music.yaml new file mode 100644 index 0000000..beea177 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music.yaml @@ -0,0 +1,3 @@ +task_name: mmar_mix-sound-music +extends: ["./base.yaml#"] +category_name: mix-sound-music \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-speech.yaml b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-speech.yaml new file mode 100644 index 0000000..ab9fc42 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-speech.yaml @@ -0,0 +1,3 @@ +task_name: mmar_mix-sound-speech +extends: ["./base.yaml#"] +category_name: mix-sound-speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_mix-sound.yaml b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound.yaml deleted file mode 100644 index c353622..0000000 --- a/tasks/spoken_language_reasoning/mmar/mmar_mix-sound.yaml +++ /dev/null @@ -1,3 +0,0 @@ -task_name: mmar_mix-sound -extends: ["./base.yaml#"] -category_name: mix-sound \ No newline at end of file
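For context, the sketch below is a minimal, standalone illustration of the data-staging flow these patches add in utils/data_utils.py and utils/util.py: list the archive files of the MMAR/MMAU-Pro HF datasets, download them, and extract them under `LOCAL_DATA_DIR/<dataset>/` so the preprocessor can resolve each sample's relative `audio_path`. It is not part of the patch series; the repo IDs, `LOCAL_DATA_DIR`/`HF_TOKEN` variables, and directory layout come from the diffs above, while the example paths and values are illustrative assumptions only.

```python
# Illustrative sketch (not part of the patch series): mirrors the staging flow
# added in utils/data_utils.py. Assumes a .env file at the repo root containing
# LOCAL_DATA_DIR=/data/au-harness (example path) and, optionally, HF_TOKEN.
import os
import tarfile
import zipfile

from dotenv import load_dotenv
from huggingface_hub import HfApi, hf_hub_download

load_dotenv()
local_data_dir = os.environ["LOCAL_DATA_DIR"]

repo_id = "BoJack/MMAR"  # or "gamma-lab-umd/MMAU-Pro"
dataset_dir = os.path.join(local_data_dir, repo_id.split("/")[-1].lower())
os.makedirs(dataset_dir, exist_ok=True)

api = HfApi()
for filename in api.list_repo_files(repo_id=repo_id, repo_type="dataset"):
    # Only the audio archives are staged locally; metadata stays on the Hub.
    if not (filename.endswith(".zip") or filename.endswith(".tar.gz")):
        continue
    archive_target = os.path.join(dataset_dir, filename)
    if os.path.exists(archive_target):
        continue  # already downloaded and extracted on a previous run
    archive_path = hf_hub_download(repo_id=repo_id, filename=filename,
                                   repo_type="dataset", local_dir=dataset_dir)
    if filename.endswith(".zip"):
        with zipfile.ZipFile(archive_path) as zf:
            zf.extractall(dataset_dir)
    else:
        with tarfile.open(archive_path, "r:gz") as tf:
            tf.extractall(dataset_dir)

# MmarPreprocessor then joins LOCAL_DATA_DIR/<dataset>/ with each sample's
# relative audio_path and resamples the audio to 16 kHz before evaluation.
```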