"""Reasoning-based preprocessor module for AU-Harness framework.

This module provides a preprocessor for audio benchmarks
from AudioLLMs and other HuggingFace datasets, with focus on support of MMAR/MMAU-PRO
where local audio files need to be downloaded, unzipped and loaded from LOCAL_DATA_DIR
when preprocessing. LOCAL_DATA_DIR needs to be set from environment (.env).
"""

import io
import logging
import os
from pathlib import Path
from typing import Dict, List, Any
from urllib.request import urlopen

import numpy as np
import soundfile as sf
from datasets import Dataset
from dotenv import load_dotenv
from scipy.signal import resample
from tqdm import tqdm

from preprocessors.base import Preprocessor


logger = logging.getLogger(__name__)


class MmarPreprocessor(Preprocessor):
    """Preprocessor for benchmarks whose audio is read from LOCAL_DATA_DIR.

    Each dataset row carries a relative audio path; the audio itself is read
    from ``$LOCAL_DATA_DIR/<dataset name>/<audio path>``. A reference answer
    (``target_column``) is ALWAYS expected.
    """

    # Sampling rate (Hz) every clip is resampled to before it is stored.
    TARGET_SAMPLING_RATE = 16000

    def process(self, dataset: Dataset, task_config: Dict[str, Any],
                run_config: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Run pre-processing on a dataset whose audio files live on local disk.

        Args:
            dataset: The task dataset to pre-process
            task_config: Dictionary containing task configuration parameters
            run_config: Dictionary containing run configuration parameters

        Returns:
            List of dictionaries where each dictionary represents a pre-processed sample

        Raises:
            ValueError: If LOCAL_DATA_DIR is unset, if no audio column is
                configured, or if a kept sample has no target column.
        """
        # Load the local_data_dir saved in predefined .env file
        load_dotenv()
        local_data_dir = os.getenv("LOCAL_DATA_DIR")
        if not local_data_dir:
            # Fail with an actionable message instead of a TypeError from os.path.join(None, ...)
            raise ValueError("LOCAL_DATA_DIR is not set; define it in the environment or in a .env file")
        dataset_name = task_config['dataset_path'].split('/')[-1].lower()
        audio_root = os.path.join(local_data_dir, dataset_name)

        category_name = task_config.get('category_name', 'speech')
        audio_column_name = task_config.get('audio_column')
        if not audio_column_name:
            raise ValueError("No audio_column configured for this task")
        target_column_name = task_config.get('target_column')
        # The task YAMLs spell this key `choice_column`; accept both spellings.
        choices_column_name = task_config.get('choices_column') or task_config.get('choice_column')
        category_column_name = task_config.get('category_column')
        sample_instruction_column_name = task_config.get('instruction_column')

        # Obtain task-specific prompt (if provided)
        user_prompt = task_config.get('user_prompt') or ''

        # The judge type depends only on the task config, so resolve it once.
        judge_type = self._resolve_judge_type(task_config.get('metrics'))

        # Get dataset info
        dataset_keys = list(dataset.features.keys())
        dataset_size = len(dataset)
        self.log_dataset_info(dataset_keys, dataset_size)

        # Get dataset filters
        length_filter, num_samples_filter = self.get_dataset_filters(run_config.get('filter', None), dataset_size)

        processed_data = []
        total_duration = 0
        sample_count = 0

        for i, row in enumerate(tqdm(dataset, desc="Processing samples")):
            # Stop before touching the disk once enough samples were collected.
            if num_samples_filter and sample_count >= num_samples_filter:
                break

            # Category filtering only applies when a category column is configured.
            if category_column_name and row[category_column_name] != category_name:
                continue

            record = {k: row[k] for k in dataset_keys}

            # General processor requires reference. Otherwise, implement your own preprocessor.
            if not target_column_name or target_column_name not in record:
                raise ValueError("No valid target key found in record")
            record["model_target"] = record[target_column_name]

            audio_path = record[audio_column_name]
            if isinstance(audio_path, (list, tuple)):
                if not audio_path:
                    logger.warning("Sample %d has no audio path, skipping", i)
                    continue
                # NOTE(review): only the first clip of a multi-audio sample is used — confirm this is intended.
                audio_path = audio_path[0]

            # Mapping audio path to local audio path (sample: $HOME/mmau-pro/data/xyz.wav)
            audio_array, samplerate = self._load_audio(os.path.join(audio_root, audio_path))
            record['array'] = audio_array
            record['sampling_rate'] = samplerate

            # Apply dataset filtering
            if length_filter and not self.check_audio_length(audio_array, samplerate, length_filter):
                continue

            sample_instruction = None
            if sample_instruction_column_name and sample_instruction_column_name in record:
                sample_instruction = record[sample_instruction_column_name]
            choices = None
            if choices_column_name and choices_column_name in record:
                choices = record[choices_column_name]
            instruction = self._build_instruction(user_prompt, sample_instruction, choices)

            # Warning users if no instruction is provided. This can cause evaluated models to hallucinate.
            if not instruction:
                logger.warning("Instruction is empty for sample %d, add user_prompt for instruction insertion", i)
            record["instruction"] = instruction
            record['judge_type'] = judge_type

            processed_data.append(record)
            sample_count += 1
            # Only samples that are actually kept contribute to the reported duration.
            total_duration += len(audio_array) / samplerate

        self.log_dataset_info(dataset_keys, dataset_size, sample_count, total_duration)
        return processed_data

    def _load_audio(self, local_audio_path: str):
        """Read one audio file and resample it to TARGET_SAMPLING_RATE if needed.

        Returns:
            Tuple ``(audio_array, sampling_rate)``.
        """
        # NOTE(review): multi-channel files keep their channel axis here — confirm downstream expects that.
        audio_array, samplerate = sf.read(local_audio_path)
        target_sr = self.TARGET_SAMPLING_RATE
        if samplerate != target_sr:
            num_samples = int(round(audio_array.shape[0] * target_sr / samplerate))
            audio_array = resample(audio_array, num_samples)
            samplerate = target_sr
        return audio_array, samplerate

    @staticmethod
    def _build_instruction(user_prompt: Any, sample_instruction: Any, choices: Any) -> str:
        """Assemble the prompt from the task prompt, the per-sample question and the choices.

        Parts are joined with newlines so they do not run into each other, and
        the "select one option" sentence is only added when choices exist.
        """
        parts = []
        for piece in (user_prompt, sample_instruction):
            text = str(piece).strip() if piece is not None else ""
            if text:
                parts.append(text)
        if choices:
            if isinstance(choices, (list, tuple)):
                # One option per line keeps multi-word options unambiguous.
                choices_text = "\n".join(f"- {choice}" for choice in choices)
            else:
                choices_text = str(choices)
            parts.append(
                "Select one option from the provided choices as the final answer:\nChoices:\n" + choices_text
            )
        return "\n".join(parts).strip()

    @staticmethod
    def _resolve_judge_type(metric_name: Any) -> str:
        """Return the judge type encoded as the suffix of a ``*judge*_<type>`` metric name.

        Falls back to ``'detailed'`` when the metric is missing, is not a
        string, or does not mention a judge.
        """
        if isinstance(metric_name, str) and 'judge' in metric_name:
            return metric_name.split('_')[-1]
        return 'detailed'
+ +Data preprocessing scripts will download and unzip audios stored from corresponding HF datasets and store them within the `LOCAL_DATA_DIR/${DATASET_NAME}/` where `DATASET_NAME=[mmar|mmau-pro]` + ## 📊 Supported Datasets for Spoken Language Reasoning | Dataset Name | Task type | config | Description | License | @@ -31,4 +36,6 @@ bash data/scripts/downnload_spider.sh | **IFEVAL** | Speech Instruction Following | [spoken_language_reasoning/ifeval](./ifeval/base.yaml)| Speech-based complex instruction following dataset | Apache-2.0 | | **BFCL** | Speech Function Calling | [spoken_language_reasoning/bfcl](./bfcl/base.yaml)| Speech-based complex function calling dataset with audio input | Apache-2.0 | | **SPEECH_TO_SQL** | Speech-to-Coding | [spoken_language_reasoning/speech_to_sql](./speech_to_sql/base.yaml)| Speech-based dataset involving following instructions to produce executable code | Apache-2.0 | -| **GSM8k** | Grade School Math | [spoken_language_reasoning/gsm8k](./gsm8k/base.yaml)| Speech-based math dataset with grade school math word problems | MIT (text dataset) | \ No newline at end of file +| **MMAR** | Speech-to-Coding | [spoken_language_reasoning/mmar](./mmar/base.yaml)| Benchmark for evaluating deep reasoning capabilities of Audio-Language Models across multi-disciplinary tasks | CC-BY-NC-4.0 | +| **MMAU-PRO** | Speech-to-Coding | [spoken_language_reasoning/mmau-pro](./mmau-pro/base.yaml)| Comprehensive benchmark for evaluating audio intelligence across perceptual and reasoning skills | CC-BY-NC-4.0 | +| **GSM8k** | Grade School Math | [spoken_language_reasoning/gsm8k](./gsm8k/base.yaml)| Speech-based math dataset with grade school math word problems | MIT (text dataset) | diff --git a/tasks/spoken_language_reasoning/mmar/base.yaml b/tasks/spoken_language_reasoning/mmar/base.yaml new file mode 100644 index 0000000..f43855c --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/base.yaml @@ -0,0 +1,22 @@ +# Base configuration for MMAR tasks 
+dataset_path: BoJack/MMAR +language: en +split: test +preprocessor: MmarPreprocessor +postprocessor: GeneralPostprocessor +audio_column: audio_path +instruction_column: question +target_column: answer +choice_column: choices +category_column: modality + + +user_prompt: Listen to the audio carefully and answer the presented question accordingly. Follow the options that are given in the question. +long_audio_processing_logic: truncate + +generation_kwargs: + temperature: 0.2 + max_completion_tokens: 1024 + +metrics: + - metric: llm_judge_binary \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_mix-music-speech.yaml b/tasks/spoken_language_reasoning/mmar/mmar_mix-music-speech.yaml new file mode 100644 index 0000000..48c6dde --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_mix-music-speech.yaml @@ -0,0 +1,3 @@ +task_name: mmar_mix-music-speech +extends: ["./base.yaml#"] +category_name: mix-music-speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music-speech.yaml b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music-speech.yaml new file mode 100644 index 0000000..047ef7b --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music-speech.yaml @@ -0,0 +1,3 @@ +task_name: mmar_mix-sound-music-speech +extends: ["./base.yaml#"] +category_name: mix-sound-music-speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music.yaml b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music.yaml new file mode 100644 index 0000000..beea177 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music.yaml @@ -0,0 +1,3 @@ +task_name: mmar_mix-sound-music +extends: ["./base.yaml#"] +category_name: mix-sound-music \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-speech.yaml b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-speech.yaml new file mode 100644 index 
0000000..ab9fc42 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-speech.yaml @@ -0,0 +1,3 @@ +task_name: mmar_mix-sound-speech +extends: ["./base.yaml#"] +category_name: mix-sound-speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_music.yaml b/tasks/spoken_language_reasoning/mmar/mmar_music.yaml new file mode 100644 index 0000000..a348d95 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_music.yaml @@ -0,0 +1,3 @@ +task_name: mmar_music +extends: ["./base.yaml#"] +category_name: music \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_sound.yaml b/tasks/spoken_language_reasoning/mmar/mmar_sound.yaml new file mode 100644 index 0000000..e5fdbe4 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_sound.yaml @@ -0,0 +1,3 @@ +task_name: mmar_sound +extends: ["./base.yaml#"] +category_name: sound \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_speech.yaml b/tasks/spoken_language_reasoning/mmar/mmar_speech.yaml new file mode 100644 index 0000000..2bc7939 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_speech.yaml @@ -0,0 +1,3 @@ +task_name: mmar_speech +extends: ["./base.yaml#"] +category_name: speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/base.yaml b/tasks/spoken_language_reasoning/mmau-pro/base.yaml new file mode 100644 index 0000000..77e3a73 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/base.yaml @@ -0,0 +1,21 @@ +# Base configuration for MMAU-PRO tasks +dataset_path: gamma-lab-umd/MMAU-Pro +language: en +split: test +preprocessor: MmarPreprocessor +postprocessor: GeneralPostprocessor +audio_column: audio_path +instruction_column: question +target_column: answer +choice_column: choices +category_column: category + +user_prompt: Listen to the audio carefully and answer the presented question accordingly. 
Follow the options that are given in the question. +long_audio_processing_logic: truncate + +generation_kwargs: + temperature: 0.2 + max_completion_tokens: 1024 + +metrics: + - metric: llm_judge_binary \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_instruction_following.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_instruction_following.yaml new file mode 100644 index 0000000..dac46ce --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_instruction_following.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_instruction_following +extends: ["./base.yaml#"] +category_name: instruction_following \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_multi.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_multi.yaml new file mode 100644 index 0000000..ebd55cb --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_multi.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_multi +extends: ["./base.yaml#"] +category_name: multi \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music.yaml new file mode 100644 index 0000000..8ec3ba1 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_music +extends: ["./base.yaml#"] +category_name: music \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music_speech.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music_speech.yaml new file mode 100644 index 0000000..09e3aab --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music_speech.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_music_speech +extends: ["./base.yaml#"] +category_name: music_speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_open.yaml 
b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_open.yaml new file mode 100644 index 0000000..81408df --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_open.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_open +extends: ["./base.yaml#"] +category_name: open \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound.yaml new file mode 100644 index 0000000..7fe62f3 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_sound +extends: ["./base.yaml#"] +category_name: sound \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_music.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_music.yaml new file mode 100644 index 0000000..9fb71c4 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_music.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_sound_music +extends: ["./base.yaml#"] +category_name: sound_music \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_music_speech.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_music_speech.yaml new file mode 100644 index 0000000..ea25c24 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_music_speech.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_sound_music_speech +extends: ["./base.yaml#"] +category_name: sound_music_speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_speech.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_speech.yaml new file mode 100644 index 0000000..a44a71e --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_speech.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_sound_speech +extends: ["./base.yaml#"] +category_name: sound_speech \ No newline at end of file diff --git 
a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_spatial_audio.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_spatial_audio.yaml new file mode 100644 index 0000000..2d6f79c --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_spatial_audio.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_spatial_audio +extends: ["./base.yaml#"] +category_name: spatial_audio \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_speech.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_speech.yaml new file mode 100644 index 0000000..416eead --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_speech.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_speech +extends: ["./base.yaml#"] +category_name: speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_voice_chat.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_voice_chat.yaml new file mode 100644 index 0000000..543a91c --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_voice_chat.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_voice_chat +extends: ["./base.yaml#"] +category_name: voice_chat \ No newline at end of file diff --git a/utils/data_utils.py b/utils/data_utils.py index 886a69a..0edc3e0 100644 --- a/utils/data_utils.py +++ b/utils/data_utils.py @@ -1,8 +1,12 @@ import os from pathlib import Path from datasets import load_dataset +from dotenv import load_dotenv from utils.util import get_class_from_module +from huggingface_hub import hf_hub_download, HfApi +from . 
import util import logging +import os logger = logging.getLogger(__name__) @@ -36,8 +40,13 @@ def load_dataset_with_args(dataset_path: str, split: str, subset: str, task_name if split is None: raise ValueError(f'Dataset split is missing for task {task_name}') + + # Load local environment file + load_dotenv() token=os.getenv("HF_TOKEN") + local_data_dir = os.getenv("LOCAL_DATA_DIR") + api = HfApi() # Load dataset try: @@ -46,6 +55,33 @@ def load_dataset_with_args(dataset_path: str, split: str, subset: str, task_name dataset_load_args["name"] = subset if token: dataset_load_args["token"] = token + + # Handle processing separately for MMAU-Pro and MMAR + if ('MMAU-Pro' in dataset_path or 'MMAR' in dataset_path): + data_name = dataset_path.split('/')[-1].lower() + private_local_path = os.path.join(local_data_dir, data_name) + if not os.path.exists(private_local_path): + os.mkdir(private_local_path) + + # Find all archive files + files_info = api.list_repo_files(repo_id=dataset_path, repo_type="dataset") + archive_files = [] + for file_info in files_info: + if (file_info.endswith('.zip') or file_info.endswith('.tar.gz')): + archive_files.append(file_info) + + # Download, unzip and store all zip files into local_data_dir + for archive_file in archive_files: + archive_filename = archive_file.split('.')[0] # filename without .zip + desired_audio_storge_path = os.path.join(private_local_path, archive_file) + if (not os.path.exists(desired_audio_storge_path)): + audio_data_dir = hf_hub_download( + repo_id=dataset_path, + filename=archive_file, + repo_type="dataset", + local_dir = private_local_path + ) + util.extract_archive(audio_data_dir, private_local_path) dataset = load_dataset(**dataset_load_args) except Exception as e: raise ValueError(e) diff --git a/utils/util.py b/utils/util.py index 4dbbe0d..4aa0d3e 100644 --- a/utils/util.py +++ b/utils/util.py @@ -7,6 +7,8 @@ import yaml from pathlib import Path from typing import Any, Dict +import tarfile +import zipfile from . 
def _is_within_directory(root, member_name):
    """Return True when *member_name*, joined onto *root*, stays inside *root*.

    *root* must already be a resolved ``Path``.
    """
    try:
        (root / member_name).resolve().relative_to(root)
    except ValueError:
        return False
    return True


def extract_tar_gz(file_path, extract_path="."):
    """
    Extracts a .tar.gz file to a specified path.

    Failures are logged and swallowed (best effort); nothing is raised.

    Args:
    ----
        file_path: str: Path to the archive `.tar.gz` file.
        extract_path: str: Directory to extract the contents to.
    """
    try:
        with tarfile.open(file_path, "r:gz") as tar:
            if hasattr(tarfile, "data_filter"):
                # Extraction filters reject members that would escape the destination.
                tar.extractall(path=extract_path, filter="data")
            else:
                # Older interpreters without extraction filters: check member paths by hand.
                root = Path(extract_path).resolve()
                unsafe = [m.name for m in tar.getmembers() if not _is_within_directory(root, m.name)]
                if unsafe:
                    raise tarfile.TarError(f"archive members escape {extract_path}: {unsafe[:3]}")
                tar.extractall(path=extract_path)
        logger.info("Successfully extracted %s to %s", file_path, extract_path)
    except tarfile.ReadError as e:
        logger.warning("Error reading tar.gz file %s: %s", file_path, e)
    except Exception as e:
        logger.warning("An unexpected error occurred while extracting %s: %s", file_path, e)


def extract_zip(file_path, extract_path="."):
    """
    Extracts a .zip file to a specified path.

    Failures are logged and swallowed (best effort); nothing is raised.

    Args:
    ----
        file_path: str: Path to the archive `.zip` file.
        extract_path: str: Directory to extract the contents to.
    """
    try:
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
        logger.info("Successfully extracted %s to %s", file_path, extract_path)
    except zipfile.BadZipFile as e:
        logger.warning("Error reading zip file %s: %s", file_path, e)
    except Exception as e:
        logger.warning("An unexpected error occurred while extracting %s: %s", file_path, e)


def extract_archive(file_path, extract_path="."):
    """
    Extracts either a .tar.gz (or .tgz) or .zip file based on its extension.

    The extension check is case-insensitive. Unsupported extensions are
    logged and ignored.

    Args:
    ----
        file_path: str: Path to the archive file.
        extract_path: str: Directory to extract the contents to.
    """
    lowered = str(file_path).lower()
    if lowered.endswith((".tar.gz", ".tgz")):
        extract_tar_gz(file_path, extract_path)
    elif lowered.endswith(".zip"):
        extract_zip(file_path, extract_path)
    else:
        logger.warning("Unsupported archive format for file: %s", file_path)