142 changes: 142 additions & 0 deletions preprocessors/mmar_preprocessor.py
@@ -0,0 +1,142 @@
"""Reasoning-based preprocessor module for AU-Harness framework.

This module provides a preprocessor for audio benchmarks
from AudioLLMs and other HuggingFace datasets, with focus on support of MMAR/MMAU-PRO
where local audio files need to be downloaded, unzipped and loaded from LOCAL_DATA_DIR
when preprocessing. LOCAL_DATA_DIR needs to be set from environment (.env).
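
Example (illustrative paths): with LOCAL_DATA_DIR=/data and dataset_path
"BoJack/MMAR", a sample whose audio_path is "audio/xyz.wav" is read from
/data/mmar/audio/xyz.wav.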
"""

import logging
import os
from typing import Any, Dict, List

import soundfile as sf
from datasets import Dataset
from dotenv import load_dotenv
from scipy.signal import resample
from tqdm import tqdm

from preprocessors.base import Preprocessor


logger = logging.getLogger(__name__)

class MmarPreprocessor(Preprocessor):
"""Preprocessor for standard Audio benchmarks where output references are ALWAYS expected."""

def process(self, dataset: Dataset, task_config: Dict[str, Any],
run_config: Dict[str, Any]) -> List[Dict[str, Any]]:
"""Run pre-processing on standard/ general Audio datasets.

Args:
dataset: The task dataset to pre-process
task_config: Dictionary containing task configuration parameters
run_config: Dictionary containing run configuration parameters

Returns:
List of dictionaries where each dictionary represents a pre-processed sample
"""

        # Load LOCAL_DATA_DIR from the environment (populated from the predefined .env file)
load_dotenv()
local_data_dir = os.getenv("LOCAL_DATA_DIR")
dataset_name = task_config['dataset_path'].split('/')[-1].lower()

        # Extract column names and other common properties from the task config
category_name = task_config.get('category_name', 'speech')
audio_column_name = task_config.get('audio_column', None)
target_column_name = task_config.get('target_column', None)
choices_column_name = task_config.get('choices_column', None)
category_column_name = task_config.get('category_column', '')
sample_instruction_column_name = task_config.get('instruction_column', None)
user_query_column_name = task_config.get('textual_input_column', None)

# Obtain task-specific prompt (if provided)
user_prompt = task_config.get('user_prompt', '')

# Get dataset info
dataset_keys = list(dataset.features.keys())
dataset_size = len(dataset)
self.log_dataset_info(dataset_keys, dataset_size)

# Get dataset filters
length_filter, num_samples_filter = self.get_dataset_filters(run_config.get('filter', None), dataset_size)

processed_data = []
total_duration = 0
sample_count = 0

for i, row in enumerate(tqdm(dataset, desc="Processing samples")):
instruction = user_prompt
if (row[category_column_name] != category_name):
continue
# Create record by accessing each feature by index
record = {k: row[k] for k in dataset_keys}
audio_path = record[audio_column_name]
if (isinstance(audio_path, list)):
audio_path = audio_path[0]

            # Map the dataset audio path to a local path (e.g. $LOCAL_DATA_DIR/mmau-pro/data/xyz.wav)
local_audio_path = os.path.join(local_data_dir, dataset_name, audio_path)
audio_array, samplerate = sf.read(local_audio_path)

# Resample samples if not in 16kHz sampling rate
target_sr = 16000
if samplerate != target_sr:
num_samples = int(round(audio_array.shape[0] * target_sr / samplerate))
audio_array = resample(audio_array, num_samples)
samplerate = target_sr
record['array'] = audio_array
record['sampling_rate'] = samplerate

# Calculate audio duration in seconds
audio_duration = len(record["array"]) / record["sampling_rate"]
total_duration += audio_duration

# Apply dataset filtering
if (length_filter):
if not self.check_audio_length(record["array"], record["sampling_rate"], length_filter):
continue
if (num_samples_filter):
if sample_count >= num_samples_filter:
break

            # This preprocessor requires a reference target; implement a custom preprocessor otherwise.
            if target_column_name and target_column_name in record:
                record["model_target"] = record.get(target_column_name, None)
            else:
                raise ValueError("No valid target key found in record")

            # Add sample-specific instructions (e.g. the question) if they exist in the dataset
            if sample_instruction_column_name and sample_instruction_column_name in record:
                instruction += " " + record.get(sample_instruction_column_name, "")

            # Append the answer choices (if provided) and ask the model to pick one
            if choices_column_name and choices_column_name in record:
                choices = record.get(choices_column_name, [])
                instruction += " Select one option from the provided choices as the final answer:"
                if isinstance(choices, list):
                    choices_text = " ".join(choices)
                else:
                    choices_text = str(choices)
                instruction += "\n Choices: " + choices_text

            # Warn if no instruction is provided; an empty instruction can cause evaluated models to hallucinate.
            if not instruction:
                logger.warning("Instruction is empty for sample %d; add a user_prompt to the task config", i)
            record["instruction"] = instruction.strip()

            # Derive the judge type from the configured metric name
            # (e.g. "llm_judge_binary" -> "binary"); default to "detailed".
            metric_name = task_config.get('metrics')
            if 'judge' in metric_name:
                judge_type = metric_name.split('_')[-1]
                record['judge_type'] = judge_type
            else:
                record['judge_type'] = 'detailed'
processed_data.append(record)
sample_count += 1

self.log_dataset_info(dataset_keys, dataset_size, sample_count, total_duration)
return processed_data
1 change: 1 addition & 0 deletions requirements.txt
@@ -25,6 +25,7 @@ pydantic==2.10.5
tenacity==9.1.2
tqdm==4.67.1
setuptools==80.9.0
python-dotenv # Load environment variables from a .env file

pillow==11.1.0
logger==1.4
9 changes: 8 additions & 1 deletion tasks/spoken_language_reasoning/README.md
@@ -23,6 +23,11 @@ cd AU-Harness/
bash data/scripts/downnload_spider.sh
```

## MMAR / MMAU-PRO
As MMAR and MMAU-PRO require loading audio files from local paths, make sure you set `LOCAL_DATA_DIR=/path/to/data/storage/location` in your OS environment (e.g. via a `.env` file) before running evaluation.

The data preprocessing scripts download and unzip the audio archives from the corresponding HF datasets and store them under `LOCAL_DATA_DIR/${DATASET_NAME}/`, where `DATASET_NAME=[mmar|mmau-pro]`.
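
For example, a minimal `.env` (paths below are illustrative) and the resulting layout might look like this:

```bash
# .env in the directory you run the evaluation from
LOCAL_DATA_DIR=/data/au-harness   # where MMAR/MMAU-Pro audio archives are downloaded and unpacked
HF_TOKEN=...                      # optional; used for dataset loading when set

# After preprocessing, audio files live under:
#   /data/au-harness/mmar/...
#   /data/au-harness/mmau-pro/...
```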

## 📊 Supported Datasets for Spoken Language Reasoning

| Dataset Name | Task type | config | Description | License |
@@ -31,4 +36,6 @@ bash data/scripts/downnload_spider.sh
| **IFEVAL** | Speech Instruction Following | [spoken_language_reasoning/ifeval](./ifeval/base.yaml)| Speech-based complex instruction following dataset | Apache-2.0 |
| **BFCL** | Speech Function Calling | [spoken_language_reasoning/bfcl](./bfcl/base.yaml)| Speech-based complex function calling dataset with audio input | Apache-2.0 |
| **SPEECH_TO_SQL** | Speech-to-Coding | [spoken_language_reasoning/speech_to_sql](./speech_to_sql/base.yaml)| Speech-based dataset involving following instructions to produce executable code | Apache-2.0 |
| **GSM8k** | Grade School Math | [spoken_language_reasoning/gsm8k](./gsm8k/base.yaml)| Speech-based math dataset with grade school math word problems | MIT (text dataset) |
| **MMAR** | Audio Reasoning | [spoken_language_reasoning/mmar](./mmar/base.yaml)| Benchmark for evaluating deep reasoning capabilities of Audio-Language Models across multi-disciplinary tasks | CC-BY-NC-4.0 |
| **MMAU-PRO** | Audio Reasoning | [spoken_language_reasoning/mmau-pro](./mmau-pro/base.yaml)| Comprehensive benchmark for evaluating audio intelligence across perceptual and reasoning skills | CC-BY-NC-4.0 |
| **GSM8k** | Grade School Math | [spoken_language_reasoning/gsm8k](./gsm8k/base.yaml)| Speech-based math dataset with grade school math word problems | MIT (text dataset) |
22 changes: 22 additions & 0 deletions tasks/spoken_language_reasoning/mmar/base.yaml
@@ -0,0 +1,22 @@
# Base configuration for MMAR tasks
dataset_path: BoJack/MMAR
language: en
split: test
preprocessor: MmarPreprocessor
postprocessor: GeneralPostprocessor
audio_column: audio_path
instruction_column: question
target_column: answer
choices_column: choices
category_column: modality


user_prompt: Listen to the audio carefully and answer the presented question accordingly. Follow the options that are given in the question.
long_audio_processing_logic: truncate

generation_kwargs:
temperature: 0.2
max_completion_tokens: 1024

metrics:
- metric: llm_judge_binary
@@ -0,0 +1,3 @@
task_name: mmar_mix-music-speech
extends: ["./base.yaml#"]
category_name: mix-music-speech
@@ -0,0 +1,3 @@
task_name: mmar_mix-sound-music-speech
extends: ["./base.yaml#"]
category_name: mix-sound-music-speech
@@ -0,0 +1,3 @@
task_name: mmar_mix-sound-music
extends: ["./base.yaml#"]
category_name: mix-sound-music
@@ -0,0 +1,3 @@
task_name: mmar_mix-sound-speech
extends: ["./base.yaml#"]
category_name: mix-sound-speech
3 changes: 3 additions & 0 deletions tasks/spoken_language_reasoning/mmar/mmar_music.yaml
@@ -0,0 +1,3 @@
task_name: mmar_music
extends: ["./base.yaml#"]
category_name: music
3 changes: 3 additions & 0 deletions tasks/spoken_language_reasoning/mmar/mmar_sound.yaml
@@ -0,0 +1,3 @@
task_name: mmar_sound
extends: ["./base.yaml#"]
category_name: sound
3 changes: 3 additions & 0 deletions tasks/spoken_language_reasoning/mmar/mmar_speech.yaml
@@ -0,0 +1,3 @@
task_name: mmar_speech
extends: ["./base.yaml#"]
category_name: speech
21 changes: 21 additions & 0 deletions tasks/spoken_language_reasoning/mmau-pro/base.yaml
@@ -0,0 +1,21 @@
# Base configuration for MMAU-PRO tasks
dataset_path: gamma-lab-umd/MMAU-Pro
language: en
split: test
preprocessor: MmarPreprocessor
postprocessor: GeneralPostprocessor
audio_column: audio_path
instruction_column: question
target_column: answer
choices_column: choices
category_column: category

user_prompt: Listen to the audio carefully and answer the presented question accordingly. Follow the options that are given in the question.
long_audio_processing_logic: truncate

generation_kwargs:
temperature: 0.2
max_completion_tokens: 1024

metrics:
- metric: llm_judge_binary
@@ -0,0 +1,3 @@
task_name: mmau-pro_instruction_following
extends: ["./base.yaml#"]
category_name: instruction_following
3 changes: 3 additions & 0 deletions tasks/spoken_language_reasoning/mmau-pro/mmau-pro_multi.yaml
@@ -0,0 +1,3 @@
task_name: mmau-pro_multi
extends: ["./base.yaml#"]
category_name: multi
3 changes: 3 additions & 0 deletions tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music.yaml
@@ -0,0 +1,3 @@
task_name: mmau-pro_music
extends: ["./base.yaml#"]
category_name: music
@@ -0,0 +1,3 @@
task_name: mmau-pro_music_speech
extends: ["./base.yaml#"]
category_name: music_speech
3 changes: 3 additions & 0 deletions tasks/spoken_language_reasoning/mmau-pro/mmau-pro_open.yaml
@@ -0,0 +1,3 @@
task_name: mmau-pro_open
extends: ["./base.yaml#"]
category_name: open
3 changes: 3 additions & 0 deletions tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound.yaml
@@ -0,0 +1,3 @@
task_name: mmau-pro_sound
extends: ["./base.yaml#"]
category_name: sound
@@ -0,0 +1,3 @@
task_name: mmau-pro_sound_music
extends: ["./base.yaml#"]
category_name: sound_music
@@ -0,0 +1,3 @@
task_name: mmau-pro_sound_music_speech
extends: ["./base.yaml#"]
category_name: sound_music_speech
@@ -0,0 +1,3 @@
task_name: mmau-pro_sound_speech
extends: ["./base.yaml#"]
category_name: sound_speech
@@ -0,0 +1,3 @@
task_name: mmau-pro_spatial_audio
extends: ["./base.yaml#"]
category_name: spatial_audio
3 changes: 3 additions & 0 deletions tasks/spoken_language_reasoning/mmau-pro/mmau-pro_speech.yaml
@@ -0,0 +1,3 @@
task_name: mmau-pro_speech
extends: ["./base.yaml#"]
category_name: speech
@@ -0,0 +1,3 @@
task_name: mmau-pro_voice_chat
extends: ["./base.yaml#"]
category_name: voice_chat
36 changes: 36 additions & 0 deletions utils/data_utils.py
@@ -1,8 +1,12 @@
import os
from pathlib import Path
from datasets import load_dataset
from dotenv import load_dotenv
from utils.util import get_class_from_module
from huggingface_hub import hf_hub_download, HfApi
from . import util
import logging

logger = logging.getLogger(__name__)

@@ -36,8 +40,13 @@ def load_dataset_with_args(dataset_path: str, split: str, subset: str, task_name

if split is None:
raise ValueError(f'Dataset split is missing for task {task_name}')

# Load local environment file
load_dotenv()

    token = os.getenv("HF_TOKEN")
local_data_dir = os.getenv("LOCAL_DATA_DIR")
api = HfApi()

# Load dataset
try:
@@ -46,6 +55,33 @@ def load_dataset_with_args(dataset_path: str, split: str, subset: str, task_name
dataset_load_args["name"] = subset
if token:
dataset_load_args["token"] = token

        # Handle MMAR and MMAU-Pro separately: their audio archives must be downloaded
        # and extracted into LOCAL_DATA_DIR before the metadata is loaded.
        if 'MMAU-Pro' in dataset_path or 'MMAR' in dataset_path:
            if not local_data_dir:
                raise ValueError('LOCAL_DATA_DIR must be set (e.g. via .env) to store MMAR/MMAU-Pro audio data')
            data_name = dataset_path.split('/')[-1].lower()
            private_local_path = os.path.join(local_data_dir, data_name)
            os.makedirs(private_local_path, exist_ok=True)

            # Find all archive files in the dataset repository
            files_info = api.list_repo_files(repo_id=dataset_path, repo_type="dataset")
            archive_files = []
            for file_info in files_info:
                if file_info.endswith('.zip') or file_info.endswith('.tar.gz'):
                    archive_files.append(file_info)

            # Download and extract each archive into local_data_dir, skipping archives already present
            for archive_file in archive_files:
                desired_audio_storage_path = os.path.join(private_local_path, archive_file)
                if not os.path.exists(desired_audio_storage_path):
                    audio_data_dir = hf_hub_download(
                        repo_id=dataset_path,
                        filename=archive_file,
                        repo_type="dataset",
                        local_dir=private_local_path
                    )
                    util.extract_archive(audio_data_dir, private_local_path)
        dataset = load_dataset(**dataset_load_args)
except Exception as e:
raise ValueError(e)