142 changes: 142 additions & 0 deletions preprocessors/mmar_preprocessor.py
@@ -0,0 +1,142 @@
"""Reasoning-based preprocessor module for AU-Harness framework.

This module provides a preprocessor for audio benchmarks
from AudioLLMs and other HuggingFace datasets, with focus on support of MMAR/MMAU-PRO
where local audio files need to be downloaded, unzipped and loaded from LOCAL_DATA_DIR
when preprocessing. LOCAL_DATA_DIR needs to be set from environment (.env).
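
Example (illustrative paths): with LOCAL_DATA_DIR=/data and dataset_path
"BoJack/MMAR", a sample whose audio_path is "audio/xyz.wav" is read from
/data/mmar/audio/xyz.wav.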
"""

import logging
import os
from typing import Any, Dict, List

import soundfile as sf
from datasets import Dataset
from dotenv import load_dotenv
from scipy.signal import resample
from tqdm import tqdm

from preprocessors.base import Preprocessor


logger = logging.getLogger(__name__)

class MmarPreprocessor(Preprocessor):
"""Preprocessor for standard Audio benchmarks where output references are ALWAYS expected."""

def process(self, dataset: Dataset, task_config: Dict[str, Any],
run_config: Dict[str, Any]) -> List[Dict[str, Any]]:
"""Run pre-processing on standard/ general Audio datasets.

Args:
dataset: The task dataset to pre-process
task_config: Dictionary containing task configuration parameters
run_config: Dictionary containing run configuration parameters

Returns:
List of dictionaries where each dictionary represents a pre-processed sample
"""

        # Load LOCAL_DATA_DIR from the environment (populated from the predefined .env file)
load_dotenv()
local_data_dir = os.getenv("LOCAL_DATA_DIR")
dataset_name = task_config['dataset_path'].split('/')[-1].lower()

        # Extract column names and other common properties from the task config
category_name = task_config.get('category_name', 'speech')
audio_column_name = task_config.get('audio_column', None)
target_column_name = task_config.get('target_column', None)
choices_column_name = task_config.get('choices_column', None)
category_column_name = task_config.get('category_column', '')
sample_instruction_column_name = task_config.get('instruction_column', None)
user_query_column_name = task_config.get('textual_input_column', None)

# Obtain task-specific prompt (if provided)
user_prompt = task_config.get('user_prompt', '')

# Get dataset info
dataset_keys = list(dataset.features.keys())
dataset_size = len(dataset)
self.log_dataset_info(dataset_keys, dataset_size)

# Get dataset filters
length_filter, num_samples_filter = self.get_dataset_filters(run_config.get('filter', None), dataset_size)

processed_data = []
total_duration = 0
sample_count = 0

for i, row in enumerate(tqdm(dataset, desc="Processing samples")):
instruction = user_prompt
if (row[category_column_name] != category_name):
continue
# Create record by accessing each feature by index
record = {k: row[k] for k in dataset_keys}
audio_path = record[audio_column_name]
if (isinstance(audio_path, list)):
audio_path = audio_path[0]

            # Map the dataset audio path to a local path (e.g. $LOCAL_DATA_DIR/mmau-pro/data/xyz.wav)
local_audio_path = os.path.join(local_data_dir, dataset_name, audio_path)
audio_array, samplerate = sf.read(local_audio_path)

# Resample samples if not in 16kHz sampling rate
target_sr = 16000
if samplerate != target_sr:
num_samples = int(round(audio_array.shape[0] * target_sr / samplerate))
audio_array = resample(audio_array, num_samples)
samplerate = target_sr
record['array'] = audio_array
record['sampling_rate'] = samplerate

# Calculate audio duration in seconds
audio_duration = len(record["array"]) / record["sampling_rate"]
total_duration += audio_duration

# Apply dataset filtering
if (length_filter):
if not self.check_audio_length(record["array"], record["sampling_rate"], length_filter):
continue
if (num_samples_filter):
if sample_count >= num_samples_filter:
break

            # This preprocessor requires a reference target; implement a custom preprocessor otherwise.
            if target_column_name and target_column_name in record:
                record["model_target"] = record.get(target_column_name, None)
            else:
                raise ValueError("No valid target key found in record")

            # Add sample-specific instructions (e.g. the question) if they exist in the dataset
            if sample_instruction_column_name and sample_instruction_column_name in record:
                instruction += " " + record.get(sample_instruction_column_name, "")

            # Append the answer choices (if provided) and ask the model to pick one
            if choices_column_name and choices_column_name in record:
                choices = record.get(choices_column_name, [])
                instruction += " Select one option from the provided choices as the final answer:"
                if isinstance(choices, list):
                    choices_text = " ".join(choices)
                else:
                    choices_text = str(choices)
                instruction += "\n Choices: " + choices_text

            # Warn if no instruction is provided; an empty instruction can cause evaluated models to hallucinate.
            if not instruction:
                logger.warning("Instruction is empty for sample %d; add a user_prompt to the task config", i)
            record["instruction"] = instruction.strip()

            # Derive the judge type from the configured metric name
            # (e.g. "llm_judge_binary" -> "binary"); default to "detailed".
            metric_name = task_config.get('metrics')
            if 'judge' in metric_name:
                judge_type = metric_name.split('_')[-1]
                record['judge_type'] = judge_type
            else:
                record['judge_type'] = 'detailed'
processed_data.append(record)
sample_count += 1

self.log_dataset_info(dataset_keys, dataset_size, sample_count, total_duration)
return processed_data
1 change: 1 addition & 0 deletions requirements.txt
@@ -25,6 +25,7 @@ pydantic==2.10.5
tenacity==9.1.2
tqdm==4.67.1
setuptools==80.9.0
python-dotenv # Load environment variables from a .env file

pillow==11.1.0
logger==1.4
9 changes: 8 additions & 1 deletion tasks/spoken_language_reasoning/README.md
@@ -23,6 +23,11 @@ cd AU-Harness/
bash data/scripts/downnload_spider.sh
```

## MMAR / MMAU-PRO
As MMAR and MMAU-PRO require loading audio files from local paths, make sure you set `LOCAL_DATA_DIR=/path/to/data/storage/location` in your OS environment (e.g. via a `.env` file) before running evaluation.

The data preprocessing scripts download and unzip the audio archives from the corresponding HF datasets and store them under `LOCAL_DATA_DIR/${DATASET_NAME}/`, where `DATASET_NAME=[mmar|mmau-pro]`.
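
For example, a minimal `.env` (paths below are illustrative) and the resulting layout might look like this:

```bash
# .env in the directory you run the evaluation from
LOCAL_DATA_DIR=/data/au-harness   # where MMAR/MMAU-Pro audio archives are downloaded and unpacked
HF_TOKEN=...                      # optional; used for dataset loading when set

# After preprocessing, audio files live under:
#   /data/au-harness/mmar/...
#   /data/au-harness/mmau-pro/...
```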

## 📊 Supported Datasets for Spoken Language Reasoning

| Dataset Name | Task type | config | Description | License |
@@ -31,4 +36,6 @@ bash data/scripts/downnload_spider.sh
| **IFEVAL** | Speech Instruction Following | [spoken_language_reasoning/ifeval](./ifeval/base.yaml)| Speech-based complex instruction following dataset | Apache-2.0 |
| **BFCL** | Speech Function Calling | [spoken_language_reasoning/bfcl](./bfcl/base.yaml)| Speech-based complex function calling dataset with audio input | Apache-2.0 |
| **SPEECH_TO_SQL** | Speech-to-Coding | [spoken_language_reasoning/speech_to_sql](./speech_to_sql/base.yaml)| Speech-based dataset involving following instructions to produce executable code | Apache-2.0 |
| **GSM8k** | Grade School Math | [spoken_language_reasoning/gsm8k](./gsm8k/base.yaml)| Speech-based math dataset with grade school math word problems | MIT (text dataset) |
| **MMAR** | Audio Reasoning | [spoken_language_reasoning/mmar](./mmar/base.yaml)| Benchmark for evaluating deep reasoning capabilities of Audio-Language Models across multi-disciplinary tasks | CC-BY-NC-4.0 |
| **MMAU-PRO** | Audio Reasoning | [spoken_language_reasoning/mmau-pro](./mmau-pro/base.yaml)| Comprehensive benchmark for evaluating audio intelligence across perceptual and reasoning skills | CC-BY-NC-4.0 |
| **GSM8k** | Grade School Math | [spoken_language_reasoning/gsm8k](./gsm8k/base.yaml)| Speech-based math dataset with grade school math word problems | MIT (text dataset) |
22 changes: 22 additions & 0 deletions tasks/spoken_language_reasoning/mmar/base.yaml
@@ -0,0 +1,22 @@
# Base configuration for MMAR tasks
dataset_path: BoJack/MMAR
language: en
split: test
preprocessor: MmarPreprocessor
postprocessor: GeneralPostprocessor
audio_column: audio_path
instruction_column: question
target_column: answer
choices_column: choices
category_column: modality


user_prompt: Listen to the audio carefully and answer the presented question accordingly. Follow the options that are given in the question.
long_audio_processing_logic: truncate

generation_kwargs:
temperature: 0.2
max_completion_tokens: 1024

metrics:
- metric: llm_judge_binary
@@ -0,0 +1,3 @@
task_name: mmar_mix-music-speech
extends: ["./base.yaml#"]
category_name: mix-music-speech
@@ -0,0 +1,3 @@
task_name: mmar_mix-sound-music-speech
extends: ["./base.yaml#"]
category_name: mix-sound-music-speech
@@ -0,0 +1,3 @@
task_name: mmar_mix-sound-music
extends: ["./base.yaml#"]
category_name: mix-sound-music
@@ -0,0 +1,3 @@
task_name: mmar_mix-sound-speech
extends: ["./base.yaml#"]
category_name: mix-sound-speech
3 changes: 3 additions & 0 deletions tasks/spoken_language_reasoning/mmar/mmar_music.yaml
@@ -0,0 +1,3 @@
task_name: mmar_music
extends: ["./base.yaml#"]
category_name: music
3 changes: 3 additions & 0 deletions tasks/spoken_language_reasoning/mmar/mmar_sound.yaml
@@ -0,0 +1,3 @@
task_name: mmar_sound
extends: ["./base.yaml#"]
category_name: sound
3 changes: 3 additions & 0 deletions tasks/spoken_language_reasoning/mmar/mmar_speech.yaml
@@ -0,0 +1,3 @@
task_name: mmar_speech
extends: ["./base.yaml#"]
category_name: speech
21 changes: 21 additions & 0 deletions tasks/spoken_language_reasoning/mmau-pro/base.yaml
@@ -0,0 +1,21 @@
# Base configuration for MMAU-PRO tasks
dataset_path: gamma-lab-umd/MMAU-Pro
language: en
split: test
preprocessor: MmarPreprocessor
postprocessor: GeneralPostprocessor
audio_column: audio_path
instruction_column: question
target_column: answer
choices_column: choices
category_column: category

user_prompt: Listen to the audio carefully and answer the presented question accordingly. Follow the options that are given in the question.
long_audio_processing_logic: truncate

generation_kwargs:
temperature: 0.2
max_completion_tokens: 1024

metrics:
- metric: llm_judge_binary
@@ -0,0 +1,3 @@
task_name: mmau-pro_instruction_following
extends: ["./base.yaml#"]
category_name: instruction_following
3 changes: 3 additions & 0 deletions tasks/spoken_language_reasoning/mmau-pro/mmau-pro_multi.yaml
@@ -0,0 +1,3 @@
task_name: mmau-pro_multi
extends: ["./base.yaml#"]
category_name: multi
3 changes: 3 additions & 0 deletions tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music.yaml
@@ -0,0 +1,3 @@
task_name: mmau-pro_music
extends: ["./base.yaml#"]
category_name: music
@@ -0,0 +1,3 @@
task_name: mmau-pro_music_speech
extends: ["./base.yaml#"]
category_name: music_speech
3 changes: 3 additions & 0 deletions tasks/spoken_language_reasoning/mmau-pro/mmau-pro_open.yaml
@@ -0,0 +1,3 @@
task_name: mmau-pro_open
extends: ["./base.yaml#"]
category_name: open
3 changes: 3 additions & 0 deletions tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound.yaml
@@ -0,0 +1,3 @@
task_name: mmau-pro_sound
extends: ["./base.yaml#"]
category_name: sound
@@ -0,0 +1,3 @@
task_name: mmau-pro_sound_music
extends: ["./base.yaml#"]
category_name: sound_music
@@ -0,0 +1,3 @@
task_name: mmau-pro_sound_music_speech
extends: ["./base.yaml#"]
category_name: sound_music_speech
@@ -0,0 +1,3 @@
task_name: mmau-pro_sound_speech
extends: ["./base.yaml#"]
category_name: sound_speech
@@ -0,0 +1,3 @@
task_name: mmau-pro_spatial_audio
extends: ["./base.yaml#"]
category_name: spatial_audio
3 changes: 3 additions & 0 deletions tasks/spoken_language_reasoning/mmau-pro/mmau-pro_speech.yaml
@@ -0,0 +1,3 @@
task_name: mmau-pro_speech
extends: ["./base.yaml#"]
category_name: speech
@@ -0,0 +1,3 @@
task_name: mmau-pro_voice_chat
extends: ["./base.yaml#"]
category_name: voice_chat
36 changes: 36 additions & 0 deletions utils/data_utils.py
@@ -1,8 +1,12 @@
import os
from pathlib import Path
from datasets import load_dataset
from dotenv import load_dotenv
from utils.util import get_class_from_module
from huggingface_hub import hf_hub_download, HfApi
from . import util
import logging

logger = logging.getLogger(__name__)

@@ -36,8 +40,13 @@ def load_dataset_with_args(dataset_path: str, split: str, subset: str, task_name

if split is None:
raise ValueError(f'Dataset split is missing for task {task_name}')

# Load local environment file
load_dotenv()

    token = os.getenv("HF_TOKEN")
local_data_dir = os.getenv("LOCAL_DATA_DIR")
api = HfApi()

# Load dataset
try:
@@ -46,6 +55,33 @@ def load_dataset_with_args(dataset_path: str, split: str, subset: str, task_name
dataset_load_args["name"] = subset
if token:
dataset_load_args["token"] = token

        # Handle MMAR and MMAU-Pro separately: their audio archives must be downloaded
        # and extracted into LOCAL_DATA_DIR before the metadata is loaded.
        if 'MMAU-Pro' in dataset_path or 'MMAR' in dataset_path:
            if not local_data_dir:
                raise ValueError('LOCAL_DATA_DIR must be set (e.g. via .env) to store MMAR/MMAU-Pro audio data')
            data_name = dataset_path.split('/')[-1].lower()
            private_local_path = os.path.join(local_data_dir, data_name)
            os.makedirs(private_local_path, exist_ok=True)

            # Find all archive files in the dataset repository
            files_info = api.list_repo_files(repo_id=dataset_path, repo_type="dataset")
            archive_files = []
            for file_info in files_info:
                if file_info.endswith('.zip') or file_info.endswith('.tar.gz'):
                    archive_files.append(file_info)

            # Download and extract each archive into local_data_dir, skipping archives already present
            for archive_file in archive_files:
                desired_audio_storage_path = os.path.join(private_local_path, archive_file)
                if not os.path.exists(desired_audio_storage_path):
                    audio_data_dir = hf_hub_download(
                        repo_id=dataset_path,
                        filename=archive_file,
                        repo_type="dataset",
                        local_dir=private_local_path
                    )
                    util.extract_archive(audio_data_dir, private_local_path)
        dataset = load_dataset(**dataset_load_args)
except Exception as e:
raise ValueError(e)