From de7a943e2047be16acba3d7799825b1cb72892d6 Mon Sep 17 00:00:00 2001 From: hoang Date: Sun, 21 Sep 2025 16:52:21 +0000 Subject: [PATCH 1/7] Adding MMAR and MMAU-Pro preprocessing --- preprocessors/mmar_preprocessor.py | 140 ++++++++++++++++++ .../accent_recognition/base.yaml | 2 +- .../spoken_language_reasoning/mmar/base.yaml | 22 +++ .../mmar/mmar_mix-music-speech.yaml | 3 + .../mmar/mmar_mix-sound.yaml | 3 + .../mmar/mmar_music.yaml | 3 + .../mmar/mmar_sound.yaml | 3 + .../mmar/mmar_speech.yaml | 3 + .../mmau-pro/base.yaml | 21 +++ .../mmau-pro_instruction_following.yaml | 3 + .../mmau-pro/mmau-pro_multi.yaml | 3 + .../mmau-pro/mmau-pro_music.yaml | 3 + .../mmau-pro/mmau-pro_music_speech.yaml | 3 + .../mmau-pro/mmau-pro_open.yaml | 3 + .../mmau-pro/mmau-pro_sound.yaml | 3 + .../mmau-pro/mmau-pro_sound_music.yaml | 3 + .../mmau-pro/mmau-pro_sound_music_speech.yaml | 3 + .../mmau-pro/mmau-pro_sound_speech.yaml | 3 + .../mmau-pro/mmau-pro_spatial_audio.yaml | 3 + .../mmau-pro/mmau-pro_speech.yaml | 3 + .../mmau-pro/mmau-pro_voice_chat.yaml | 3 + utils/data_utils.py | 36 +++++ utils/util.py | 53 +++++++ 23 files changed, 324 insertions(+), 1 deletion(-) create mode 100644 preprocessors/mmar_preprocessor.py create mode 100644 tasks/spoken_language_reasoning/mmar/base.yaml create mode 100644 tasks/spoken_language_reasoning/mmar/mmar_mix-music-speech.yaml create mode 100644 tasks/spoken_language_reasoning/mmar/mmar_mix-sound.yaml create mode 100644 tasks/spoken_language_reasoning/mmar/mmar_music.yaml create mode 100644 tasks/spoken_language_reasoning/mmar/mmar_sound.yaml create mode 100644 tasks/spoken_language_reasoning/mmar/mmar_speech.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/base.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_instruction_following.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_multi.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music_speech.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_open.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_music.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_music_speech.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_speech.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_spatial_audio.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_speech.yaml create mode 100644 tasks/spoken_language_reasoning/mmau-pro/mmau-pro_voice_chat.yaml diff --git a/preprocessors/mmar_preprocessor.py b/preprocessors/mmar_preprocessor.py new file mode 100644 index 0000000..75a4509 --- /dev/null +++ b/preprocessors/mmar_preprocessor.py @@ -0,0 +1,140 @@ +"""General preprocessor module for AU-Harness framework. + +This module provides a general-purpose preprocessor for audio benchmarks +from AudioLLMs and other HuggingFace datasets, with support for various +modalities and filtering options. 
+""" + +import logging +from typing import Dict, List, Any + +import numpy as np +from tqdm import tqdm +from datasets import Dataset +from preprocessors.base import Preprocessor +from scipy.signal import resample +import soundfile as sf +from urllib.request import urlopen +import io +import os +from dotenv import load_dotenv +from pathlib import Path + + +logger = logging.getLogger(__name__) + +class MmarPreprocessor(Preprocessor): + """Preprocessor for standard Audio benchmarks where output references are ALWAYS expected.""" + + def process(self, dataset: Dataset, task_config: Dict[str, Any], + run_config: Dict[str, Any]) -> List[Dict[str, Any]]: + """Run pre-processing on standard/general Audio datasets. + + Args: + dataset: The task dataset to pre-process + task_config: Dictionary containing task configuration parameters + run_config: Dictionary containing run configuration parameters + + Returns: + List of dictionaries where each dictionary represents a pre-processed sample + """ + + # Load the local_data_dir saved in predefined .env file + load_dotenv() + local_data_dir = os.getenv("LOCAL_DATA_DIR") + dataset_name = task_config['dataset_path'].split('/')[-1].lower() + + # Extract common properties using base class method + category_name = task_config.get('category_name', 'speech') + audio_column_name = task_config.get('audio_column', None) + target_column_name = task_config.get('target_column', None) + choices_column_name = task_config.get('choice_column', None) + category_column_name = task_config.get('category_column', '') + sample_instruction_column_name = task_config.get('instruction_column', None) + user_query_column_name = task_config.get('textual_input_column', None) + + # Obtain task-specific prompt (if provided) + user_prompt = task_config.get('user_prompt', '') + + # Get dataset info + dataset_keys = list(dataset.features.keys()) + dataset_size = len(dataset) + self.log_dataset_info(dataset_keys, dataset_size) + + # Get dataset filters + length_filter, num_samples_filter = self.get_dataset_filters(run_config.get('filter', None), dataset_size) + + processed_data = [] + total_duration = 0 + sample_count = 0 + + for i, row in enumerate(tqdm(dataset, desc="Processing samples")): + instruction = user_prompt + if (row[category_column_name] != category_name): + continue + # Create record by accessing each feature by index + record = {k: row[k] for k in dataset_keys} + audio_path = record[audio_column_name] + if (isinstance(audio_path, list)): + audio_path = audio_path[0] + + # Map the audio path to a local audio path (sample: $HOME/mmau-pro/data/xyz.wav) + local_audio_path = os.path.join(local_data_dir, dataset_name, audio_path) + audio_array, samplerate = sf.read(local_audio_path) + + # Resample audio if it is not at the 16kHz sampling rate + target_sr = 16000 + if samplerate != target_sr: + num_samples = int(round(audio_array.shape[0] * target_sr / samplerate)) + audio_array = resample(audio_array, num_samples) + samplerate = target_sr + record['array'] = audio_array + record['sampling_rate'] = samplerate + + # Calculate audio duration in seconds + audio_duration = len(record["array"]) / record["sampling_rate"] + total_duration += audio_duration + + # Apply dataset filtering + if (length_filter): + if not self.check_audio_length(record["array"], record["sampling_rate"], length_filter): + continue + if (num_samples_filter): + if sample_count >= num_samples_filter: + break + + # This preprocessor requires a reference target. Otherwise, implement your own preprocessor.
+ if target_column_name and target_column_name in record: + record["model_target"] = record.get(target_column_name, None) + else: + raise ValueError("No valid target key found in record") + + # Add sample-specific instructions if they exist in the dataset + if sample_instruction_column_name and sample_instruction_column_name in record: + instruction += record.get(sample_instruction_column_name, "") + + # Append any user-specified prompt add-ons and choices + if choices_column_name and choices_column_name in record: + choices = record.get(choices_column_name, []) + if isinstance(choices, list): + choices_text = " ".join(choices) + else: + choices_text = str(choices) + instruction += "\n Choices: " + choices_text + + # Warning users if no instruction is provided. This can cause evaluated models to hallucinate. + if not instruction: + logger.warning("Instruction is empty for sample %d, add user_prompt for instruction insertion", i) + record["instruction"] = instruction.strip() + + metric_name = task_config.get('metrics') + if ('judge' in metric_name): + judge_type = metric_name.split('_')[-1] + record['judge_type'] = judge_type + else: + record['judge_type'] = 'detailed' + processed_data.append(record) + sample_count += 1 + + self.log_dataset_info(dataset_keys, dataset_size, sample_count, total_duration) + return processed_data diff --git a/tasks/paralinguistics/accent_recognition/base.yaml b/tasks/paralinguistics/accent_recognition/base.yaml index ab228e5..3ad7249 100644 --- a/tasks/paralinguistics/accent_recognition/base.yaml +++ b/tasks/paralinguistics/accent_recognition/base.yaml @@ -11,7 +11,7 @@ prompt: Please listen to the following audio clip and analyze the speaker's voic long_audio_processing_logic: truncate generation_kwargs: - temperature: 0.0001 + temperature: 0.5 max_completion_tokens: 64 metrics: diff --git a/tasks/spoken_language_reasoning/mmar/base.yaml b/tasks/spoken_language_reasoning/mmar/base.yaml new file mode 100644 index 0000000..7e429af --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/base.yaml @@ -0,0 +1,22 @@ +# Base configuration for VoiceBench IFEval tasks +dataset_path: BoJack/MMAR +language: en +split: test +preprocessor: MmarPreprocessor +postprocessor: GeneralPostprocessor +audio_column: audio_path +instruction_column: question +target_column: answer +choice_column: choices +category_column: modality + + +user_prompt: Listen to the audio carefully and answer the presented question accordingly. Follow the options that are given in the question. 
+long_audio_processing_logic: truncate + +generation_kwargs: + temperature: 0.2 + max_completion_tokens: 1024 + +metrics: + - metric: llm_judge_binary \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_mix-music-speech.yaml b/tasks/spoken_language_reasoning/mmar/mmar_mix-music-speech.yaml new file mode 100644 index 0000000..48c6dde --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_mix-music-speech.yaml @@ -0,0 +1,3 @@ +task_name: mmar_mix-music-speech +extends: ["./base.yaml#"] +category_name: mix-music-speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_mix-sound.yaml b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound.yaml new file mode 100644 index 0000000..c353622 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound.yaml @@ -0,0 +1,3 @@ +task_name: mmar_mix-sound +extends: ["./base.yaml#"] +category_name: mix-sound \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_music.yaml b/tasks/spoken_language_reasoning/mmar/mmar_music.yaml new file mode 100644 index 0000000..a348d95 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_music.yaml @@ -0,0 +1,3 @@ +task_name: mmar_music +extends: ["./base.yaml#"] +category_name: music \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_sound.yaml b/tasks/spoken_language_reasoning/mmar/mmar_sound.yaml new file mode 100644 index 0000000..e5fdbe4 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_sound.yaml @@ -0,0 +1,3 @@ +task_name: mmar_sound +extends: ["./base.yaml#"] +category_name: sound \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_speech.yaml b/tasks/spoken_language_reasoning/mmar/mmar_speech.yaml new file mode 100644 index 0000000..2bc7939 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_speech.yaml @@ -0,0 +1,3 @@ +task_name: mmar_speech +extends: ["./base.yaml#"] +category_name: speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/base.yaml b/tasks/spoken_language_reasoning/mmau-pro/base.yaml new file mode 100644 index 0000000..f884c1f --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/base.yaml @@ -0,0 +1,21 @@ +# Base configuration for VoiceBench IFEval tasks +dataset_path: gamma-lab-umd/MMAU-Pro +language: en +split: test +preprocessor: MmarPreprocessor +postprocessor: GeneralPostprocessor +audio_column: audio_path +instruction_column: question +target_column: answer +choice_column: choices +category_column: category + +user_prompt: Listen to the audio carefully and answer the presented question accordingly. Follow the options that are given in the question. 
+long_audio_processing_logic: truncate + +generation_kwargs: + temperature: 0.2 + max_completion_tokens: 1024 + +metrics: + - metric: llm_judge_binary \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_instruction_following.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_instruction_following.yaml new file mode 100644 index 0000000..e745120 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_instruction_following.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_sound_instruction_following +extends: ["./base.yaml#"] +category_name: instruction_following \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_multi.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_multi.yaml new file mode 100644 index 0000000..ebd55cb --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_multi.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_multi +extends: ["./base.yaml#"] +category_name: multi \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music.yaml new file mode 100644 index 0000000..8ec3ba1 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_music +extends: ["./base.yaml#"] +category_name: music \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music_speech.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music_speech.yaml new file mode 100644 index 0000000..09e3aab --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_music_speech.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_music_speech +extends: ["./base.yaml#"] +category_name: music_speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_open.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_open.yaml new file mode 100644 index 0000000..81408df --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_open.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_open +extends: ["./base.yaml#"] +category_name: open \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound.yaml new file mode 100644 index 0000000..7fe62f3 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_sound +extends: ["./base.yaml#"] +category_name: sound \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_music.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_music.yaml new file mode 100644 index 0000000..9fb71c4 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_music.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_sound_music +extends: ["./base.yaml#"] +category_name: sound_music \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_music_speech.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_music_speech.yaml new file mode 100644 index 0000000..ea25c24 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_music_speech.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_sound_music_speech +extends: ["./base.yaml#"] +category_name: sound_music_speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_speech.yaml
b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_speech.yaml new file mode 100644 index 0000000..a44a71e --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_sound_speech.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_sound_speech +extends: ["./base.yaml#"] +category_name: sound_speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_spatial_audio.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_spatial_audio.yaml new file mode 100644 index 0000000..2d6f79c --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_spatial_audio.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_spatial_audio +extends: ["./base.yaml#"] +category_name: spatial_audio \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_speech.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_speech.yaml new file mode 100644 index 0000000..416eead --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_speech.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_speech +extends: ["./base.yaml#"] +category_name: speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_voice_chat.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_voice_chat.yaml new file mode 100644 index 0000000..543a91c --- /dev/null +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_voice_chat.yaml @@ -0,0 +1,3 @@ +task_name: mmau-pro_voice_chat +extends: ["./base.yaml#"] +category_name: voice_chat \ No newline at end of file diff --git a/utils/data_utils.py b/utils/data_utils.py index 886a69a..0edc3e0 100644 --- a/utils/data_utils.py +++ b/utils/data_utils.py @@ -1,8 +1,12 @@ import os from pathlib import Path from datasets import load_dataset +from dotenv import load_dotenv from utils.util import get_class_from_module +from huggingface_hub import hf_hub_download, HfApi +from . 
import util import logging +import os logger = logging.getLogger(__name__) @@ -36,8 +40,13 @@ def load_dataset_with_args(dataset_path: str, split: str, subset: str, task_name if split is None: raise ValueError(f'Dataset split is missing for task {task_name}') + + # Load local environment file + load_dotenv() token=os.getenv("HF_TOKEN") + local_data_dir = os.getenv("LOCAL_DATA_DIR") + api = HfApi() # Load dataset try: @@ -46,6 +55,33 @@ def load_dataset_with_args(dataset_path: str, split: str, subset: str, task_name dataset_load_args["name"] = subset if token: dataset_load_args["token"] = token + + # Handle processing separately for MMAU-Pro and MMAR + if ('MMAU-Pro' in dataset_path or 'MMAR' in dataset_path): + data_name = dataset_path.split('/')[-1].lower() + private_local_path = os.path.join(local_data_dir, data_name) + if not os.path.exists(private_local_path): + os.makedirs(private_local_path, exist_ok=True) + + # Find all archive files + files_info = api.list_repo_files(repo_id=dataset_path, repo_type="dataset") + archive_files = [] + for file_info in files_info: + if (file_info.endswith('.zip') or file_info.endswith('.tar.gz')): + archive_files.append(file_info) + + # Download and extract all archive files into local_data_dir + for archive_file in archive_files: + archive_filename = archive_file.split('.')[0] # filename without extension + desired_audio_storage_path = os.path.join(private_local_path, archive_file) + if (not os.path.exists(desired_audio_storage_path)): + audio_data_dir = hf_hub_download( + repo_id=dataset_path, + filename=archive_file, + repo_type="dataset", + local_dir=private_local_path + ) + util.extract_archive(audio_data_dir, private_local_path) dataset = load_dataset(**dataset_load_args) except Exception as e: raise ValueError(e) diff --git a/utils/util.py b/utils/util.py index 4dbbe0d..4aa0d3e 100644 --- a/utils/util.py +++ b/utils/util.py @@ -7,6 +7,8 @@ import yaml from pathlib import Path from typing import Any, Dict +import tarfile +import zipfile from . import constants from utils.custom_logging import configure from utils.task_utils import _validate_task_metric_pairs, get_groups, get_tasks @@ -24,6 +26,57 @@ def get_class_from_module(module_prefix, module_name): logger.warning(f"Could not import {module_name} from {module_prefix}: {e}") return None +def extract_tar_gz(file_path, extract_path="."): + """ + Extracts a .tar.gz file to a specified path. + Args: + ---- + file_path: str: Path to the archive `.tar.gz` file. + extract_path: str: Directory to extract the contents to. + """ + try: + logger.info(f"Extracting tar.gz archive {file_path}") + with tarfile.open(file_path, "r:gz") as tar: + tar.extractall(path=extract_path) + logger.info(f"Successfully extracted {file_path} to {extract_path}") + except tarfile.ReadError as e: + logger.warning(f"Error reading tar.gz file: {e}") + except Exception as e: + logger.warning(f"An unexpected error occurred: {e}") + +def extract_zip(file_path, extract_path="."): + """ + Extracts a .zip file to a specified path. + Args: + ---- + file_path: str: Path to the archive `.zip` file. + extract_path: str: Directory to extract the contents to.
+ """ + try: + with zipfile.ZipFile(file_path, 'r') as zip_ref: + zip_ref.extractall(extract_path) + logger.info(f"Successfully extracted {file_path} to {extract_path}") + except zipfile.BadZipFile as e: + logger.warning(f"Error reading zip file: {e}") + except Exception as e: + logger.warning(f"An unexpected error occurred: {e}") + +def extract_archive(file_path, extract_path="."): + """ + Extracts either a .tar.gz or .zip file based on its extension. + + Args: + ---- + file_path: str: Path to the archive file. + extract_path: str: Directory to extract the contents to. + """ + if file_path.endswith(".tar.gz"): + extract_tar_gz(file_path, extract_path) + elif file_path.endswith(".zip"): + extract_zip(file_path, extract_path) + else: + logger.warning(f"Unsupported archive format for file: {file_path}") + def smart_round(val: float, precision: int = constants.ROUND_DIGITS) -> float: """Round off metrics to global precision value. From 97d4b1308be3a2d7024c4f1d45b9bf5de4cb8f9d Mon Sep 17 00:00:00 2001 From: hoang Date: Sun, 21 Sep 2025 16:54:55 +0000 Subject: [PATCH 2/7] Revert unrelated changes to MMAR --- tasks/paralinguistics/accent_recognition/base.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/paralinguistics/accent_recognition/base.yaml b/tasks/paralinguistics/accent_recognition/base.yaml index 3ad7249..ab228e5 100644 --- a/tasks/paralinguistics/accent_recognition/base.yaml +++ b/tasks/paralinguistics/accent_recognition/base.yaml @@ -11,7 +11,7 @@ prompt: Please listen to the following audio clip and analyze the speaker's voic long_audio_processing_logic: truncate generation_kwargs: - temperature: 0.5 + temperature: 0.0001 max_completion_tokens: 64 metrics: From b5d2579dc877d53a37d3feb18fa6f93a4105f612 Mon Sep 17 00:00:00 2001 From: hoang Date: Sun, 21 Sep 2025 17:07:40 +0000 Subject: [PATCH 3/7] Adding documentation for the added tasks --- tasks/spoken_language_reasoning/README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tasks/spoken_language_reasoning/README.md b/tasks/spoken_language_reasoning/README.md index e0b11cc..e8e8888 100644 --- a/tasks/spoken_language_reasoning/README.md +++ b/tasks/spoken_language_reasoning/README.md @@ -23,6 +23,11 @@ cd AU-Harness/ bash data/scripts/downnload_spider.sh ``` +## MMAR / MMAU-PRO +As MMAR and MMAU-PRO require loading audio files from local audio paths, make sure you set `LOCAL_DATA_DIR=/path/to/data/storage/location` in your OS environment (e.g. via a .env file) before running evaluation.
+ +Data preprocessing scripts will download the audio archives from the corresponding HF datasets and unzip them into `LOCAL_DATA_DIR/${DATASET_NAME}/`, where `DATASET_NAME=[mmar|mmau-pro]`. + ## 📊 Supported Datasets for Spoken Language Reasoning | Dataset Name | Task type | config | Description | License | @@ -30,4 +35,6 @@ | **MTBench** | Speech Instruction Following | [spoken_language_reasoning/mtbench](./mtbench/base.yaml)| Speech-based multi-turn complex instruction following dataset | Apache-2.0 | | **IFEVAL** | Speech Instruction Following | [spoken_language_reasoning/ifeval](./ifeval/base.yaml)| Speech-based complex instruction following dataset | Apache-2.0 | | **BFCL** | Speech Function Calling | [spoken_language_reasoning/bfcl](./bfcl/base.yaml)| Speech-based complex function calling dataset with audio input | Apache-2.0 | -| **SPEECH_TO_SQL** | Speech-to-Coding | [spoken_language_reasoning/speech_to_sql](./speech_to_sql/base.yaml)| Speech-based dataset involving following instructions to produce executable code | Apache-2.0 | \ No newline at end of file +| **SPEECH_TO_SQL** | Speech-to-Coding | [spoken_language_reasoning/speech_to_sql](./speech_to_sql/base.yaml)| Speech-based dataset involving following instructions to produce executable code | Apache-2.0 | +| **MMAR** | Audio Reasoning | [spoken_language_reasoning/mmar](./mmar/base.yaml)| Benchmark for evaluating deep reasoning capabilities of Audio-Language Models across multi-disciplinary tasks | CC-BY-NC-4.0 | +| **MMAU-PRO** | Audio Reasoning | [spoken_language_reasoning/mmau-pro](./mmau-pro/base.yaml)| Comprehensive benchmark for evaluating audio intelligence across perceptual and reasoning skills | CC-BY-NC-4.0 | \ No newline at end of file From 4e15e11da91327add1eeff3a37197ad4e2dddb07 Mon Sep 17 00:00:00 2001 From: hoang Date: Sun, 21 Sep 2025 17:31:07 +0000 Subject: [PATCH 4/7] Updating comment documentation --- tasks/spoken_language_reasoning/mmar/base.yaml | 2 +- tasks/spoken_language_reasoning/mmau-pro/base.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tasks/spoken_language_reasoning/mmar/base.yaml b/tasks/spoken_language_reasoning/mmar/base.yaml index 7e429af..f43855c 100644 --- a/tasks/spoken_language_reasoning/mmar/base.yaml +++ b/tasks/spoken_language_reasoning/mmar/base.yaml @@ -1,4 +1,4 @@ -# Base configuration for VoiceBench IFEval tasks +# Base configuration for MMAR tasks dataset_path: BoJack/MMAR language: en split: test diff --git a/tasks/spoken_language_reasoning/mmau-pro/base.yaml b/tasks/spoken_language_reasoning/mmau-pro/base.yaml index f884c1f..77e3a73 100644 --- a/tasks/spoken_language_reasoning/mmau-pro/base.yaml +++ b/tasks/spoken_language_reasoning/mmau-pro/base.yaml @@ -1,4 +1,4 @@ -# Base configuration for VoiceBench IFEval tasks +# Base configuration for MMAU-PRO tasks dataset_path: gamma-lab-umd/MMAU-Pro language: en split: test From 25f33ba0db05454aaa9376f4756ee2a8275173e3 Mon Sep 17 00:00:00 2001 From: hoang Date: Sun, 21 Sep 2025 17:36:47 +0000 Subject: [PATCH 5/7] Updating documentation clarification --- preprocessors/mmar_preprocessor.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/preprocessors/mmar_preprocessor.py b/preprocessors/mmar_preprocessor.py index 75a4509..cc2eee7 100644 --- a/preprocessors/mmar_preprocessor.py +++ b/preprocessors/mmar_preprocessor.py @@ -1,8 +1,9 @@ -"""General preprocessor module for AU-Harness framework.
+"""Reasoning-based preprocessor module for AU-Harness framework. -This module provides a general-purpose preprocessor for audio benchmarks -from AudioLLMs and other HuggingFace datasets, with support for various -modalities and filtering options. +This module provides a preprocessor for audio benchmarks +from AudioLLMs and other HuggingFace datasets, with a focus on MMAR/MMAU-PRO, +where audio files must be downloaded, unzipped and loaded from LOCAL_DATA_DIR +during preprocessing. LOCAL_DATA_DIR must be set in the environment (e.g. via a .env file). """ import logging From bd537dee85f089e07a038a455314d6e2a5a8279e Mon Sep 17 00:00:00 2001 From: hoang Date: Mon, 29 Sep 2025 17:51:11 +0000 Subject: [PATCH 6/7] Rectify the sub-task names and add dependency requirement package for dataset loading. --- requirements.txt | 1 + .../mmau-pro/mmau-pro_instruction_following.yaml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ba34d86..d2cd2a7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,6 +25,7 @@ pydantic==2.10.5 tenacity==9.1.2 tqdm==4.67.1 setuptools==80.9.0 +python-dotenv # Loading information from .env pillow==11.1.0 logger==1.4 diff --git a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_instruction_following.yaml b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_instruction_following.yaml index e745120..dac46ce 100644 --- a/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_instruction_following.yaml +++ b/tasks/spoken_language_reasoning/mmau-pro/mmau-pro_instruction_following.yaml @@ -1,3 +1,3 @@ -task_name: mmau-pro_sound_instruction_following +task_name: mmau-pro_instruction_following extends: ["./base.yaml#"] category_name: instruction_following \ No newline at end of file From a9c06acaee03c1b238ecd8c05b67baa8c0184118 Mon Sep 17 00:00:00 2001 From: hoang Date: Wed, 24 Dec 2025 08:50:28 +0000 Subject: [PATCH 7/7] Adding updated support for MMAR sub-modalities --- preprocessors/mmar_preprocessor.py | 1 + .../mmar/mmar_mix-sound-music-speech.yaml | 3 +++ tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music.yaml | 3 +++ .../spoken_language_reasoning/mmar/mmar_mix-sound-speech.yaml | 3 +++ tasks/spoken_language_reasoning/mmar/mmar_mix-sound.yaml | 3 --- 5 files changed, 10 insertions(+), 3 deletions(-) create mode 100644 tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music-speech.yaml create mode 100644 tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music.yaml create mode 100644 tasks/spoken_language_reasoning/mmar/mmar_mix-sound-speech.yaml delete mode 100644 tasks/spoken_language_reasoning/mmar/mmar_mix-sound.yaml diff --git a/preprocessors/mmar_preprocessor.py b/preprocessors/mmar_preprocessor.py index cc2eee7..637a0bf 100644 --- a/preprocessors/mmar_preprocessor.py +++ b/preprocessors/mmar_preprocessor.py @@ -117,6 +117,7 @@ def process(self, dataset: Dataset, task_config: Dict[str, Any], # Append any user-specified prompt add-ons and choices if choices_column_name and choices_column_name in record: choices = record.get(choices_column_name, []) + instruction += "\nSelect one option from the provided choices as the final answer:" if isinstance(choices, list): choices_text = " ".join(choices) else: diff --git a/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music-speech.yaml b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music-speech.yaml new file mode 100644 index 0000000..047ef7b --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music-speech.yaml @@ -0,0 +1,3 @@
+task_name: mmar_mix-sound-music-speech +extends: ["./base.yaml#"] +category_name: mix-sound-music-speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music.yaml b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music.yaml new file mode 100644 index 0000000..beea177 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-music.yaml @@ -0,0 +1,3 @@ +task_name: mmar_mix-sound-music +extends: ["./base.yaml#"] +category_name: mix-sound-music \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-speech.yaml b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-speech.yaml new file mode 100644 index 0000000..ab9fc42 --- /dev/null +++ b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound-speech.yaml @@ -0,0 +1,3 @@ +task_name: mmar_mix-sound-speech +extends: ["./base.yaml#"] +category_name: mix-sound-speech \ No newline at end of file diff --git a/tasks/spoken_language_reasoning/mmar/mmar_mix-sound.yaml b/tasks/spoken_language_reasoning/mmar/mmar_mix-sound.yaml deleted file mode 100644 index c353622..0000000 --- a/tasks/spoken_language_reasoning/mmar/mmar_mix-sound.yaml +++ /dev/null @@ -1,3 +0,0 @@ -task_name: mmar_mix-sound -extends: ["./base.yaml#"] -category_name: mix-sound \ No newline at end of file
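For context, the sketch below is a minimal, standalone illustration of the data-staging flow these patches add in utils/data_utils.py and utils/util.py: list the archive files of the MMAR/MMAU-Pro HF datasets, download them, and extract them under `LOCAL_DATA_DIR/<dataset>/` so the preprocessor can resolve each sample's relative `audio_path`. It is not part of the patch series; the repo IDs, `LOCAL_DATA_DIR`/`HF_TOKEN` variables, and directory layout come from the diffs above, while the example paths and values are illustrative assumptions only.

```python
# Illustrative sketch (not part of the patch series): mirrors the staging flow
# added in utils/data_utils.py. Assumes a .env file at the repo root containing
# LOCAL_DATA_DIR=/data/au-harness (example path) and, optionally, HF_TOKEN.
import os
import tarfile
import zipfile

from dotenv import load_dotenv
from huggingface_hub import HfApi, hf_hub_download

load_dotenv()
local_data_dir = os.environ["LOCAL_DATA_DIR"]

repo_id = "BoJack/MMAR"  # or "gamma-lab-umd/MMAU-Pro"
dataset_dir = os.path.join(local_data_dir, repo_id.split("/")[-1].lower())
os.makedirs(dataset_dir, exist_ok=True)

api = HfApi()
for filename in api.list_repo_files(repo_id=repo_id, repo_type="dataset"):
    # Only the audio archives are staged locally; metadata stays on the Hub.
    if not (filename.endswith(".zip") or filename.endswith(".tar.gz")):
        continue
    archive_target = os.path.join(dataset_dir, filename)
    if os.path.exists(archive_target):
        continue  # already downloaded and extracted on a previous run
    archive_path = hf_hub_download(repo_id=repo_id, filename=filename,
                                   repo_type="dataset", local_dir=dataset_dir)
    if filename.endswith(".zip"):
        with zipfile.ZipFile(archive_path) as zf:
            zf.extractall(dataset_dir)
    else:
        with tarfile.open(archive_path, "r:gz") as tf:
            tf.extractall(dataset_dir)

# MmarPreprocessor then joins LOCAL_DATA_DIR/<dataset>/ with each sample's
# relative audio_path and resamples the audio to 16 kHz before evaluation.
```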