Skip to content

Commit 8221f87

Browse files
Author: Benjamin Feuer (committed)
feature selection patching
1 parent 66f7ac4 commit 8221f87

File tree

6 files changed

+123
-47
lines changed

6 files changed

+123
-47
lines changed

examples/tabular/openml_cc18/run_openml_cc18_baselines_tabular.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -402,6 +402,8 @@ def evaluate_baselines_on_task(task, split_idx, args):
402402

403403
# Add feature selection parameter
404404
cmd.extend(["--feature_selection_threshold", str(args.feature_selection_threshold)])
405+
if hasattr(args, "feature_selection_method") and args.feature_selection_method:
406+
cmd.extend(["--feature_selection_method", str(args.feature_selection_method)])
405407

406408
# Add few-shot parameters if specified
407409
if args.num_few_shot_examples:

examples/tabular/openml_cc18/run_openml_cc18_fft.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -436,6 +436,10 @@ def evaluate_model(task, split_idx, model_dir, args):
436436
str(args.feature_selection_threshold),
437437
]
438438

439+
# Forward feature selection method if provided
440+
if hasattr(args, "feature_selection_method") and args.feature_selection_method:
441+
cmd.extend(["--feature_selection_method", str(args.feature_selection_method)])
442+
439443
# Propagate optional test size limit to evaluation
440444
if getattr(args, "max_test_samples", None) is not None:
441445
cmd.extend(["--max_test_samples", str(args.max_test_samples)])

marvis/models/marvis_tsne.py

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1101,14 +1101,26 @@ def fit(
11011101
from marvis.utils import apply_feature_reduction
11021102

11031103
# Create a mock dataset dict for feature reduction
1104-
mock_dataset = {"name": "training_data"}
1104+
# Provide attribute_names so downstream selection doesn't fail
1105+
try:
1106+
if hasattr(X_train, "columns"):
1107+
attr_names = list(X_train.columns)
1108+
else:
1109+
attr_names = [f"feature_{i}" for i in range(X_train_array.shape[1])]
1110+
except Exception:
1111+
attr_names = [f"feature_{i}" for i in range(X_train_array.shape[1])]
1112+
1113+
mock_dataset = {"name": "training_data", "attribute_names": attr_names}
11051114
mock_args = type(
11061115
"Args",
11071116
(),
11081117
{
11091118
"feature_selection_threshold": getattr(
11101119
self, "feature_selection_threshold", 500
11111120
),
1121+
"feature_selection_method": getattr(
1122+
self, "feature_selection_method", "pca_variance"
1123+
),
11121124
"seed": self.seed,
11131125
},
11141126
)()

marvis/utils/evaluation_args.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -677,6 +677,13 @@ def add_llm_baseline_args(parser: argparse.ArgumentParser):
677677
default=500,
678678
help="Apply feature selection if dataset has more than this many features",
679679
)
680+
parser.add_argument(
681+
"--feature_selection_method",
682+
type=str,
683+
choices=["pca_variance", "mutual_info", "f_score", "token_budget"],
684+
default="token_budget",
685+
help="Feature selection method: traditional top-K (pca_variance, mutual_info, f_score) or token_budget heuristic",
686+
)
680687
parser.add_argument(
681688
"--vlm_model_id",
682689
type=str,

marvis/utils/llm_evaluation_utils.py

Lines changed: 61 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -316,6 +316,7 @@ def apply_feature_reduction(
316316
"""
317317
original_num_features = X_train.shape[1]
318318
feature_threshold = getattr(args, "feature_selection_threshold", 20)
319+
method = getattr(args, "feature_selection_method", "pca_variance")
319320

320321
if original_num_features <= feature_threshold:
321322
return X_train, X_test, dataset, None
@@ -325,51 +326,69 @@ def apply_feature_reduction(
325326
)
326327

327328
try:
328-
# Use a simple tokenizer for estimation
329-
from transformers import AutoTokenizer
330-
331-
from .feature_selection_utils import (
332-
create_reduced_dataset,
333-
select_features_for_token_limit,
334-
test_feature_selection,
335-
)
336-
337-
try:
338-
tokenizer_temp = AutoTokenizer.from_pretrained(
339-
"microsoft/phi-3-mini-128k-instruct"
329+
if method in {"pca_variance", "mutual_info", "f_score"}:
330+
# Traditional top-K selection
331+
import numpy as np
332+
from sklearn.preprocessing import StandardScaler
333+
from sklearn.decomposition import PCA
334+
from sklearn.feature_selection import (
335+
f_classif,
336+
f_regression,
337+
mutual_info_classif,
338+
mutual_info_regression,
340339
)
341-
except (OSError, ValueError, RuntimeError):
342-
tokenizer_temp = AutoTokenizer.from_pretrained("gpt2")
343-
344-
# Test different token limits
345-
test_results = test_feature_selection(
346-
X_train,
347-
y_train,
348-
dataset["attribute_names"],
349-
tokenizer_temp,
350-
getattr(args, "num_few_shot_examples", 16),
351-
categorical_indicator=dataset.get("categorical_indicator", None),
352-
)
353340

354-
# Log test results
355-
for result in test_results:
356-
logger.info(
357-
f"Token limit {result['token_limit']}: "
358-
f"{result['num_features_selected']} features selected, "
359-
f"~{result['estimated_tokens']} tokens ({result['utilization']:.1%} utilization)"
360-
)
341+
logger.info(f"Using traditional feature selection method: {method}")
342+
X_num = X_train.values if hasattr(X_train, "values") else X_train
343+
X_num = X_num.astype(float)
344+
k = min(feature_threshold, X_num.shape[1])
345+
y_arr = np.array(y_train)
346+
347+
if method == "pca_variance":
348+
scaler = StandardScaler()
349+
X_scaled = scaler.fit_transform(X_num)
350+
pca = PCA()
351+
pca.fit(X_scaled)
352+
n_pcs = min(5, X_num.shape[1])
353+
importance = np.zeros(X_num.shape[1])
354+
for pc_idx in range(n_pcs):
355+
weight = pca.explained_variance_ratio_[pc_idx]
356+
importance += np.abs(pca.components_[pc_idx]) * weight
357+
scores = importance
358+
elif method == "mutual_info":
359+
if np.issubdtype(y_arr.dtype, np.integer) and len(np.unique(y_arr)) > 1:
360+
scores = mutual_info_classif(X_num, y_arr, random_state=42)
361+
else:
362+
scores = mutual_info_regression(X_num, y_arr, random_state=42)
363+
else: # f_score
364+
if np.issubdtype(y_arr.dtype, np.integer) and len(np.unique(y_arr)) > 1:
365+
scores = f_classif(X_num, y_arr)[0]
366+
else:
367+
scores = f_regression(X_num, y_arr)
361368

362-
# Select features for our target token limit
363-
selected_indices, estimated_tokens = select_features_for_token_limit(
364-
X_train,
365-
y_train,
366-
dataset["attribute_names"],
367-
tokenizer_temp,
368-
num_few_shot_examples=getattr(args, "num_few_shot_examples", 16),
369-
max_tokens=getattr(args, "max_context_length", 8192),
370-
categorical_indicator=dataset.get("categorical_indicator", None),
371-
prioritize_semantic=True,
372-
)
369+
order = np.argsort(scores)
370+
selected_indices = list(order[-k:][::-1])
371+
estimated_tokens = 0
372+
else:
373+
# Token-budget heuristic (prior behavior)
374+
from transformers import AutoTokenizer
375+
try:
376+
tokenizer_temp = AutoTokenizer.from_pretrained(
377+
"microsoft/phi-3-mini-128k-instruct"
378+
)
379+
except (OSError, ValueError, RuntimeError):
380+
tokenizer_temp = AutoTokenizer.from_pretrained("gpt2")
381+
382+
selected_indices, estimated_tokens = select_features_for_token_limit(
383+
X_train,
384+
y_train,
385+
dataset["attribute_names"],
386+
tokenizer_temp,
387+
num_few_shot_examples=getattr(args, "num_few_shot_examples", 16),
388+
max_tokens=getattr(args, "max_context_length", 8192),
389+
categorical_indicator=dataset.get("categorical_indicator", None),
390+
prioritize_semantic=True,
391+
)
373392

374393
# Debug information
375394
logger.info(

scripts/analysis/parse_openml_cc18_results.py

Lines changed: 36 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -278,11 +278,19 @@ def normalize_model_name(model_name: str) -> str:
278278

279279
# Map variations to standard names
280280
name_mapping = {
281+
# MARVIS/legacy CLAM aliases
281282
'marvis-t-sne-tabular': 'marvis_tsne',
282283
'marvis_t_sne_tabular': 'marvis_tsne',
283284
'marvis-tsne': 'marvis_tsne',
284285
'marvis-t-sne': 'marvis_tsne',
285286
'marvis_tsne': 'marvis_tsne',
287+
# Legacy CLAM labels encountered in older archives
288+
'clam-t-sne-tabular': 'marvis_tsne',
289+
'clam_t_sne_tabular': 'marvis_tsne',
290+
'clam-tsne': 'marvis_tsne',
291+
'clam-t-sne': 'marvis_tsne',
292+
'clam_tsne': 'marvis_tsne',
293+
'clam': 'marvis_tsne',
286294
'jolt': 'jolt',
287295
'tabllm': 'tabllm',
288296
'tabula-8b': 'tabula_8b',
@@ -350,9 +358,28 @@ def create_unique_model_identifier(model_name: str, archive_source: str, model_u
350358
# Fall back to archive-based suffix
351359
return f'tabllm_{archive_lower.replace("_", "").replace("-", "")}'
352360

353-
# Map marvis_tsne to MARVIS for display
361+
# Disambiguate MARVIS backbones by archive name (e.g., gpt4o, 3b, 32b, qwen)
354362
if normalized_name == 'marvis_tsne':
355-
return 'MARVIS'
363+
arch = (archive_source or '').lower()
364+
# Common identifiers by priority
365+
if ('gpt4o' in arch) or ('openai' in arch) or ('gpt' in arch):
366+
return 'MARVIS_gpt4o'
367+
if '32b' in arch:
368+
return 'MARVIS_32b'
369+
if '3b' in arch:
370+
return 'MARVIS_3b'
371+
if 'qwen' in arch:
372+
return 'MARVIS_qwen'
373+
if 'llama' in arch:
374+
return 'MARVIS_llama'
375+
if 'mistral' in arch:
376+
return 'MARVIS_mistral'
377+
if 'gemma' in arch:
378+
return 'MARVIS_gemma'
379+
# Fallback: derive a compact suffix from archive name
380+
suffix = arch.replace('results', '').replace('marvis', '')
381+
suffix = suffix.replace('-', '').replace('_', '').strip()
382+
return f"MARVIS_{suffix}" if suffix else 'MARVIS'
356383

357384
return normalized_name
358385

@@ -1120,7 +1147,12 @@ def main():
11201147

11211148
# Find tar archives - exclude regression archives
11221149
tar_files = []
1123-
regression_archives = {'jolt_reg.tar', 'clam-reg.tar', 'tabular_baselines_reg.tar'}
1150+
regression_archives = {
1151+
'jolt_reg.tar',
1152+
'clam-reg.tar', # legacy naming
1153+
'marvis-reg.tar', 'marvis_reg.tar', # MARVIS regression naming
1154+
'tabular_baselines_reg.tar'
1155+
}
11241156

11251157
for file_name in os.listdir(results_dir):
11261158
if file_name.endswith('.tar'):
@@ -1195,4 +1227,4 @@ def main():
11951227

11961228

11971229
if __name__ == "__main__":
1198-
main()
1230+
main()

0 commit comments

Comments (0)