Skip to content

Commit 8221f87

Browse files
Author: Benjamin Feuer (committed)
feature selection patching
1 parent 66f7ac4 commit 8221f87

File tree

6 files changed

+123
-47
lines changed

6 files changed

+123
-47
lines changed

examples/tabular/openml_cc18/run_openml_cc18_baselines_tabular.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -402,6 +402,8 @@ def evaluate_baselines_on_task(task, split_idx, args):
402402

403403
# Add feature selection parameter
404404
cmd.extend(["--feature_selection_threshold", str(args.feature_selection_threshold)])
405+
if hasattr(args, "feature_selection_method") and args.feature_selection_method:
406+
cmd.extend(["--feature_selection_method", str(args.feature_selection_method)])
405407

406408
# Add few-shot parameters if specified
407409
if args.num_few_shot_examples:

examples/tabular/openml_cc18/run_openml_cc18_fft.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -436,6 +436,10 @@ def evaluate_model(task, split_idx, model_dir, args):
436436
str(args.feature_selection_threshold),
437437
]
438438

439+
# Forward feature selection method if provided
440+
if hasattr(args, "feature_selection_method") and args.feature_selection_method:
441+
cmd.extend(["--feature_selection_method", str(args.feature_selection_method)])
442+
439443
# Propagate optional test size limit to evaluation
440444
if getattr(args, "max_test_samples", None) is not None:
441445
cmd.extend(["--max_test_samples", str(args.max_test_samples)])

marvis/models/marvis_tsne.py

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1101,14 +1101,26 @@ def fit(
11011101
from marvis.utils import apply_feature_reduction
11021102

11031103
# Create a mock dataset dict for feature reduction
1104-
mock_dataset = {"name": "training_data"}
1104+
# Provide attribute_names so downstream selection doesn't fail
1105+
try:
1106+
if hasattr(X_train, "columns"):
1107+
attr_names = list(X_train.columns)
1108+
else:
1109+
attr_names = [f"feature_{i}" for i in range(X_train_array.shape[1])]
1110+
except Exception:
1111+
attr_names = [f"feature_{i}" for i in range(X_train_array.shape[1])]
1112+
1113+
mock_dataset = {"name": "training_data", "attribute_names": attr_names}
11051114
mock_args = type(
11061115
"Args",
11071116
(),
11081117
{
11091118
"feature_selection_threshold": getattr(
11101119
self, "feature_selection_threshold", 500
11111120
),
1121+
"feature_selection_method": getattr(
1122+
self, "feature_selection_method", "pca_variance"
1123+
),
11121124
"seed": self.seed,
11131125
},
11141126
)()

marvis/utils/evaluation_args.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -677,6 +677,13 @@ def add_llm_baseline_args(parser: argparse.ArgumentParser):
677677
default=500,
678678
help="Apply feature selection if dataset has more than this many features",
679679
)
680+
parser.add_argument(
681+
"--feature_selection_method",
682+
type=str,
683+
choices=["pca_variance", "mutual_info", "f_score", "token_budget"],
684+
default="token_budget",
685+
help="Feature selection method: traditional top-K (pca_variance, mutual_info, f_score) or token_budget heuristic",
686+
)
680687
parser.add_argument(
681688
"--vlm_model_id",
682689
type=str,

marvis/utils/llm_evaluation_utils.py

Lines changed: 61 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -316,6 +316,7 @@ def apply_feature_reduction(
316316
"""
317317
original_num_features = X_train.shape[1]
318318
feature_threshold = getattr(args, "feature_selection_threshold", 20)
319+
method = getattr(args, "feature_selection_method", "pca_variance")
319320

320321
if original_num_features <= feature_threshold:
321322
return X_train, X_test, dataset, None
@@ -325,51 +326,69 @@ def apply_feature_reduction(
325326
)
326327

327328
try:
328-
# Use a simple tokenizer for estimation
329-
from transformers import AutoTokenizer
330-
331-
from .feature_selection_utils import (
332-
create_reduced_dataset,
333-
select_features_for_token_limit,
334-
test_feature_selection,
335-
)
336-
337-
try:
338-
tokenizer_temp = AutoTokenizer.from_pretrained(
339-
"microsoft/phi-3-mini-128k-instruct"
329+
if method in {"pca_variance", "mutual_info", "f_score"}:
330+
# Traditional top-K selection
331+
import numpy as np
332+
from sklearn.preprocessing import StandardScaler
333+
from sklearn.decomposition import PCA
334+
from sklearn.feature_selection import (
335+
f_classif,
336+
f_regression,
337+
mutual_info_classif,
338+
mutual_info_regression,
340339
)
341-
except (OSError, ValueError, RuntimeError):
342-
tokenizer_temp = AutoTokenizer.from_pretrained("gpt2")
343-
344-
# Test different token limits
345-
test_results = test_feature_selection(
346-
X_train,
347-
y_train,
348-
dataset["attribute_names"],
349-
tokenizer_temp,
350-
getattr(args, "num_few_shot_examples", 16),
351-
categorical_indicator=dataset.get("categorical_indicator", None),
352-
)
353340

354-
# Log test results
355-
for result in test_results:
356-
logger.info(
357-
f"Token limit {result['token_limit']}: "
358-
f"{result['num_features_selected']} features selected, "
359-
f"~{result['estimated_tokens']} tokens ({result['utilization']:.1%} utilization)"
360-
)
341+
logger.info(f"Using traditional feature selection method: {method}")
342+
X_num = X_train.values if hasattr(X_train, "values") else X_train
343+
X_num = X_num.astype(float)
344+
k = min(feature_threshold, X_num.shape[1])
345+
y_arr = np.array(y_train)
346+
347+
if method == "pca_variance":
348+
scaler = StandardScaler()
349+
X_scaled = scaler.fit_transform(X_num)
350+
pca = PCA()
351+
pca.fit(X_scaled)
352+
n_pcs = min(5, X_num.shape[1])
353+
importance = np.zeros(X_num.shape[1])
354+
for pc_idx in range(n_pcs):
355+
weight = pca.explained_variance_ratio_[pc_idx]
356+
importance += np.abs(pca.components_[pc_idx]) * weight
357+
scores = importance
358+
elif method == "mutual_info":
359+
if np.issubdtype(y_arr.dtype, np.integer) and len(np.unique(y_arr)) > 1:
360+
scores = mutual_info_classif(X_num, y_arr, random_state=42)
361+
else:
362+
scores = mutual_info_regression(X_num, y_arr, random_state=42)
363+
else: # f_score
364+
if np.issubdtype(y_arr.dtype, np.integer) and len(np.unique(y_arr)) > 1:
365+
scores = f_classif(X_num, y_arr)[0]
366+
else:
367+
scores = f_regression(X_num, y_arr)
361368

362-
# Select features for our target token limit
363-
selected_indices, estimated_tokens = select_features_for_token_limit(
364-
X_train,
365-
y_train,
366-
dataset["attribute_names"],
367-
tokenizer_temp,
368-
num_few_shot_examples=getattr(args, "num_few_shot_examples", 16),
369-
max_tokens=getattr(args, "max_context_length", 8192),
370-
categorical_indicator=dataset.get("categorical_indicator", None),
371-
prioritize_semantic=True,
372-
)
369+
order = np.argsort(scores)
370+
selected_indices = list(order[-k:][::-1])
371+
estimated_tokens = 0
372+
else:
373+
# Token-budget heuristic (prior behavior)
374+
from transformers import AutoTokenizer
375+
try:
376+
tokenizer_temp = AutoTokenizer.from_pretrained(
377+
"microsoft/phi-3-mini-128k-instruct"
378+
)
379+
except (OSError, ValueError, RuntimeError):
380+
tokenizer_temp = AutoTokenizer.from_pretrained("gpt2")
381+
382+
selected_indices, estimated_tokens = select_features_for_token_limit(
383+
X_train,
384+
y_train,
385+
dataset["attribute_names"],
386+
tokenizer_temp,
387+
num_few_shot_examples=getattr(args, "num_few_shot_examples", 16),
388+
max_tokens=getattr(args, "max_context_length", 8192),
389+
categorical_indicator=dataset.get("categorical_indicator", None),
390+
prioritize_semantic=True,
391+
)
373392

374393
# Debug information
375394
logger.info(

scripts/analysis/parse_openml_cc18_results.py

Lines changed: 36 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -278,11 +278,19 @@ def normalize_model_name(model_name: str) -> str:
278278

279279
# Map variations to standard names
280280
name_mapping = {
281+
# MARVIS/legacy CLAM aliases
281282
'marvis-t-sne-tabular': 'marvis_tsne',
282283
'marvis_t_sne_tabular': 'marvis_tsne',
283284
'marvis-tsne': 'marvis_tsne',
284285
'marvis-t-sne': 'marvis_tsne',
285286
'marvis_tsne': 'marvis_tsne',
287+
# Legacy CLAM labels encountered in older archives
288+
'clam-t-sne-tabular': 'marvis_tsne',
289+
'clam_t_sne_tabular': 'marvis_tsne',
290+
'clam-tsne': 'marvis_tsne',
291+
'clam-t-sne': 'marvis_tsne',
292+
'clam_tsne': 'marvis_tsne',
293+
'clam': 'marvis_tsne',
286294
'jolt': 'jolt',
287295
'tabllm': 'tabllm',
288296
'tabula-8b': 'tabula_8b',
@@ -350,9 +358,28 @@ def create_unique_model_identifier(model_name: str, archive_source: str, model_u
350358
# Fall back to archive-based suffix
351359
return f'tabllm_{archive_lower.replace("_", "").replace("-", "")}'
352360

353-
# Map marvis_tsne to MARVIS for display
361+
# Disambiguate MARVIS backbones by archive name (e.g., gpt4o, 3b, 32b, qwen)
354362
if normalized_name == 'marvis_tsne':
355-
return 'MARVIS'
363+
arch = (archive_source or '').lower()
364+
# Common identifiers by priority
365+
if ('gpt4o' in arch) or ('openai' in arch) or ('gpt' in arch):
366+
return 'MARVIS_gpt4o'
367+
if '32b' in arch:
368+
return 'MARVIS_32b'
369+
if '3b' in arch:
370+
return 'MARVIS_3b'
371+
if 'qwen' in arch:
372+
return 'MARVIS_qwen'
373+
if 'llama' in arch:
374+
return 'MARVIS_llama'
375+
if 'mistral' in arch:
376+
return 'MARVIS_mistral'
377+
if 'gemma' in arch:
378+
return 'MARVIS_gemma'
379+
# Fallback: derive a compact suffix from archive name
380+
suffix = arch.replace('results', '').replace('marvis', '')
381+
suffix = suffix.replace('-', '').replace('_', '').strip()
382+
return f"MARVIS_{suffix}" if suffix else 'MARVIS'
356383

357384
return normalized_name
358385

@@ -1120,7 +1147,12 @@ def main():
11201147

11211148
# Find tar archives - exclude regression archives
11221149
tar_files = []
1123-
regression_archives = {'jolt_reg.tar', 'clam-reg.tar', 'tabular_baselines_reg.tar'}
1150+
regression_archives = {
1151+
'jolt_reg.tar',
1152+
'clam-reg.tar', # legacy naming
1153+
'marvis-reg.tar', 'marvis_reg.tar', # MARVIS regression naming
1154+
'tabular_baselines_reg.tar'
1155+
}
11241156

11251157
for file_name in os.listdir(results_dir):
11261158
if file_name.endswith('.tar'):
@@ -1195,4 +1227,4 @@ def main():
11951227

11961228

11971229
if __name__ == "__main__":
1198-
main()
1230+
main()

0 commit comments

Comments (0)