-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_loader.py
More file actions
80 lines (60 loc) · 2.17 KB
/
data_loader.py
File metadata and controls
80 lines (60 loc) · 2.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""
Data loading functions for LLM ODE evolution.
This module contains data loading and variable description generation functions.
"""
import json
from typing import Tuple, Dict
import pandas as pd
def load_dataframes(
    problem_name: str,
    dim: int,
    *,
    data_root: str = "./data",
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Load the train, test_id, and test_ood datasets for a problem.

    Each split is read from
    ``{data_root}/{dim}D/{problem_name}/{problem_name}_{split}.csv``.

    Args:
        problem_name: Problem name (e.g., "ID_02").
        dim: Dimension (2 or 4 in the bundled benchmark set).
        data_root: Directory containing the ``{dim}D`` folders. The default
            ``"./data"`` is relative to the current working directory, so
            pass an explicit path when running from another directory.

    Returns:
        Tuple of (df_train, df_test_id, df_test_ood).

    Raises:
        FileNotFoundError: If any of the three CSV files does not exist
            (raised by ``pandas.read_csv``).
    """
    base_path = f"{data_root}/{dim}D/{problem_name}"
    # The three files differ only by their split suffix; read them in the
    # order the return tuple promises.
    df_train, df_test_id, df_test_ood = (
        pd.read_csv(f"{base_path}/{problem_name}_{split}.csv")
        for split in ("train", "test_id", "test_ood")
    )
    return df_train, df_test_id, df_test_ood
def create_describe(problem_name: str, *, data_root: str = "./data") -> str:
    """Generate the variable description for a problem from its JSON file.

    Reads ``{data_root}/json/{problem_name}.json`` and returns the value
    stored under its "description" key with surrounding whitespace removed.

    Args:
        problem_name: Problem name (e.g., "ID_02").
        data_root: Directory containing the ``json`` folder. The default
            ``"./data"`` is relative to the current working directory.

    Returns:
        Formatted variable description string. Empty when the JSON object
        has no "description" key or the key is null.

    Raises:
        ValueError: If the description file is not found. Malformed JSON
            also surfaces as ValueError, since ``json.JSONDecodeError`` is
            a subclass of it.
    """
    var_desc_path = f"{data_root}/json/{problem_name}.json"
    try:
        with open(var_desc_path, 'r', encoding='utf-8') as f:
            var_desc = json.load(f)
    except FileNotFoundError as err:
        # A missing file is reported only through the exception; chaining
        # keeps the underlying OS error visible in the traceback.
        raise ValueError(f"Could not find {var_desc_path}") from err

    description = var_desc.get("description")
    if description is None:
        # Covers both an absent key and an explicit JSON null.
        return ""
    return description.strip()
def create_df_dict(
    df_train: pd.DataFrame,
    df_test_id: pd.DataFrame,
    df_test_ood: pd.DataFrame
) -> Dict[str, pd.DataFrame]:
    """Bundle the three dataset splits into one mapping for scoring.

    Args:
        df_train: Training dataframe
        df_test_id: In-distribution test dataframe
        df_test_ood: Out-of-distribution test dataframe

    Returns:
        Dictionary keyed by 'train', 'test_id', and 'test_ood', in that
        order, holding the dataframes that were passed in (not copies)
    """
    split_names = ('train', 'test_id', 'test_ood')
    frames = (df_train, df_test_id, df_test_ood)
    return dict(zip(split_names, frames))