diff --git a/examples/train/livecodebench/lcb_dataset.py b/examples/train/livecodebench/lcb_dataset.py index 396d5a8ec8..4016ff7e40 100644 --- a/examples/train/livecodebench/lcb_dataset.py +++ b/examples/train/livecodebench/lcb_dataset.py @@ -11,7 +11,7 @@ import os from typing import Any, Dict, List, Optional -import pandas as pd +import datasets as hf_datasets """Utility functions for loading and processing datasets.""" @@ -145,17 +145,11 @@ def process_dataset(dataset_name: str, split: str, dataset_dir: str, local_dir: if max_rows: processed_data = processed_data[:max_rows] - # Save individual dataset files - df = pd.DataFrame(processed_data) + ds = hf_datasets.Dataset.from_list(processed_data) parquet_path = os.path.join(local_dir, f"{split}_{dataset_name}.parquet") - json_path = os.path.join(local_dir, f"{split}_{dataset_name}.json") - print(f"Writing {len(df)} rows to {split}_{dataset_name}.parquet") - df.to_parquet(parquet_path) - - if split == "test": - print(f"Writing {len(df)} rows to {split}_{dataset_name}.json") - df.to_json(json_path, orient="records") + print(f"Writing {len(ds)} rows to {split}_{dataset_name}.parquet") + ds.to_parquet(parquet_path) return processed_data @@ -194,7 +188,6 @@ def process_dataset(dataset_name: str, split: str, dataset_dir: str, local_dir: val_data = process_dataset(LIVECODEBENCH, "test", args.dataset_dir, local_dir, args.max_rows) # Save combined train dataset - all_train_df = pd.DataFrame(train_data) - all_train_df.to_parquet(os.path.join(local_dir, "deepcoder_train.parquet")) - all_train_df.to_json(os.path.join(local_dir, "deepcoder_train.json"), orient="records") - print(f"Writing {len(all_train_df)} rows to deepcoder_train.parquet and deepcoder_train.json") + all_train_ds = hf_datasets.Dataset.from_list(train_data) + all_train_ds.to_parquet(os.path.join(local_dir, "deepcoder_train.parquet")) + print(f"Writing {len(all_train_ds)} rows to deepcoder_train.parquet") diff --git a/examples/train/livecodebench/run_lcb.sh b/examples/train/livecodebench/run_lcb.sh index b2a06acf8a..f2de3d139a 100644 --- a/examples/train/livecodebench/run_lcb.sh +++ b/examples/train/livecodebench/run_lcb.sh @@ -5,8 +5,8 @@ set -x # bash examples/livecodebench/run_lcb.sh DATA_DIR="$HOME/data/lcb" -train_data="['${DATA_DIR}/deepcoder_train.json']" -val_data="['${DATA_DIR}/test_livecodebench.json']" +train_data="['${DATA_DIR}/deepcoder_train.parquet']" +val_data="['${DATA_DIR}/test_livecodebench.parquet']" # NOTE (sumanthrh): micro_train_batch_size and micro_forward_batch_size can be tuned uv run --isolated --frozen --extra fsdp -m skyrl.train.entrypoints.main_base \