Skip to content

Add dataset parameters #45

Open
Open
@Ghaandy

Description

@Ghaandy

Hi!

I would suggest adding dataset parameters to allow for handling categorical data.
See the suggested code below. Also note that the polars code has been updated to be compatible with newer versions of polars.

/// Creates a dataset from a dense feature matrix and its labels.
///
/// `data` holds one inner vector per row and is flattened row by row before
/// being passed to `LGBM_DatasetCreateFromMat`. `label` supplies one `f32`
/// per row; only the first `data.len()` labels are passed to LightGBM.
///
/// `dataset_parameters`, when present, must be a JSON object. Each entry is
/// rendered as `key=value` (using the value's `Display` output) and the
/// entries are joined with single spaces to form the parameter string.
///
/// # Errors
///
/// Propagates the error produced by `lgbm_call!` when either LightGBM call
/// fails.
///
/// # Panics
///
/// Panics if `dataset_parameters` is not a JSON object, if the rendered
/// parameter string contains a NUL byte, if `data` is empty, if the rows of
/// `data` differ in length, if `label` has fewer entries than `data` has
/// rows, or if either dimension does not fit in an `i32`.
pub fn from_mat(data: Vec<Vec<f64>>, label: Vec<f32>, dataset_parameters: Option<&Value>) -> Result<Self> {
    let dataset_params = match dataset_parameters {
        Some(params) => params
            .as_object()
            .expect("dataset_parameters must be a JSON object")
            .iter()
            .map(|(k, v)| format!("{}={}", k, v))
            .collect::<Vec<_>>()
            .join(" "),
        None => String::new(),
    };
    let dataset_params_cstring = CString::new(dataset_params)
        .expect("dataset parameters must not contain NUL bytes");

    let data_length = data.len();
    assert!(data_length > 0, "Cannot create a dataset from an empty matrix");
    let feature_length = data[0].len();

    // The C API reads `data_length * feature_length` values from the flat
    // buffer and `data_length` values from the label buffer without any
    // bounds checking, so every size is validated here before crossing the
    // FFI boundary.
    assert!(
        data.iter().all(|row| row.len() == feature_length),
        "Cannot create a dataset from rows of differing lengths"
    );
    assert!(
        label.len() >= data_length,
        "Cannot create a dataset with fewer labels ({}) than rows ({})",
        label.len(),
        data_length
    );
    // Both dimensions are passed as `i32`; reject values an `as` cast would
    // silently truncate.
    assert!(
        data_length <= i32::MAX as usize && feature_length <= i32::MAX as usize,
        "Dataset dimensions exceed the i32 range accepted by LightGBM"
    );

    let label_str = CString::new("label").unwrap();
    let reference = std::ptr::null_mut(); // no reference dataset is used
    let mut handle = std::ptr::null_mut();
    let flat_data = data.into_iter().flatten().collect::<Vec<_>>();

    lgbm_call!(lightgbm_sys::LGBM_DatasetCreateFromMat(
        flat_data.as_ptr() as *const c_void,
        lightgbm_sys::C_API_DTYPE_FLOAT64 as i32,
        data_length as i32,
        feature_length as i32,
        1_i32, // row-major flag, matching the row-by-row flattening above
        dataset_params_cstring.as_ptr() as *const c_char,
        reference,
        &mut handle
    ))?;

    // NOTE(review): if this call fails, `handle` is returned from neither
    // branch and is not freed here — confirm whether it should be released
    // on the error path.
    lgbm_call!(lightgbm_sys::LGBM_DatasetSetField(
        handle,
        label_str.as_ptr() as *const c_char,
        label.as_ptr() as *const c_void,
        data_length as i32,
        lightgbm_sys::C_API_DTYPE_FLOAT32 as i32
    ))?;

    Ok(Self::new(handle))
}
/// Creates a dataset from a polars `DataFrame`.
///
/// The column named `label_column` is cast to `Float32` and used as the
/// label; it is then removed from the frame. Every remaining column is cast
/// to `Float64` and becomes one feature. The collected rows, labels and
/// `dataset_parameters` are forwarded unchanged to `from_mat`.
///
/// # Errors
///
/// Returns whatever error `from_mat` returns.
///
/// # Panics
///
/// Panics if the label column is missing, if a column cannot be cast to the
/// required float type, or if the label or any feature column contains
/// nulls after casting.
pub fn from_dataframe(mut dataframe: DataFrame, label_column: String, dataset_parameters: Option<&Value>) -> Result<Self> {
    let label_col_name = label_column.as_str();

    // `n` still counts the label column at this point; it is only used as a
    // capacity hint below.
    let (m, n) = dataframe.shape();

    let label_series = dataframe
        .column(label_col_name)
        .expect("label column must exist in the dataframe")
        .cast(&Float32Type::get_dtype())
        .expect("label column must be castable to Float32");

    if label_series.null_count() != 0 {
        panic!("Cannot create a dataset with null values, encountered nulls when creating the label array")
    }

    dataframe.drop_in_place(label_col_name).unwrap();

    let label_values: Vec<f32> = label_series
        .unpack::<Float32Type>()
        .unwrap()
        .into_no_null_iter()
        .collect();

    // One (initially empty) row vector per dataframe row; the columns are
    // appended to these rows one at a time.
    let mut feature_values: Vec<Vec<f64>> = (0..m)
        .map(|_| Vec::with_capacity(n.saturating_sub(1)))
        .collect();

    for column in dataframe.get_columns().iter() {
        let series = column.cast(&Float64Type::get_dtype()).unwrap();
        // Check for nulls AFTER the cast, as is done for the label: this
        // catches nulls already present in the column as well as any the
        // cast itself produced, which `into_no_null_iter` must never see.
        if series.null_count() != 0 {
            panic!("Cannot create a dataset with null values, encountered nulls when creating the features array")
        }
        let ca = series.unpack::<Float64Type>().unwrap();

        for (row, val) in feature_values.iter_mut().zip(ca.into_no_null_iter()) {
            row.push(val);
        }
    }

    Self::from_mat(feature_values, label_values, dataset_parameters)
}
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions