From 854cb543f76a6c217ec00f79898d5adbe643f3bf Mon Sep 17 00:00:00 2001 From: David Roundy Date: Tue, 9 Jul 2024 16:09:03 -0700 Subject: [PATCH 1/2] use rayon to hopefully speed up linfa-logistic --- algorithms/linfa-logistic/Cargo.toml | 2 +- algorithms/linfa-logistic/src/lib.rs | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/algorithms/linfa-logistic/Cargo.toml b/algorithms/linfa-logistic/Cargo.toml index faf81bab5..d44748119 100644 --- a/algorithms/linfa-logistic/Cargo.toml +++ b/algorithms/linfa-logistic/Cargo.toml @@ -23,7 +23,7 @@ optional = true version = "1.0" [dependencies] -ndarray = { version = "0.15", features = ["approx"] } +ndarray = { version = "0.15", features = ["rayon", "approx"] } ndarray-stats = "0.5.0" num-traits = "0.2" argmin = { version = "0.9.0", default-features = false } diff --git a/algorithms/linfa-logistic/src/lib.rs b/algorithms/linfa-logistic/src/lib.rs index 99addeff8..eea2c2d5e 100644 --- a/algorithms/linfa-logistic/src/lib.rs +++ b/algorithms/linfa-logistic/src/lib.rs @@ -454,9 +454,9 @@ fn log_sum_exp>( /// Computes `exp(n - max) / sum(exp(n- max))`, which is a numerically stable version of softmax fn softmax_inplace>(v: &mut ArrayBase) { let max = v.iter().copied().reduce(F::max).unwrap(); - v.mapv_inplace(|n| (n - max).exp()); + v.par_mapv_inplace(|n| (n - max).exp()); let sum = v.sum(); - v.mapv_inplace(|n| n / sum); + v.par_mapv_inplace(|n| n / sum); } /// Computes the logistic loss assuming the training labels $y \in {-1, 1}$ @@ -479,7 +479,7 @@ fn logistic_loss>( let yz = x.dot(&params.into_shape((params.len(), 1)).unwrap()) + intercept; let len = yz.len(); let mut yz = yz.into_shape(len).unwrap() * y; - yz.mapv_inplace(log_logistic); + yz.par_mapv_inplace(log_logistic); -yz.sum() + F::cast(0.5) * alpha * params.dot(&params) } @@ -495,8 +495,7 @@ fn logistic_grad>( let yz = x.dot(&params.into_shape((params.len(), 1)).unwrap()) + intercept; let len = yz.len(); let mut yz = yz.into_shape(len).unwrap() * y; - 
yz.mapv_inplace(logistic); - yz -= F::one(); + yz.par_mapv_inplace(|v| logistic(v) - F::one()); yz *= y; if w.len() == n_features + 1 { let mut grad = Array::zeros(w.len()); @@ -624,7 +623,7 @@ impl FittedLogisticRegression { /// model was fitted. pub fn predict_probabilities>(&self, x: &ArrayBase) -> Array1 { let mut probs = x.dot(&self.params) + self.intercept; - probs.mapv_inplace(logistic); + probs.par_mapv_inplace(logistic); probs } } From 5f0fa9e1917132037005b1d42d3e944a384a81b2 Mon Sep 17 00:00:00 2001 From: David Roundy Date: Wed, 10 Jul 2024 13:01:41 -0700 Subject: [PATCH 2/2] add benchmark --- algorithms/linfa-logistic/Cargo.toml | 6 ++ .../linfa-logistic/benches/logistic_bench.rs | 59 +++++++++++++++++++ 2 files changed, 65 insertions(+) create mode 100644 algorithms/linfa-logistic/benches/logistic_bench.rs diff --git a/algorithms/linfa-logistic/Cargo.toml b/algorithms/linfa-logistic/Cargo.toml index d44748119..cb8beca0c 100644 --- a/algorithms/linfa-logistic/Cargo.toml +++ b/algorithms/linfa-logistic/Cargo.toml @@ -39,3 +39,9 @@ linfa-datasets = { version = "0.7.0", path = "../../datasets", features = [ "winequality", ] } rmp-serde = "1" +criterion = "0.4.0" +rand = "0.8.5" + +[[bench]] +name = "logistic_bench" +harness = false diff --git a/algorithms/linfa-logistic/benches/logistic_bench.rs b/algorithms/linfa-logistic/benches/logistic_bench.rs new file mode 100644 index 000000000..348a047b6 --- /dev/null +++ b/algorithms/linfa-logistic/benches/logistic_bench.rs @@ -0,0 +1,59 @@ +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; +use linfa::prelude::*; +use ndarray::{Array1, Ix1}; +use rand::{Rng, SeedableRng}; + +const MAX_ITERATIONS: u64 = 2; + +fn train_model( + dataset: &Dataset, +) -> linfa_logistic::FittedLogisticRegression { + linfa_logistic::LogisticRegression::default() + .max_iterations(MAX_ITERATIONS) + .fit(dataset) + .unwrap() +} + +fn generate_categorical_data(nfeatures: usize, nsamples: usize) -> Dataset { + let 
mut rng = rand::rngs::SmallRng::seed_from_u64(42); + let mut feature_rows: Vec> = Vec::new(); + let mut label_rows: Vec = Vec::new(); + for _ in 0..nsamples { + let mut features = Vec::new(); + for _ in 0..nfeatures { + let value = if rng.gen() { 1.0 } else { 0.0 }; + features.push(value); + } + feature_rows.push(features); + label_rows.push(rng.gen()); + } + linfa::Dataset::new( + ndarray::Array2::from_shape_vec( + (nsamples, nfeatures), + feature_rows.into_iter().flatten().collect(), + ) + .unwrap(), + Array1::from_shape_vec(label_rows.len(), label_rows).unwrap(), + ) +} + +fn bench(c: &mut Criterion) { + let mut group = c.benchmark_group("Logistic regression"); + group.measurement_time(std::time::Duration::from_secs(10)).sample_size(10); + for nfeatures in [1_000] { + for nsamples in [1_000, 10_000, 100_000, 200_000, 500_000, 1_000_000] { + let input = generate_categorical_data(nfeatures, nsamples); + group.bench_with_input( + BenchmarkId::new("train_model", format!("{:e}x{:e}", nfeatures as f64, nsamples as f64)), + &input, + |b, dataset| { + b.iter(|| train_model(dataset)); + }, + ); + } + } + group.finish(); +} + +criterion_group!(benches, bench); +criterion_main!(benches);