Release 0.8 (#421)

bsc-wdc · Nov 11, 2022 · 8b4795e · 8b4795e
1 parent 7579891
commit 8b4795e
Show file tree

Hide file tree

Showing 7 changed files with 196 additions and 71 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,48 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.8.0] - 2022-11-11
+### Added
+- `save` and `load` methods for all models
+- Adding Multiclass CSVM
+- Adding TS-QR (Tall Skinny QR)
+- New in-place operations for ds-arrays:
+  `add` `iadd` `isub`
+- Matrix-Subtraction and Matrix-Addition
+- Concatenating two ds-arrays by columns
+- Save ds-array to `npy` file
+- Load ds-array from several `npy` files
+- Create ds-arrays from blocks
+- GridSearch for simulations & improvements
+- Inverse transformation in Scalers
+- Train-Test-Split functionality
+- Add KNN Classifier
+- Better SVD columns pairing
+- GPU Support using CUDA/CuPy for algorithms: Kmeans, KNN, SVD, PCA, Matmul, Addition, Subtraction, QR, Kronecker
+
+### Changed
+- New documentation for GPU, RandomForest, Scalers
+
+### Fixed
+- Fix bug in Scalers & tests
+
+## [0.7.0] - 2021-11-10
+### Added
+- New decomposition algorithm QR
+- New preprocessing algorithm MinMaxScaler
+- Jenkinsfile for CI automated tests
+- ds-array matrix multiplication (matmul)
+- New function for ds-array creation
+- Add `@constraint(computing_units="${ComputingUnits}")` to all tasks
+- More I/O functions for reading and writing ds-arrays
+- More tests
+
+### Changed
+- Move RandomForest from 'classification' to 'trees'
+
+### Fixed
+- Some bugs in the ds-array
+
 ## [0.6.0] - 2020-10-09
 ### Added
 - User guide and glossary

diff --git a/NOTICE b/NOTICE
@@ -1,4 +1,4 @@
-Copyright 2019-2020 Barcelona Supercomputing Center (BSC)
+Copyright 2019-2022 Barcelona Supercomputing Center (BSC)
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-0.7.0
+0.8.0
diff --git a/dislib/math/base.py b/dislib/math/base.py
@@ -1,4 +1,5 @@
 import itertools
+import math
 
 import numpy as np
 import dislib
@@ -173,6 +174,7 @@ def svd(a, compute_uv=True, sort=True, copy=True, eps=1e-9):
         v = identity(x.shape[1], (x._reg_shape[1], x._reg_shape[1]))
 
     checks = [True]
+    n_cols = x._n_blocks[1]
 
     if dislib.__gpu_available__:
         _compute_rotation_func = _compute_rotation_and_rotate_gpu
@@ -182,11 +184,7 @@ def svd(a, compute_uv=True, sort=True, copy=True, eps=1e-9):
     while not _check_convergence_svd(checks):
         checks = []
 
-        pairings = itertools.combinations(
-            range(x._n_blocks[1]), 2
-        )
-
-        for i, j in pairings:
+        for i, j in svd_col_combs(n_cols):
             coli_x = x._get_col_block(i)
             colj_x = x._get_col_block(j)
 
@@ -393,7 +391,7 @@ def _compute_u_block_sorted_gpu(a_block, index, bsize, sorting, u_block):
 @task(block={Type: COLLECTION_IN, Depth: 1},
       out_blocks={Type: COLLECTION_OUT, Depth: 1})
 def _merge_svd_block(block, index, hbsize, vbsize, sorting, out_blocks):
-    block = list(filter(lambda a: a != [], block))  # remove empty lists
+    block = list(filter(lambda a: np.any(a), block))  # remove empty lists
     col = np.vstack(block).T
     local_sorting = []
 
@@ -577,3 +575,55 @@ def _kron_gpu(block1, block2, out_blocks):
     for i in range(block1_gpu.shape[0]):
         for j in range(block1_gpu.shape[1]):
             out_blocks[i][j] = cp.asnumpy(block1_gpu[i, j] * block2_gpu)
+
+
+def _combinations(a, b):
+    """Build rounds of pairs that cover every pair of items in ``a + b``.
+
+    Each round (a "coverage") is a list of 2-tuples. The first ``len(a)``
+    rounds pair ``a[k]`` with ``b[(k + shift) % len(a)]``, so every item of
+    ``a`` meets every item of ``b`` exactly once. The remaining rounds pair
+    items inside ``a`` and inside ``b``: both groups are split in halves,
+    the same scheme is applied recursively to each group, and the i-th round
+    of ``a`` is concatenated with the i-th round of ``b``.
+
+    When all items are distinct, no item appears twice within a round
+    (presumably so that the rotations of one round can run concurrently).
+
+    Parameters
+    ----------
+    a : sequence
+        First group of items.
+    b : sequence
+        Second group of items; must hold at least ``len(a)`` items.
+
+    Returns
+    -------
+    coverages : list of list of tuple
+        The rounds, cross-group rounds first, within-group rounds last.
+        An empty list if ``a`` is empty.
+
+    Notes
+    -----
+    Within-group pairs are only covered completely when the halves have the
+    same size at every recursion level, i.e. when ``len(a)`` is a power of
+    two. ``svd_col_combs`` always satisfies this by padding the column
+    count.
+    """
+    n = len(a)
+
+    if n == 0:
+        # Nothing to pair. Without this guard the halving below would keep
+        # splitting empty lists and recurse forever.
+        return []
+
+    # Cross-group rounds: round `shift` pairs a[k] with b[(k + shift) % n].
+    coverages = [
+        [(a[k], b[(k + shift) % n]) for k in range(n)]
+        for shift in range(n)
+    ]
+
+    if n == 1:
+        # Single items have no within-group pairs left to cover.
+        return coverages
+
+    if n == 2:
+        # Each group holds exactly one within-group pair.
+        coverages.append([(a[0], a[1]), (b[0], b[1])])
+        return coverages
+
+    # Within-group rounds: recurse on the halves of each group and run the
+    # i-th round of `a` together with the i-th round of `b`.
+    m = n // 2
+    rounds_a = _combinations(a[:m], a[m:])
+    rounds_b = _combinations(b[:m], b[m:])
+    coverages.extend(
+        round_a + round_b for round_a, round_b in zip(rounds_a, rounds_b)
+    )
+
+    return coverages
+
+
+def svd_col_combs(n_cols: int):
+    """Return the order in which pairs of column blocks are visited by SVD.
+
+    The column indices are padded up to the next power of two, split in two
+    halves and handed to ``_combinations``, which groups the pairs in
+    rounds. The rounds are then flattened, in order, and every pair that
+    involves a padding index (``>= n_cols``) is dropped.
+
+    Parameters
+    ----------
+    n_cols : int
+        Number of column blocks.
+
+    Returns
+    -------
+    pairings : list of tuple
+        Pairs ``(i, j)`` with ``0 <= i < j < n_cols``. Empty when
+        ``n_cols <= 1``.
+    """
+    if n_cols <= 1:
+        return list()
+
+    # Next power of two >= n_cols, computed with integer arithmetic:
+    # math.ceil(math.log(n_cols, 2)) relies on a floating-point quotient
+    # that may overshoot on exact powers of two and double the padding.
+    padded = 1 << int(n_cols - 1).bit_length()
+    half = padded // 2
+    cols = list(range(padded))
+
+    rounds = _combinations(cols[:half], cols[half:])
+
+    # Flatten round by round, keeping only pairs of real columns. The bounds
+    # check selects the same pairs as membership in
+    # itertools.combinations(range(n_cols), 2), without building that list.
+    return [
+        (i, j)
+        for single_round in rounds
+        for (i, j) in single_round
+        if i < j < n_cols
+    ]
diff --git a/dislib/trees/forest.py b/dislib/trees/forest.py
@@ -483,7 +483,11 @@ def save_model(self, filepath, overwrite=True, save_format="json"):
         >>> assert np.allclose(model_pred.collect(),
         >>> loaded_model_pred.collect())
         """
-        super().save_model(filepath, overwrite=overwrite, save_format=save_format)
+        super().save_model(
+            filepath,
+            overwrite=overwrite,
+            save_format=save_format
+        )
 
 
 class RandomForestRegressor(BaseRandomForest):
@@ -688,7 +692,11 @@ def save_model(self, filepath, overwrite=True, save_format="json"):
         >>> assert np.allclose(model_pred.collect(),
         >>> loaded_model_pred.collect())
         """
-        super().save_model(filepath, overwrite=overwrite, save_format=save_format)
+        super().save_model(
+            filepath,
+            overwrite=overwrite,
+            save_format=save_format
+        )
 
 
 def _base_soft_vote(classes, *predictions):

diff --git a/tests/test_array.py b/tests/test_array.py
@@ -1318,67 +1318,6 @@ def test_kron_regular(self, a_shape, a_bsize, b_shape, b_bsize):
         self.assertTrue(_validate_array(computed))
         self.assertTrue(_equal_arrays(computed.collect(), expected))
 
-    @parameterized.expand([(ds.array([[1, 0, 0, 0],
-                                      [0, 0, 0, 2],
-                                      [0, 3, 0, 0],
-                                      [2, 0, 0, 0]], (2, 2)),),
-                           (ds.random_array((17, 5), (1, 1)),),
-                           (ds.random_array((9, 7), (9, 6)),),
-                           (ds.random_array((10, 10), (2, 2))[1:, 1:],)])
-    def test_svd(self, x):
-        """ Tests SVD """
-        x_np = x.collect()
-        u, s, v = ds.svd(x)
-        u = u.collect()
-        s = np.diag(s.collect())
-        v = v.collect()
-
-        self.assertTrue(np.allclose(x_np, u @ s @ v.T))
-        self.assertTrue(
-            np.allclose(np.linalg.norm(u, axis=0), np.ones(u.shape[1])))
-        self.assertTrue(
-            np.allclose(np.linalg.norm(v, axis=0), np.ones(v.shape[1])))
-
-        u, s, v = ds.svd(x, sort=False)
-        u = u.collect()
-        s = np.diag(s.collect())
-        v = v.collect()
-
-        self.assertTrue(np.allclose(x_np, u @ s @ v.T))
-        self.assertTrue(
-            np.allclose(np.linalg.norm(u, axis=0), np.ones(u.shape[1])))
-        self.assertTrue(
-            np.allclose(np.linalg.norm(v, axis=0), np.ones(v.shape[1])))
-
-        s = ds.svd(x, compute_uv=False, sort=False)
-        s = np.diag(s.collect())
-
-        # use U and V from previous decomposition
-        self.assertTrue(np.allclose(x_np, u @ s @ v.T))
-        self.assertTrue(
-            np.allclose(np.linalg.norm(u, axis=0), np.ones(u.shape[1])))
-        self.assertTrue(
-            np.allclose(np.linalg.norm(v, axis=0), np.ones(v.shape[1])))
-
-        u, s, v = ds.svd(x, copy=False)
-        u = u.collect()
-        s = np.diag(s.collect())
-        v = v.collect()
-
-        self.assertTrue(np.allclose(x_np, u @ s @ v.T))
-        self.assertTrue(
-            np.allclose(np.linalg.norm(u, axis=0), np.ones(u.shape[1])))
-        self.assertTrue(
-            np.allclose(np.linalg.norm(v, axis=0), np.ones(v.shape[1])))
-
-    def test_svd_errors(self):
-        """ Tests SVD raises """
-        with self.assertRaises(ValueError):
-            ds.svd(ds.random_array((3, 9), (2, 2)))
-
-        with self.assertRaises(ValueError):
-            ds.svd(ds.random_array((3, 3), (3, 3)))
-
 
 def main():
     unittest.main()

diff --git a/tests/test_svd.py b/tests/test_svd.py
@@ -0,0 +1,86 @@
+import unittest
+
+import numpy as np
+import itertools
+
+import dislib as ds
+from parameterized import parameterized
+from dislib.math.base import svd_col_combs
+
+
+class SVDTest(unittest.TestCase):
+    """ Tests for ``ds.svd`` and the ``svd_col_combs`` pairing helper """
+
+    def _check_decomposition(self, x_np, u, s, v):
+        """ Asserts that ``u @ diag(s) @ v.T`` rebuilds ``x_np`` and that
+        the columns of ``u`` and ``v`` have unit norm.
+
+        All arguments are already-collected numpy arrays; ``s`` holds the
+        singular values as returned by ``collect()``.
+        """
+        s = np.diag(s)
+
+        self.assertTrue(np.allclose(x_np, u @ s @ v.T))
+        self.assertTrue(
+            np.allclose(np.linalg.norm(u, axis=0), np.ones(u.shape[1])))
+        self.assertTrue(
+            np.allclose(np.linalg.norm(v, axis=0), np.ones(v.shape[1])))
+
+    def test_pairing(self):
+        """ Tests that svd_col_combs visits every column pair once """
+        for n_cols in range(10):
+            # subTest reports which n_cols failed instead of stopping at
+            # the first one.
+            with self.subTest(n_cols=n_cols):
+                all_combs = list(itertools.combinations(range(n_cols), 2))
+                cols_combs = svd_col_combs(n_cols)
+
+                self.assertEqual(set(all_combs), set(cols_combs))
+                # Same size as the set of all pairs => no pair is repeated.
+                self.assertEqual(len(all_combs), len(cols_combs))
+
+    @parameterized.expand([(ds.array([[1, 0, 0, 0],
+                                      [0, 0, 0, 2],
+                                      [0, 3, 0, 0],
+                                      [2, 0, 0, 0]], (2, 2)),),
+                           (ds.random_array((17, 5), (1, 1)),),
+                           (ds.random_array((9, 7), (9, 6)),),
+                           (ds.random_array((10, 10), (2, 2))[1:, 1:],)])
+    def test_svd(self, x):
+        """ Tests SVD """
+        # Collected before any call, since the last one runs with
+        # copy=False.
+        x_np = x.collect()
+
+        u, s, v = ds.svd(x)
+        self._check_decomposition(
+            x_np, u.collect(), s.collect(), v.collect())
+
+        u, s, v = ds.svd(x, sort=False)
+        u = u.collect()
+        s = s.collect()
+        v = v.collect()
+        self._check_decomposition(x_np, u, s, v)
+
+        s = ds.svd(x, compute_uv=False, sort=False)
+
+        # use U and V from previous decomposition
+        self._check_decomposition(x_np, u, s.collect(), v)
+
+        u, s, v = ds.svd(x, copy=False)
+        self._check_decomposition(
+            x_np, u.collect(), s.collect(), v.collect())
+
+    def test_svd_errors(self):
+        """ Tests SVD raises """
+        with self.assertRaises(ValueError):
+            ds.svd(ds.random_array((3, 9), (2, 2)))
+
+        with self.assertRaises(ValueError):
+            ds.svd(ds.random_array((3, 3), (3, 3)))
+
+
+def main():
+    """ Runs the tests of this module through ``unittest.main()`` """
+    unittest.main()
+
+
+if __name__ == '__main__':
+    main()