From 9a2a5aca2f09982b2aa1015d90992a6cf14563a9 Mon Sep 17 00:00:00 2001
From: TimoSci <me@timosci.net>
Date: Wed, 11 Dec 2019 16:45:05 +0100
Subject: [PATCH 01/11] add benchmarking using pseudorandom data frames

---
 test/run-benchmark.jl | 69 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 test/run-benchmark.jl

diff --git a/test/run-benchmark.jl b/test/run-benchmark.jl
new file mode 100644
index 0000000..5d24ff9
--- /dev/null
+++ b/test/run-benchmark.jl
@@ -0,0 +1,69 @@
+using SOM
+using Test
+#
+using DataFrames
+using RDatasets
+using Primes
+
+function generate_integer_hash_function(a,b,cardinality)
+    # cardinality must be prime number
+    # a,b: 1 ≤ a ≤ p−1  , 0 ≤ b ≤ p−1
+    @assert a < cardinality
+    @assert a > 0
+    @assert b < cardinality
+    @assert b >= 0
+    # @assert isprime(cardinality)
+    m = cardinality
+    x -> (x^a%m + b)%m  #TODO use modular exponentiation
+end
+
+function generate_vector_deterministic_pseudorandom(seed,hash_func,dimension)
+    [hash_func(x+seed) for x in 1:dimension]
+end
+
+function generate_dataset(number,dimension,max_value)
+    # dimension = 16
+    # max_value = 1013
+    params = Dict(:a => 13, :b => 17, :hash_seed => 23)
+    hash_func = generate_integer_hash_function(params[:a],params[:b],max_value)
+
+
+    seed = params[:hash_seed]
+    matrix = zeros((number,dimension))
+    for i in 1:number
+        vector = generate_vector_deterministic_pseudorandom(seed,hash_func,dimension)
+        seed = hash_func(seed+i)
+        matrix[i,:] = vector
+    end
+    convert(DataFrame,matrix)
+end
+
+
+include("testFuns.jl")
+
+
+
+function benchmarkTrain(train, topol; toroidal = false,
+                    normType = :zscore, kernel = gaussianKernel)
+
+    xdim = 10
+    ydim = 10
+
+    som = initSOM(train, xdim, ydim, norm = normType,
+                  topol = topol, toroidal = toroidal)
+    som = trainSOM(som, train, 1000, kernelFun = kernel)
+
+    ntrain = nrow(train)
+    npopul = sum(som.population)
+
+    return ntrain == npopul
+end
+
+
+train = generate_dataset(2000000,3,1013)
+
+# test hexagonal, rectangular and spherical training:
+#
+# @test benchmarkTrain(train, :hexagonal, toroidal = false)
+@time benchmarkTrain(train, :rectangular, toroidal = false)
+# @test benchmarkTrain(train, :spherical, toroidal = false)

From 2dab4598348c69fd04bb1d52a9471d10f4573fca Mon Sep 17 00:00:00 2001
From: TimoSci <me@timosci.net>
Date: Thu, 12 Dec 2019 01:11:19 +0100
Subject: [PATCH 02/11] add kd-tree nearest neighbor search for faster
 performance in some cases

---
 src/SOM.jl     |  1 +
 src/helpers.jl | 11 +++++++++++
 src/soms.jl    |  5 ++++-
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/SOM.jl b/src/SOM.jl
index 2b738b6..20029a3 100644
--- a/src/SOM.jl
+++ b/src/SOM.jl
@@ -15,6 +15,7 @@ using Distances
 using ProgressMeter
 using StatsBase
 using Distributions
+using NearestNeighbors
 #using TensorToolbox
 using LinearAlgebra
 # if VERSION < v"0.7.0-DEV.5183"
diff --git a/src/helpers.jl b/src/helpers.jl
index 8e365eb..35e6b33 100644
--- a/src/helpers.jl
+++ b/src/helpers.jl
@@ -1,3 +1,5 @@
+using NearestNeighbors
+
 import DataFrames: nrow, ncol
 
 """
@@ -90,6 +92,15 @@ function findWinner(cod, sampl)
     return winner
 end
 
+function findWinnerKD(kd_tree, sampl)
+  idxs, _ = knn(kd_tree, sampl, 1)
+  idxs[1]
+end
+
+function get_kd_tree(codes)
+    codes_transpose = permutedims(codes,(2,1))
+    KDTree(codes_transpose)
+end
 
 """
     normTrainData(x, normParams)
diff --git a/src/soms.jl b/src/soms.jl
index 24acd65..e7d9532 100644
--- a/src/soms.jl
+++ b/src/soms.jl
@@ -43,6 +43,8 @@ function doSom(x::Array{Float64}, codes::Array{Float64},
     numDat = nrow(x)
     numCodes = nrow(codes)
 
+    kd_tree = get_kd_tree(codes)
+
     # Training:
     # 1) select random sample
     # 2) find winner
@@ -52,7 +54,8 @@ function doSom(x::Array{Float64}, codes::Array{Float64},
     @time for s in 1:len
 
         sampl = rowSample(x)
-        winner = findWinner(codes, sampl)
+        # winner = findWinner(codes, sampl)
+        winner = findWinnerKD(kd_tree,sampl)
 
         for i in 1:numCodes
             # v = view(codes, i, :)

From 94d0bf6bb0c8e2cfd83fb4282c34abb46930572d Mon Sep 17 00:00:00 2001
From: TimoSci <me@timosci.net>
Date: Thu, 12 Dec 2019 12:20:07 +0100
Subject: [PATCH 03/11] kd-tree not well suited for training because it needs
 to be reconstructed at every step

---
 src/soms.jl | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/soms.jl b/src/soms.jl
index e7d9532..9a682f0 100644
--- a/src/soms.jl
+++ b/src/soms.jl
@@ -43,7 +43,6 @@ function doSom(x::Array{Float64}, codes::Array{Float64},
     numDat = nrow(x)
     numCodes = nrow(codes)
 
-    kd_tree = get_kd_tree(codes)
 
     # Training:
     # 1) select random sample
@@ -54,8 +53,7 @@ function doSom(x::Array{Float64}, codes::Array{Float64},
     @time for s in 1:len
 
         sampl = rowSample(x)
-        # winner = findWinner(codes, sampl)
-        winner = findWinnerKD(kd_tree,sampl)
+        winner = findWinner(codes, sampl)
 
         for i in 1:numCodes
             # v = view(codes, i, :)

From 50c4dd27ddc6050fb1e15df0a1ce5ca987ceede7 Mon Sep 17 00:00:00 2001
From: TimoSci <me@timosci.net>
Date: Thu, 12 Dec 2019 17:06:19 +0100
Subject: [PATCH 04/11] performance optimization for winner neuron finder
 function

---
 src/helpers.jl |  2 +-
 src/soms.jl    | 22 ++++++++++++++++++++--
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/src/helpers.jl b/src/helpers.jl
index 35e6b33..b46d7ba 100644
--- a/src/helpers.jl
+++ b/src/helpers.jl
@@ -97,7 +97,7 @@ function findWinnerKD(kd_tree, sampl)
   idxs[1]
 end
 
-function get_kd_tree(codes)
+function build_kd_tree(codes)
     codes_transpose = permutedims(codes,(2,1))
     KDTree(codes_transpose)
 end
diff --git a/src/soms.jl b/src/soms.jl
index 9a682f0..7776aa0 100644
--- a/src/soms.jl
+++ b/src/soms.jl
@@ -43,6 +43,7 @@ function doSom(x::Array{Float64}, codes::Array{Float64},
     numDat = nrow(x)
     numCodes = nrow(codes)
 
+    # avg_monitoring_distance = 0
 
     # Training:
     # 1) select random sample
@@ -55,6 +56,9 @@ function doSom(x::Array{Float64}, codes::Array{Float64},
         sampl = rowSample(x)
         winner = findWinner(codes, sampl)
 
+        # avg_monitoring_distance += sqeuclidean(codes[winner,:],sampl)
+        # println(avg_monitoring_distance/s)
+
         for i in 1:numCodes
             # v = view(codes, i, :)
             Δi = codes[i,:] .- sampl
@@ -77,16 +81,30 @@ end
 Return the index of the winner neuron for each training pattern
 in x (row-wise).
 """
-function visual(codes, x)
+
+function visual_brute_force(codes, x)
 
     vis = zeros(Int, nrow(x))
-    for i in 1:nrow(x)
+    @time for i in 1:nrow(x)
         vis[i] = findWinner(codes, [x[i, col] for col in 1:size(x, 2)])
     end
 
     return(vis)
 end
 
+function visual(codes, x)
+
+    kd_tree = build_kd_tree(codes)
+    vis = zeros(Int, nrow(x))
+
+    @time for i in 1:nrow(x)
+        vis[i] = findWinnerKD(kd_tree, [x[i, col] for col in 1:size(x, 2)])
+    end
+
+    return(vis)
+end
+
+
 
 """
     makePopulation(nCodes, vis)

From 46adf901481c17092d776114e33558254c5ea269 Mon Sep 17 00:00:00 2001
From: TimoSci <me@timosci.net>
Date: Thu, 12 Dec 2019 17:09:40 +0100
Subject: [PATCH 05/11] remove distance monitoring used for testing

---
 src/soms.jl | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/soms.jl b/src/soms.jl
index 7776aa0..f681a9f 100644
--- a/src/soms.jl
+++ b/src/soms.jl
@@ -43,8 +43,6 @@ function doSom(x::Array{Float64}, codes::Array{Float64},
     numDat = nrow(x)
     numCodes = nrow(codes)
 
-    # avg_monitoring_distance = 0
-
     # Training:
     # 1) select random sample
     # 2) find winner
@@ -56,9 +54,6 @@ function doSom(x::Array{Float64}, codes::Array{Float64},
         sampl = rowSample(x)
         winner = findWinner(codes, sampl)
 
-        # avg_monitoring_distance += sqeuclidean(codes[winner,:],sampl)
-        # println(avg_monitoring_distance/s)
-
         for i in 1:numCodes
             # v = view(codes, i, :)
             Δi = codes[i,:] .- sampl

From 08e012cb3225f78dfc7c19f096859b8fa10f9481 Mon Sep 17 00:00:00 2001
From: TimoSci <me@timosci.net>
Date: Thu, 12 Dec 2019 17:14:20 +0100
Subject: [PATCH 06/11] clean up

---
 src/helpers.jl | 6 ++----
 src/soms.jl    | 2 --
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/helpers.jl b/src/helpers.jl
index b46d7ba..5be1958 100644
--- a/src/helpers.jl
+++ b/src/helpers.jl
@@ -1,5 +1,3 @@
-using NearestNeighbors
-
 import DataFrames: nrow, ncol
 
 """
@@ -93,8 +91,8 @@ function findWinner(cod, sampl)
 end
 
 function findWinnerKD(kd_tree, sampl)
-  idxs, _ = knn(kd_tree, sampl, 1)
-  idxs[1]
+    idxs, _ = knn(kd_tree, sampl, 1)
+    idxs[1]
 end
 
 function build_kd_tree(codes)
diff --git a/src/soms.jl b/src/soms.jl
index f681a9f..b167d67 100644
--- a/src/soms.jl
+++ b/src/soms.jl
@@ -99,8 +99,6 @@ function visual(codes, x)
     return(vis)
 end
 
-
-
 """
     makePopulation(nCodes, vis)
 

From 3de2aee01d6117ff817b6711601f850da8c8a207 Mon Sep 17 00:00:00 2001
From: TimoSci <me@timosci.net>
Date: Thu, 12 Dec 2019 17:49:30 +0100
Subject: [PATCH 07/11] refactor using pipe

---
 src/helpers.jl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/helpers.jl b/src/helpers.jl
index 5be1958..310ba38 100644
--- a/src/helpers.jl
+++ b/src/helpers.jl
@@ -96,8 +96,7 @@ function findWinnerKD(kd_tree, sampl)
 end
 
 function build_kd_tree(codes)
-    codes_transpose = permutedims(codes,(2,1))
-    KDTree(codes_transpose)
+    permutedims(codes,(2,1)) |> KDTree
 end
 
 """

From 32a325023c4fbdcc7af511a7fd3cd9eaa629eddf Mon Sep 17 00:00:00 2001
From: TimoSci <me@timosci.net>
Date: Thu, 12 Dec 2019 17:53:21 +0100
Subject: [PATCH 08/11] bring style in line

---
 src/helpers.jl | 2 +-
 src/soms.jl    | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/helpers.jl b/src/helpers.jl
index 310ba38..b75ace6 100644
--- a/src/helpers.jl
+++ b/src/helpers.jl
@@ -95,7 +95,7 @@ function findWinnerKD(kd_tree, sampl)
     idxs[1]
 end
 
-function build_kd_tree(codes)
+function buildKDTree(codes)
     permutedims(codes,(2,1)) |> KDTree
 end
 
diff --git a/src/soms.jl b/src/soms.jl
index b167d67..b29703a 100644
--- a/src/soms.jl
+++ b/src/soms.jl
@@ -77,7 +77,7 @@ Return the index of the winner neuron for each training pattern
 in x (row-wise).
 """
 
-function visual_brute_force(codes, x)
+function visualBruteForce(codes, x)
 
     vis = zeros(Int, nrow(x))
     @time for i in 1:nrow(x)
@@ -89,7 +89,7 @@ end
 
 function visual(codes, x)
 
-    kd_tree = build_kd_tree(codes)
+    kd_tree = buildKDTree(codes)
     vis = zeros(Int, nrow(x))
 
     @time for i in 1:nrow(x)

From 0237dd565606cf174bcbb71c38f9dcf4fa1e0cc2 Mon Sep 17 00:00:00 2001
From: TimoSci <me@timosci.net>
Date: Thu, 12 Dec 2019 20:22:45 +0100
Subject: [PATCH 09/11] remove file used for testing changes

---
 test/run-benchmark.jl | 69 -------------------------------------------
 1 file changed, 69 deletions(-)
 delete mode 100644 test/run-benchmark.jl

diff --git a/test/run-benchmark.jl b/test/run-benchmark.jl
deleted file mode 100644
index 5d24ff9..0000000
--- a/test/run-benchmark.jl
+++ /dev/null
@@ -1,69 +0,0 @@
-using SOM
-using Test
-#
-using DataFrames
-using RDatasets
-using Primes
-
-function generate_integer_hash_function(a,b,cardinality)
-    # cardinality must be prime number
-    # a,b: 1 ≤ a ≤ p−1  , 0 ≤ b ≤ p−1
-    @assert a < cardinality
-    @assert a > 0
-    @assert b < cardinality
-    @assert b >= 0
-    # @assert isprime(cardinality)
-    m = cardinality
-    x -> (x^a%m + b)%m  #TODO use modular exponentiation
-end
-
-function generate_vector_deterministic_pseudorandom(seed,hash_func,dimension)
-    [hash_func(x+seed) for x in 1:dimension]
-end
-
-function generate_dataset(number,dimension,max_value)
-    # dimension = 16
-    # max_value = 1013
-    params = Dict(:a => 13, :b => 17, :hash_seed => 23)
-    hash_func = generate_integer_hash_function(params[:a],params[:b],max_value)
-
-
-    seed = params[:hash_seed]
-    matrix = zeros((number,dimension))
-    for i in 1:number
-        vector = generate_vector_deterministic_pseudorandom(seed,hash_func,dimension)
-        seed = hash_func(seed+i)
-        matrix[i,:] = vector
-    end
-    convert(DataFrame,matrix)
-end
-
-
-include("testFuns.jl")
-
-
-
-function benchmarkTrain(train, topol; toroidal = false,
-                    normType = :zscore, kernel = gaussianKernel)
-
-    xdim = 10
-    ydim = 10
-
-    som = initSOM(train, xdim, ydim, norm = normType,
-                  topol = topol, toroidal = toroidal)
-    som = trainSOM(som, train, 1000, kernelFun = kernel)
-
-    ntrain = nrow(train)
-    npopul = sum(som.population)
-
-    return ntrain == npopul
-end
-
-
-train = generate_dataset(2000000,3,1013)
-
-# test hexagonal, rectangular and spherical training:
-#
-# @test benchmarkTrain(train, :hexagonal, toroidal = false)
-@time benchmarkTrain(train, :rectangular, toroidal = false)
-# @test benchmarkTrain(train, :spherical, toroidal = false)

From 014c6006058dc4c6e96365c9c6e0543ba1f3d22e Mon Sep 17 00:00:00 2001
From: TimoSci <me@timosci.net>
Date: Thu, 12 Dec 2019 22:55:28 +0100
Subject: [PATCH 10/11] dataset generator for benchmarking

---
 test/benchmark.jl | 88 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)
 create mode 100644 test/benchmark.jl

diff --git a/test/benchmark.jl b/test/benchmark.jl
new file mode 100644
index 0000000..630d769
--- /dev/null
+++ b/test/benchmark.jl
@@ -0,0 +1,88 @@
+using Primes
+
+function generate_integer_hash_function(a,b,cardinality)
+    # Returns the closure x -> (x^a mod p + b) mod p with p = cardinality (must be prime)
+    # a,b: 1 ≤ a ≤ p−1  , 0 ≤ b ≤ p−1
+    @assert a < cardinality
+    @assert a > 0
+    @assert b < cardinality
+    @assert b >= 0
+    @assert isprime(cardinality)
+    m = cardinality
+    x -> (powermod(x,a,m) + b)%m  # powermod: plain x^a overflows Int64 (e.g. a=13, x ≥ 29)
+end
+
+"""
+    generate_vector_deterministic_pseudorandom(seed, hash_func, dimension)
+
+Generate a pseudorandom vector. The same `seed` and `hash_func` always give the same vector:
+its elements are `hash_func(x + seed)` for `x` in `1:dimension`. If the hash function is an
+integer hash from a universal family, the elements are spread uniformly over the hash range.
+# Arguments:
+- `seed`: An integer chosen by the user; keeping it fixed makes the result reproducible
+- `hash_func`: An integer hash function sampled from a universal family
+- `dimension`: number of elements in the vector
+"""
+function generate_vector_deterministic_pseudorandom(seed,hash_func,dimension)
+    [hash_func(x+seed) for x in 1:dimension]
+end
+
+
+"""
+    generate_dataset(number, dimension, max_value, y=(x->x), params=Dict(...))
+
+Generate a pseudorandom dataset (returned as a `DataFrame`) for benchmarking SOMs.
+Before `y` is applied, the elements are distributed uniformly between 0 and `max_value`.
+
+# Arguments:
+- `number`: Number of rows (vectors) in the dataset
+- `dimension`: Number of dimensions/elements in each vector
+- `max_value`: Upper bound (exclusive) of an element before `y` is applied
+- `y`: an optional function to map the uniformly distributed elements to a desired non-uniform distribution
+- `params`: Hash parameters `:a`, `:b`, `:prime` and start seed `:hash_seed`; fixed values give a deterministic result
+"""
+function generate_dataset(number, dimension, max_value, y=(x->x), params = Dict(:a => 13, :b => 17, :hash_seed => 23, :prime => 1013))
+    hash_func = generate_integer_hash_function(params[:a],params[:b],params[:prime])
+
+    seed = params[:hash_seed]
+    matrix = zeros((number,dimension))
+    for i in 1:number
+        vector = generate_vector_deterministic_pseudorandom(seed,hash_func,dimension)
+        # hash values lie in 0:prime-1, so this rescales them to [0, max_value)
+        vector = map((x->x*max_value/params[:prime]),vector)
+        vector = map(y,vector)
+        seed = hash_func(seed+i)
+        matrix[i,:] = vector
+    end
+    convert(DataFrame,matrix)
+end
+
+
+
+# Benchmarking functions
+# ======================
+
+# Initialise a SOM for `train` on a fixed 10×10 grid by forwarding to initSOM.
+function benchmark_init(train, topol; toroidal = false, normType = :zscore)
+    xdim = 10
+    ydim = 10
+    return initSOM(train, xdim, ydim;
+                   norm = normType, topol = topol, toroidal = toroidal)
+end
+
+# Run trainSOM on `som` with `train`, passing 100000 as its third argument.
+function benchmark_train(som, train, kernel = gaussianKernel)
+    steps = 100000  # presumably the number of training iterations; confirm in trainSOM
+    return trainSOM(som, train, steps; kernelFun = kernel)
+end
+
+
+# Example
+# =======
+
+# iris = dataset("datasets", "iris")
+# train = iris[:,1:4]
+
+# train = generate_dataset(1000,100,1)
+# som = benchmark_init(train, :rectangular, toroidal = false)
+# som = benchmark_train(som,train)

From 8ab7b4d0a162c508c10563ba8d1af8203b1b0fe7 Mon Sep 17 00:00:00 2001
From: TimoSci <me@timosci.net>
Date: Fri, 13 Dec 2019 00:17:26 +0100
Subject: [PATCH 11/11] refactor and dry up using multiple dispatch

---
 src/helpers.jl | 2 +-
 src/soms.jl    | 9 +--------
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/src/helpers.jl b/src/helpers.jl
index b75ace6..c954950 100644
--- a/src/helpers.jl
+++ b/src/helpers.jl
@@ -90,7 +90,7 @@ function findWinner(cod, sampl)
     return winner
 end
 
-function findWinnerKD(kd_tree, sampl)
+function findWinner(kd_tree::KDTree, sampl)
     idxs, _ = knn(kd_tree, sampl, 1)
     idxs[1]
 end
diff --git a/src/soms.jl b/src/soms.jl
index b29703a..14738b1 100644
--- a/src/soms.jl
+++ b/src/soms.jl
@@ -88,15 +88,8 @@ function visualBruteForce(codes, x)
 end
 
 function visual(codes, x)
-
     kd_tree = buildKDTree(codes)
-    vis = zeros(Int, nrow(x))
-
-    @time for i in 1:nrow(x)
-        vis[i] = findWinnerKD(kd_tree, [x[i, col] for col in 1:size(x, 2)])
-    end
-
-    return(vis)
+    visualBruteForce(kd_tree, x)
 end
 
 """