From 9a2a5aca2f09982b2aa1015d90992a6cf14563a9 Mon Sep 17 00:00:00 2001 From: TimoSci Date: Wed, 11 Dec 2019 16:45:05 +0100 Subject: [PATCH 01/11] add benchmarking using pseudorandom data frames --- test/run-benchmark.jl | 69 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 test/run-benchmark.jl diff --git a/test/run-benchmark.jl b/test/run-benchmark.jl new file mode 100644 index 0000000..5d24ff9 --- /dev/null +++ b/test/run-benchmark.jl @@ -0,0 +1,69 @@ +using SOM +using Test +# +using DataFrames +using RDatasets +using Primes + +function generate_integer_hash_function(a,b,cardinality) + # cardinality must be prime number + # a,b: 1 ≤ a ≤ p−1 , 0 ≤ b ≤ p−1 + @assert a < cardinality + @assert a > 0 + @assert b < cardinality + @assert b >= 0 + # @assert isprime(cardinality) + m = cardinality + x -> (x^a%m + b)%m #TODO use modular exponentiation +end + +function generate_vector_deterministic_pseudorandom(seed,hash_func,dimension) + [hash_func(x+seed) for x in 1:dimension] +end + +function generate_dataset(number,dimension,max_value) + # dimension = 16 + # max_value = 1013 + params = Dict(:a => 13, :b => 17, :hash_seed => 23) + hash_func = generate_integer_hash_function(params[:a],params[:b],max_value) + + + seed = params[:hash_seed] + matrix = zeros((number,dimension)) + for i in 1:number + vector = generate_vector_deterministic_pseudorandom(seed,hash_func,dimension) + seed = hash_func(seed+i) + matrix[i,:] = vector + end + convert(DataFrame,matrix) +end + + +include("testFuns.jl") + + + +function benchmarkTrain(train, topol; toroidal = false, + normType = :zscore, kernel = gaussianKernel) + + xdim = 10 + ydim = 10 + + som = initSOM(train, xdim, ydim, norm = normType, + topol = topol, toroidal = toroidal) + som = trainSOM(som, train, 1000, kernelFun = kernel) + + ntrain = nrow(train) + npopul = sum(som.population) + + return ntrain == npopul +end + + +train = generate_dataset(2000000,3,1013) + +# test hexagonal, rectangular and spherical training: +# +# @test benchmarkTrain(train, :hexagonal, toroidal = false) +@time benchmarkTrain(train, :rectangular, toroidal = false) +# @test benchmarkTrain(train, :spherical, toroidal = false) From 2dab4598348c69fd04bb1d52a9471d10f4573fca Mon Sep 17 00:00:00 2001 From: TimoSci Date: Thu, 12 Dec 2019 01:11:19 +0100 Subject: [PATCH 02/11] add kd-tree nearest neighbor search for faster performance in some cases --- src/SOM.jl | 1 + src/helpers.jl | 11 +++++++++++ src/soms.jl | 5 ++++- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/SOM.jl b/src/SOM.jl index 2b738b6..20029a3 100644 --- a/src/SOM.jl +++ b/src/SOM.jl @@ -15,6 +15,7 @@ using Distances using ProgressMeter using StatsBase using Distributions +using NearestNeighbors #using TensorToolbox using LinearAlgebra # if VERSION < v"0.7.0-DEV.5183" diff --git a/src/helpers.jl b/src/helpers.jl index 8e365eb..35e6b33 100644 --- a/src/helpers.jl +++ b/src/helpers.jl @@ -1,3 +1,5 @@ +using NearestNeighbors + import DataFrames: nrow, ncol """ @@ -90,6 +92,15 @@ function findWinner(cod, sampl) return winner end +function findWinnerKD(kd_tree, sampl) + idxs, _ = knn(kd_tree, sampl, 1) + idxs[1] +end + +function get_kd_tree(codes) + codes_transpose = permutedims(codes,(2,1)) + KDTree(codes_transpose) +end """ normTrainData(x, normParams) diff --git a/src/soms.jl b/src/soms.jl index 24acd65..e7d9532 100644 --- a/src/soms.jl +++ b/src/soms.jl @@ -43,6 +43,8 @@ function doSom(x::Array{Float64}, codes::Array{Float64}, numDat = nrow(x) numCodes = nrow(codes) + kd_tree = get_kd_tree(codes) + # Training: # 1) select random sample # 2) find winner @@ -52,7 +54,8 @@ function doSom(x::Array{Float64}, codes::Array{Float64}, @time for s in 1:len sampl = rowSample(x) - winner = findWinner(codes, sampl) + # winner = findWinner(codes, sampl) + winner = findWinnerKD(kd_tree,sampl) for i in 1:numCodes # v = view(codes, i, :) From 94d0bf6bb0c8e2cfd83fb4282c34abb46930572d Mon Sep 17 00:00:00 2001 From: TimoSci Date: Thu, 12 Dec 2019 12:20:07 +0100 Subject: [PATCH 03/11] kd-tree not well suited for training because it needs to be reconstructed at every step --- src/soms.jl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/soms.jl b/src/soms.jl index e7d9532..9a682f0 100644 --- a/src/soms.jl +++ b/src/soms.jl @@ -43,7 +43,6 @@ function doSom(x::Array{Float64}, codes::Array{Float64}, numDat = nrow(x) numCodes = nrow(codes) - kd_tree = get_kd_tree(codes) # Training: # 1) select random sample @@ -54,8 +53,7 @@ function doSom(x::Array{Float64}, codes::Array{Float64}, @time for s in 1:len sampl = rowSample(x) - # winner = findWinner(codes, sampl) - winner = findWinnerKD(kd_tree,sampl) + winner = findWinner(codes, sampl) for i in 1:numCodes # v = view(codes, i, :) From 50c4dd27ddc6050fb1e15df0a1ce5ca987ceede7 Mon Sep 17 00:00:00 2001 From: TimoSci Date: Thu, 12 Dec 2019 17:06:19 +0100 Subject: [PATCH 04/11] performance optimization for winner neuron finder function --- src/helpers.jl | 2 +- src/soms.jl | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/helpers.jl b/src/helpers.jl index 35e6b33..b46d7ba 100644 --- a/src/helpers.jl +++ b/src/helpers.jl @@ -97,7 +97,7 @@ function findWinnerKD(kd_tree, sampl) idxs[1] end -function get_kd_tree(codes) +function build_kd_tree(codes) codes_transpose = permutedims(codes,(2,1)) KDTree(codes_transpose) end diff --git a/src/soms.jl b/src/soms.jl index 9a682f0..7776aa0 100644 --- a/src/soms.jl +++ b/src/soms.jl @@ -43,6 +43,7 @@ function doSom(x::Array{Float64}, codes::Array{Float64}, numDat = nrow(x) numCodes = nrow(codes) + # avg_monitoring_distance = 0 # Training: # 1) select random sample @@ -55,6 +56,9 @@ function doSom(x::Array{Float64}, codes::Array{Float64}, sampl = rowSample(x) winner = findWinner(codes, sampl) + # avg_monitoring_distance += sqeuclidean(codes[winner,:],sampl) + # println(avg_monitoring_distance/s) + for i in 1:numCodes # v = view(codes, i, :) Δi = codes[i,:] .- sampl @@ -77,16 +81,30 @@ end Return the index of the winner neuron for each training pattern in x (row-wise). """ -function visual(codes, x) + +function visual_brute_force(codes, x) vis = zeros(Int, nrow(x)) - for i in 1:nrow(x) + @time for i in 1:nrow(x) vis[i] = findWinner(codes, [x[i, col] for col in 1:size(x, 2)]) end return(vis) end +function visual(codes, x) + + kd_tree = build_kd_tree(codes) + vis = zeros(Int, nrow(x)) + + @time for i in 1:nrow(x) + vis[i] = findWinnerKD(kd_tree, [x[i, col] for col in 1:size(x, 2)]) + end + + return(vis) +end + + """ makePopulation(nCodes, vis) From 46adf901481c17092d776114e33558254c5ea269 Mon Sep 17 00:00:00 2001 From: TimoSci Date: Thu, 12 Dec 2019 17:09:40 +0100 Subject: [PATCH 05/11] remove distance monitoring used for testing --- src/soms.jl | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/soms.jl b/src/soms.jl index 7776aa0..f681a9f 100644 --- a/src/soms.jl +++ b/src/soms.jl @@ -43,8 +43,6 @@ function doSom(x::Array{Float64}, codes::Array{Float64}, numDat = nrow(x) numCodes = nrow(codes) - # avg_monitoring_distance = 0 - # Training: # 1) select random sample # 2) find winner @@ -56,9 +54,6 @@ function doSom(x::Array{Float64}, codes::Array{Float64}, sampl = rowSample(x) winner = findWinner(codes, sampl) - # avg_monitoring_distance += sqeuclidean(codes[winner,:],sampl) - # println(avg_monitoring_distance/s) - for i in 1:numCodes # v = view(codes, i, :) Δi = codes[i,:] .- sampl From 08e012cb3225f78dfc7c19f096859b8fa10f9481 Mon Sep 17 00:00:00 2001 From: TimoSci Date: Thu, 12 Dec 2019 17:14:20 +0100 Subject: [PATCH 06/11] clean up --- src/helpers.jl | 6 ++---- src/soms.jl | 2 -- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/helpers.jl b/src/helpers.jl index b46d7ba..5be1958 100644 --- a/src/helpers.jl +++ b/src/helpers.jl @@ -1,5 +1,3 @@ -using NearestNeighbors - import DataFrames: nrow, ncol """ @@ -93,8 +91,8 @@ function findWinner(cod, sampl) end function findWinnerKD(kd_tree, sampl) - idxs, _ = knn(kd_tree, sampl, 1) - idxs[1] + idxs, _ = knn(kd_tree, sampl, 1) + idxs[1] end function build_kd_tree(codes) diff --git a/src/soms.jl b/src/soms.jl index f681a9f..b167d67 100644 --- a/src/soms.jl +++ b/src/soms.jl @@ -99,8 +99,6 @@ function visual(codes, x) return(vis) end - - """ makePopulation(nCodes, vis) From 3de2aee01d6117ff817b6711601f850da8c8a207 Mon Sep 17 00:00:00 2001 From: TimoSci Date: Thu, 12 Dec 2019 17:49:30 +0100 Subject: [PATCH 07/11] refactor using pipe --- src/helpers.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/helpers.jl b/src/helpers.jl index 5be1958..310ba38 100644 --- a/src/helpers.jl +++ b/src/helpers.jl @@ -96,8 +96,7 @@ function findWinnerKD(kd_tree, sampl) end function build_kd_tree(codes) - codes_transpose = permutedims(codes,(2,1)) - KDTree(codes_transpose) + permutedims(codes,(2,1)) |> KDTree end """ From 32a325023c4fbdcc7af511a7fd3cd9eaa629eddf Mon Sep 17 00:00:00 2001 From: TimoSci Date: Thu, 12 Dec 2019 17:53:21 +0100 Subject: [PATCH 08/11] bring style in line --- src/helpers.jl | 2 +- src/soms.jl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/helpers.jl b/src/helpers.jl index 310ba38..b75ace6 100644 --- a/src/helpers.jl +++ b/src/helpers.jl @@ -95,7 +95,7 @@ function findWinnerKD(kd_tree, sampl) idxs[1] end -function build_kd_tree(codes) +function buildKDTree(codes) permutedims(codes,(2,1)) |> KDTree end diff --git a/src/soms.jl b/src/soms.jl index b167d67..b29703a 100644 --- a/src/soms.jl +++ b/src/soms.jl @@ -77,7 +77,7 @@ Return the index of the winner neuron for each training pattern in x (row-wise). """ -function visual_brute_force(codes, x) +function visualBruteForce(codes, x) vis = zeros(Int, nrow(x)) @time for i in 1:nrow(x) @@ -89,7 +89,7 @@ end function visual(codes, x) - kd_tree = build_kd_tree(codes) + kd_tree = buildKDTree(codes) vis = zeros(Int, nrow(x)) @time for i in 1:nrow(x) From 0237dd565606cf174bcbb71c38f9dcf4fa1e0cc2 Mon Sep 17 00:00:00 2001 From: TimoSci Date: Thu, 12 Dec 2019 20:22:45 +0100 Subject: [PATCH 09/11] remove file used for testing changes --- test/run-benchmark.jl | 69 ------------------------------------------- 1 file changed, 69 deletions(-) delete mode 100644 test/run-benchmark.jl diff --git a/test/run-benchmark.jl b/test/run-benchmark.jl deleted file mode 100644 index 5d24ff9..0000000 --- a/test/run-benchmark.jl +++ /dev/null @@ -1,69 +0,0 @@ -using SOM -using Test -# -using DataFrames -using RDatasets -using Primes - -function generate_integer_hash_function(a,b,cardinality) - # cardinality must be prime number - # a,b: 1 ≤ a ≤ p−1 , 0 ≤ b ≤ p−1 - @assert a < cardinality - @assert a > 0 - @assert b < cardinality - @assert b >= 0 - # @assert isprime(cardinality) - m = cardinality - x -> (x^a%m + b)%m #TODO use modular exponentiation -end - -function generate_vector_deterministic_pseudorandom(seed,hash_func,dimension) - [hash_func(x+seed) for x in 1:dimension] -end - -function generate_dataset(number,dimension,max_value) - # dimension = 16 - # max_value = 1013 - params = Dict(:a => 13, :b => 17, :hash_seed => 23) - hash_func = generate_integer_hash_function(params[:a],params[:b],max_value) - - - seed = params[:hash_seed] - matrix = zeros((number,dimension)) - for i in 1:number - vector = generate_vector_deterministic_pseudorandom(seed,hash_func,dimension) - seed = hash_func(seed+i) - matrix[i,:] = vector - end - convert(DataFrame,matrix) -end - - -include("testFuns.jl") - - - -function benchmarkTrain(train, topol; toroidal = false, - normType = :zscore, kernel = gaussianKernel) - - xdim = 10 - ydim = 10 - - som = initSOM(train, xdim, ydim, norm = normType, - topol = topol, toroidal = toroidal) - som = trainSOM(som, train, 1000, kernelFun = kernel) - - ntrain = nrow(train) - npopul = sum(som.population) - - return ntrain == npopul -end - - -train = generate_dataset(2000000,3,1013) - -# test hexagonal, rectangular and spherical training: -# -# @test benchmarkTrain(train, :hexagonal, toroidal = false) -@time benchmarkTrain(train, :rectangular, toroidal = false) -# @test benchmarkTrain(train, :spherical, toroidal = false) From 014c6006058dc4c6e96365c9c6e0543ba1f3d22e Mon Sep 17 00:00:00 2001 From: TimoSci Date: Thu, 12 Dec 2019 22:55:28 +0100 Subject: [PATCH 10/11] dataset generator for benchmarking --- test/benchmark.jl | 88 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 test/benchmark.jl diff --git a/test/benchmark.jl b/test/benchmark.jl new file mode 100644 index 0000000..630d769 --- /dev/null +++ b/test/benchmark.jl @@ -0,0 +1,88 @@ +using Primes + +function generate_integer_hash_function(a,b,cardinality) + # cardinality must be prime number + # a,b: 1 ≤ a ≤ p−1 , 0 ≤ b ≤ p−1 + @assert a < cardinality + @assert a > 0 + @assert b < cardinality + @assert b >= 0 + @assert isprime(cardinality) + m = cardinality + x -> (x^a%m + b)%m #TODO use memory-efficient modular exponentiation for better performance +end + +""" +# function generate_vector_deterministic_pseudorandom + +Generates a pseudorandom vector. For the same hash function it always generates the same vector. +If the hash function is an integer hash from a universal family, the elements of the vector will +obey a continuous uniform distribution. +# Arguments: +- `seed`: A random number chosen by the user to ensure the result is deterministic +- `hash_func`: An integer hash function sampled from a universal family +- `dimension`: number of elements in the vector +""" +function generate_vector_deterministic_pseudorandom(seed,hash_func,dimension) + [hash_func(x+seed) for x in 1:dimension] +end + + +""" +# function generate_dataset + +Generates a pseudorandom dataset that can be used for benchmarking SOMs. +The elements in the vectors are distributed uniformly between 0 and `max_value`. + +# Arguments: +- `number`: Number of rows/entries/vectors/individual iris plants in the dataset +- `dimension`: Number of dimensions/elements in each vector +- `max_value`: The maximum value that an element in a vector can have +- `y`: an optional function to map the uniformly distributed elements to a desired non-uniform distribution +- `params`: Randomness parameters chosen by user to ensure deterministic result +""" +function generate_dataset(number, dimension, max_value, y=(x->x), params = Dict(:a => 13, :b => 17, :hash_seed => 23, :prime => 1013)) + hash_func = generate_integer_hash_function(params[:a],params[:b],params[:prime]) + + seed = params[:hash_seed] + matrix = zeros((number,dimension)) + for i in 1:number + vector = generate_vector_deterministic_pseudorandom(seed,hash_func,dimension) + vector = map((x->x*max_value/params[:prime]),vector) + vector = map((x->(x+max_value)/2),vector) + vector = map(y,vector) + seed = hash_func(seed+i) + matrix[i,:] = vector + end + convert(DataFrame,matrix) +end + + + +# Benchmarking functions +# ====================== + +function benchmark_init(train, topol; toroidal = false, + normType = :zscore) + + xdim = 10 + ydim = 10 + + initSOM(train, xdim, ydim, norm = normType, + topol = topol, toroidal = toroidal) +end + +function benchmark_train(som,train, kernel = gaussianKernel) + trainSOM(som, train, 100000, kernelFun = kernel) +end + + +# Example +# ======= + +# iris = dataset("datasets", "iris") +# train = iris[:,1:4] + +# train = generate_dataset(1000,100,1) +# som = benchmark_init(train, :rectangular, toroidal = false) +# som = benchmark_train(som,train) From 8ab7b4d0a162c508c10563ba8d1af8203b1b0fe7 Mon Sep 17 00:00:00 2001 From: TimoSci Date: Fri, 13 Dec 2019 00:17:26 +0100 Subject: [PATCH 11/11] refactor and dry up using multiple dispatch --- src/helpers.jl | 2 +- src/soms.jl | 9 +-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/src/helpers.jl b/src/helpers.jl index b75ace6..c954950 100644 --- a/src/helpers.jl +++ b/src/helpers.jl @@ -90,7 +90,7 @@ function findWinner(cod, sampl) return winner end -function findWinnerKD(kd_tree, sampl) +function findWinner(kd_tree::KDTree, sampl) idxs, _ = knn(kd_tree, sampl, 1) idxs[1] end diff --git a/src/soms.jl b/src/soms.jl index b29703a..14738b1 100644 --- a/src/soms.jl +++ b/src/soms.jl @@ -88,15 +88,8 @@ function visualBruteForce(codes, x) end function visual(codes, x) - kd_tree = buildKDTree(codes) - vis = zeros(Int, nrow(x)) - - @time for i in 1:nrow(x) - vis[i] = findWinnerKD(kd_tree, [x[i, col] for col in 1:size(x, 2)]) - end - - return(vis) + visualBruteForce(kd_tree, x) end """