bcbi · vmpyr · Apr 7, 2023 · Apr 8, 2023 · Apr 10, 2023 · Apr 10, 2023
@@ -1,3 +1,4 @@
 *.jl.cov
 *.jl.*.cov
 *.jl.mem
+Manifest.toml
@@ -1,23 +1,26 @@
 name = "ClassImbalance"
 uuid = "04a18a73-7590-580c-b363-eeca0919eb2a"
 authors = ["Paul Stey <[email protected]>", "Dilum Aluthge <[email protected]>", "Brown Center for Biomedical Informatics <[email protected]>"]
-version = "0.8.7"
+version = "0.9.0-dev"
 
 [deps]
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
+Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
-Compat = "3"
-DataFrames = "0.20"
-Distributions = "0.21.3, 0.22"
-StatsBase = "0.32"
-julia = "1.1"
+Compat = "4.6"
+DataFrames = "1.5"
+Distributions = "0.25"
+StatsBase = "0.33"
+julia = "1.8"
 
 [extras]
 Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"

@@ -2,11 +2,12 @@ __precompile__(true)
 
 module ClassImbalance
 
-export smote, rose
+export smote, rose, random_undersampler  # public API; random_undersampler is new in this change
 
 include("utils.jl")
 include("smote_exs.jl")
 include("ub_smote.jl")
 include("rose.jl")
+include("random_undersampler.jl")  # presumably defines the random_undersampler methods exported above
 
 end # end module ClassImbalance
@@ -0,0 +1,69 @@
+import Random
+import StatsBase
+import DataFrames
+import Tables
+import MLUtils
+
+"""
+    random_undersampler(X, y; sampling_strategy="auto", random_state=nothing,
+                        replacement=false)
+
+Randomly undersample the observations in `X` and the labels in `y` so that the
+targeted classes are reduced to a requested number of samples.
+
+# Arguments
+- `X`: a Tables.jl-compatible table holding one observation per row.
+- `y`: vector of class labels aligned with the observations of `X`.
+
+# Keywords
+- `sampling_strategy`: which classes to shrink and to what size.
+    - `String`: one of "auto", "not minority", "not majority", "all", "majority";
+      resolved to per-class targets by `undersampling_strategy!`.
+    - `AbstractFloat` (binary problems only): desired ratio of minority samples to
+      majority samples after resampling, in `(0, 1]`. This follows the
+      imbalanced-learn convention; the majority class keeps
+      `floor(n_minority / ratio)` samples.
+    - `Dict`: maps a class to the number of samples to keep for that class.
+- `random_state`: integer seed for a dedicated `MersenneTwister`; `nothing` uses the
+  task-local default RNG.
+- `replacement`: whether the kept samples are drawn with replacement.
+
+# Returns
+A tuple `(X_resampled, y_resampled)` obtained with `MLUtils.getobs`. Classes that are
+not targeted by the strategy keep all of their observations.
+
+# Throws
+- `AssertionError` if `X` is not a table, `y` has a single class, or
+  `sampling_strategy` is invalid for the data.
+"""
+function random_undersampler(
+        X,
+        y::T;
+        sampling_strategy::Union{AbstractFloat, String, Dict{A, S}} = "auto",
+        random_state::Union{Nothing, S} = nothing,
+        replacement::Bool = false
+        ) where T <: AbstractVector where S <: Integer where A <: Any
+    # The message names the check that is actually performed and does not dump X.
+    @assert Tables.istable(X) "X must be a Tables.jl-compatible table"
+
+    classes = unique(y)
+    classcount = Dict(c => count(==(c), y) for c in classes)
+
+    # checking classes in y
+    @assert length(classes) > 1 "$y must have more than one class"
+
+    # Validate the strategy and turn it into a `class => n_samples` mapping.
+    # Float and Dict strategies are resolved here because `undersampling_strategy!`
+    # only handles the String presets.
+    targets = if sampling_strategy isa String
+        @assert sampling_strategy in ["auto", "not minority", "not majority", "all", "majority"] "sampling_strategy must be one of \"auto\", \"not minority\", \"not majority\", \"all\", \"majority\""
+        undersampling_strategy!(sampling_strategy, classes, classcount)
+    elseif sampling_strategy isa AbstractFloat
+        @assert length(classes) == 2 "sampling_strategy of type float is supported only for binary classification"
+        @assert 0 < sampling_strategy <= 1 "sampling_strategy must be between 0 and 1"
+        mincount, maxcount = extrema(values(classcount))
+        n_majority = floor(Int, mincount / sampling_strategy)
+        # A ratio below the current minority/majority ratio would need oversampling.
+        @assert n_majority <= maxcount "sampling_strategy of $sampling_strategy requires $n_majority majority samples but only $maxcount are available"
+        Dict(c => n_majority for c in classes if classcount[c] == maxcount)
+    else
+        @assert all(c in classes for c in keys(sampling_strategy)) "sampling_strategy must have keys that are classes in $y"
+        @assert all(sampling_strategy[c] <= classcount[c] for c in keys(sampling_strategy)) "sampling_strategy must have values less than or equal to current number of samples for a particular class"
+        sampling_strategy
+    end
+
+    rng = if isnothing(random_state)
+        Random.default_rng()
+    else
+        Random.MersenneTwister(UInt(random_state))
+    end
+
+    # Collect the kept observation indices class by class, in order of first appearance.
+    undersampled_idx = Int[]
+    for target_class in classes
+        class_idx = findall(==(target_class), y)
+        if haskey(targets, target_class)
+            n_samples = targets[target_class]
+            append!(undersampled_idx, StatsBase.sample(rng, class_idx, n_samples, replace=replacement))
+        else
+            # Untargeted classes are kept in full.
+            append!(undersampled_idx, class_idx)
+        end
+    end
+
+    return MLUtils.getobs(X, undersampled_idx), MLUtils.getobs(y, undersampled_idx)
+end
+
+"""
+    random_undersampler(df, label; sampling_strategy="auto", random_state=nothing,
+                        replacement=false)
+
+Undersample a table whose class labels live in one of its columns.
+
+`df` is converted to a `DataFrame`, the column identified by `label` (a column name
+given as `Symbol` or `String`, or an integer column position) is split off as the
+label vector, and the remaining columns are resampled together with it by the
+`(X, y)` method of `random_undersampler`. Keywords are forwarded unchanged.
+
+# Returns
+A `DataFrame` containing the resampled feature columns followed by the resampled
+label column under its original name.
+
+# Throws
+- `AssertionError` if `label` does not identify a column of `df`.
+"""
+function random_undersampler(
+        df,
+        label::Union{Symbol, String, S};
+        sampling_strategy::Union{AbstractFloat, String, Dict{A, S}} = "auto",
+        random_state::Union{Nothing, S} = nothing,
+        replacement::Bool = false
+        ) where S <: Integer where A <: Any
+    df = DataFrames.DataFrame(df)
+
+    # `names(df)` yields strings, so Symbol labels are compared by name and integer
+    # labels are treated as column positions.
+    colnames = names(df)
+    label_exists = label isa Integer ? (1 <= label <= length(colnames)) : (String(label) in colnames)
+    @assert label_exists "label or index $label does not exist in $df"
+    labelname = label isa Integer ? colnames[label] : String(label)
+
+    Xunder, yunder = random_undersampler(
+        DataFrames.select(df, DataFrames.Not(label)),
+        df[!, label],
+        sampling_strategy=sampling_strategy,
+        random_state=random_state,
+        replacement=replacement,
+    )
+
+    # Re-attach the labels under their original column name and return the full table.
+    return hcat(Xunder, DataFrames.DataFrame(labelname => yunder))
+end
@@ -86,3 +86,22 @@ function calculate_smote_pct_under(
     result = 100*minority_to_majority_ratio*(100+pct_over)/pct_over
     return result
 end
+
+"""
+    undersampling_strategy!(sampling_strategy, classes, classcount)
+
+Translate a named undersampling preset into a `class => n_samples` dictionary.
+
+Every selected class is assigned the size of the smallest class in `classcount`:
+- "majority": only the class(es) with the largest count.
+- "auto" / "not minority": every class whose count differs from the smallest count.
+- "not majority": every class whose count differs from the largest count.
+- "all": every class in `classes`.
+
+Despite the trailing `!`, no argument is modified; the mapping is returned.
+
+# Throws
+- `ArgumentError` if `sampling_strategy` is not one of the presets above.
+"""
+function undersampling_strategy!(
+        sampling_strategy::String,
+        classes::T,
+        classcount::Dict{A, S},
+        ) where T <: AbstractVector where S <: Integer where A <: Any
+    mincount, maxcount = extrema(values(classcount))
+
+    # Predicate deciding which classes get resampled down to `mincount`.
+    selected = if sampling_strategy == "majority"
+        c -> classcount[c] == maxcount
+    elseif sampling_strategy == "auto" || sampling_strategy == "not minority"
+        c -> classcount[c] != mincount
+    elseif sampling_strategy == "not majority"
+        c -> classcount[c] != maxcount
+    elseif sampling_strategy == "all"
+        c -> true
+    else
+        throw(ArgumentError("unknown sampling_strategy \"" * sampling_strategy * "\""))
+    end
+
+    # The explicit key/value types keep the result concretely typed even when empty.
+    return Dict{A, S}(c => mincount for c in classes if selected(c))
+end