Add pooled variance computation and its unit test

Tomasz Latkowski · Tomasz Latkowski · commit beb9c3e9004f · 2018-03-02T16:28:55.000+01:00
diff --git a/methods/selection.py b/methods/selection.py
@@ -14,7 +14,7 @@ def fisher(data, num_instances: list, top_k_features=2):
     assert len(num_instances) == 2, "Fisher selection method can be performed for two-class problems."
 
     data = tf.convert_to_tensor(data)
-    _, num_features = data.get_shape().as_list()
+    num_features = data.get_shape().as_list()[-1]
     if top_k_features > num_features:
         top_k_features = num_features
     class1, class2 = tf.split(data, num_instances)
@@ -35,7 +35,7 @@ def feature_correlation_with_class(data, num_instances: list, top_k_features=10)
     :return: the list of most significant features.
     """
     data = tf.convert_to_tensor(data)
-    _, num_features = data.get_shape().as_list()
+    num_features = data.get_shape().as_list()[-1]
     if top_k_features > num_features:
         top_k_features = num_features
     class1, class2 = tf.split(data, num_instances)
@@ -57,7 +57,7 @@ def t_test(data, num_instances: list, top_k_features=10):
     :return: the list of most significant features.
     """
     data = tf.convert_to_tensor(data)
-    _, num_features = data.get_shape().as_list()
+    num_features = data.get_shape().as_list()[-1]
     if top_k_features > num_features:
         top_k_features = num_features
     class1, class2 = tf.split(data, num_instances)
@@ -74,16 +74,10 @@ def t_test(data, num_instances: list, top_k_features=10):
 
 def random(data, num_instances: list, top_k_features=10):
     data = tf.convert_to_tensor(data)
-    _, num_features = data.get_shape().as_list()
+    num_features = data.get_shape().as_list()[-1]
     if top_k_features > num_features:
         top_k_features = num_features
     class1, class2 = tf.split(data, num_instances)
 
     with tf.name_scope('random_selection'):
-        mean1, std1 = tf.nn.moments(class1, axes=0)
-        mean2, std2 = tf.nn.moments(class2, axes=0)
-        t_test_coeffs = tf.abs(mean1 - mean2) / tf.sqrt(
-            tf.square(std1) / num_instances[0] + tf.square(std2) / num_instances[1])
-        selected_features = tf.nn.top_k(t_test_coeffs, k=top_k_features)
-
-    return selected_features
+        pass  # NOTE(review): placeholder -- nothing is selected and no value is returned here
diff --git a/tests/test_statistics.py b/tests/test_statistics.py
@@ -0,0 +1,23 @@
+import numpy as np
+import tensorflow as tf
+
+from utils.statistics import pooled_variance
+
+
+class TestStatistics(tf.test.TestCase):
+    """Tests for the helpers in utils.statistics."""
+
+    def testPooledVariance(self):
+        """Classes made of identical rows must have zero pooled variance."""
+        # Four identical samples, split into two classes of two rows each.
+        data = np.tile([2., 3., 4., 5.], (4, 1))
+        num_instances = [2, 2]
+        with self.test_session() as test_session:
+            actual_pooled_variance = test_session.run(pooled_variance(data, num_instances))
+
+        # The result is floating point, so compare with a tolerance.
+        self.assertAllClose(actual_pooled_variance, np.zeros(4))
+
+
+if __name__ == '__main__':
+    tf.test.main()
diff --git a/utils/statistics.py b/utils/statistics.py
@@ -37,8 +37,24 @@ def f_test(data, num_instances):
 def pooled_variance(data, num_instances):
+    """Return the pooled (within-class) variance of every feature.
+
+    The rows of `data` are split into consecutive classes of the sizes given
+    in `num_instances`; the squared deviations of each class from its own mean
+    are summed over all classes and divided by `n - K`.
+
+    :param data: array-like of samples; axis 0 indexes samples, grouped by class.
+    :param num_instances: list with the number of samples of each class, in order.
+    :return: float32 tensor holding one pooled variance per feature.
+    :raises ValueError: if there are not more samples than classes.
+    """
     K = len(num_instances)
     n = sum(num_instances)
-
-    class1, class2 = tf.split(data, num_instances)
-    mean1, var1 = tf.nn.moments(class1, axes=0)
-    mean2, var2 = tf.nn.moments(class2, axes=0)
-    return -1
+    if n <= K:
+        raise ValueError("pooled variance needs more samples than classes")
+
+    data = tf.convert_to_tensor(data, dtype=tf.float32)
+    class_vars = tf.stack([tf.nn.moments(rows, axes=0)[1]
+                           for rows in tf.split(data, num_instances)])
+    # tf.nn.moments yields the biased variance (mean squared deviation), so
+    # n_k * var_k -- not (n_k - 1) * var_k -- is each class's sum of squares.
+    n_k = tf.cast(tf.reshape(num_instances, [K, 1]), tf.float32)
+    return tf.reduce_sum(class_vars * n_k, axis=0) / (n - K)