-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathweights.py
206 lines (163 loc) · 9.66 KB
/
weights.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# -*- coding: utf-8 -*-
import math
def get_approximate_maximum_d_value_from_three_by_two_contingency_table(three_by_two_contingency_table,
prediction_list_stats,
log_of_factorial_of_prediction_list_stats,
returnlog=False):
n_pz = prediction_list_stats[1] - (three_by_two_contingency_table[1] + three_by_two_contingency_table[2])
n_mz = prediction_list_stats[2] - (three_by_two_contingency_table[3] + three_by_two_contingency_table[4])
n_zz = prediction_list_stats[3] - (three_by_two_contingency_table[5] + three_by_two_contingency_table[6])
three_by_three_contingency_table = [three_by_two_contingency_table, n_pz, n_mz, n_zz]
weight = calculate_weight_given_values_in_three_by_three_contingency_table(three_by_three_contingency_table,
log_of_factorial_of_prediction_list_stats,
returnlog)
return weight
def get_max_D_value_for_a_three_by_two_family(r_p, r_m, r_z, n_p, n_m, predictionListStats,
logOfFactorialOfPredictionListStats, returnlog=False):
total = n_p + n_m
# Compute the values of n++, n+-, n-+, n--, n0+, and n0- that maximise the D-value; these correspond to the
# following formula: n_ab is approximately equal to q_a*n_b/T, where T = n+ + n-, and a,b are either + or -.
# See Assessing statistical significance in causal graphs, page 7 - the formula is not stated explicitly
# but follows from the logic of algorithm 1a).
if total > 0:
n_pp = math.ceil(r_p * n_p / total)
n_pm = r_p - n_pp
n_mp = math.ceil(r_m * n_p / total)
# Check this rounding produces a valid combination i.e. n++ + n-+ <= n+
if (n_mp + n_pp) > n_p:
n_mp = math.floor(r_m * n_p / total)
n_mm = r_m - n_mp
n_zp = n_p - (n_pp + n_mp)
n_zm = n_m - (n_pm + n_mm)
else:
n_pp = 0
n_pm = 0
n_mp = 0
n_mm = 0
n_zp = 0
n_zm = 0
three_two_contingency_table = [n_pp, n_pm, n_mp, n_mm, n_zp, n_zm]
maximum_d_fam_value = get_approximate_maximum_d_value_from_three_by_two_contingency_table(
three_two_contingency_table,
predictionListStats,
logOfFactorialOfPredictionListStats,
returnlog)
return maximum_d_fam_value
def calculate_weight_given_values_in_three_by_three_contingency_table(
three_x_three_contingency_table,
log_factorial_prediction_stats,
returnlog=False):
"""Given the values in the three by three contingency table and the values of the number of positive/negative/
non-significant predictions (q+, q-, q0) this function calculates the D-value (or weight).
:param three_x_three_contingency_table a 3x3 contingency table
:param log_factorial_prediction_stats log of Factorial of prediction statistics
:param returnlog should the result be returned as a log value. Default is FALSE.
:return a D-value (or weight)
"""
# The D value is defined as: (q+! / (n++! * n+-! * n+0!)) * (q-! / (n-+! * n--! * n-0!)) * (q0! / (n0+! * n0-! * n00!))
if returnlog:
return sum(log_factorial_prediction_stats) - sum(math.log(math.factorial(three_x_three_contingency_table)))
else:
return (
math.exp(
sum(log_factorial_prediction_stats) - sum(math.log(math.factorial(three_x_three_contingency_table)))))
def get_weights_above_hypothesis_score_for_a_three_by_two_table(
weights, r_p, r_m, r_z, n_p, n_m, prediction_list_stats, experimentalDataStats,
log_of_factorial_of_prediction_list_stats,
hypothesisScore, logepsDMax, logDMax):
""" Finds the D-Values (weights) from any 3x2 contingency tables that have a score above and including the
hypothesis score. It also calculates the total weight, and returns a 2x1 vector of the two values.
The ratio of these values is the p-value
:param weights:
:param r_p:
:param r_m:
:param r_z:
:param n_p:
:param n_m:
:param prediction_list_stats:
:param experimentalDataStats:
:param log_of_factorial_of_prediction_list_stats:
:param hypothesisScore:
:param logepsDMax:
:param logDMax:
:return:
"""
log_approx_d_family_max = get_max_D_value_for_a_three_by_two_family(r_p, r_m, r_z, n_p, n_m, prediction_list_stats,
log_of_factorial_of_prediction_list_stats, True)
# To reduce the overall runtime of the algorithm, the approximate maxDValue for the family is calculated, rather than the actual.
if log_approx_d_family_max > logepsDMax:
# Calculate the values of n++ and n-+ that were used to calculate approxDFamilyMax
# t = n_p + n_m ( the sum of all the values in the 3x2 submatrix)
total = n_p + n_m
if total > 0:
n_pp = math.ceil(r_p * n_p / total)
else:
n_pp = 0
# Iterate over possible values of n++, starting from the one calculated above. Since the values decrease monotonically either side of this value,
# we start at this value and iterate in both directions until the D value is less than a given threshold. This will give us the non-neglible parts
# of the distribution.
# An upper bound for possible values of n++ is the minimum of r+ and n+
for m_pp in range(n_pp, min(r_p, n_p)):
# We still need to define one more value in the 3x2 matrix to determine all the rest n+- is bounded by the minumum of r- and n+
for m_mp in range(0, min(r_m, n_p)):
# n+-
m_pm = r_p - m_pp
# n--
m_mm = r_m - m_mp
# n0+
m_zp = n_p - (m_pp + m_mp)
# n0-
m_zm = n_m - (m_pm + m_mm)
contingencyTableValues = [m_pp, m_pm, m_mp, m_mm, m_zp, m_zm]
m_pz = prediction_list_stats[1] - r_p
m_mz = prediction_list_stats[2] - r_m
m_zz = prediction_list_stats[3] - r_z
# None of these values can be less than zero
if min(contingencyTableValues) < 0:
continue
three_by_three_contingency_table = [m_pp, m_pm, m_pz, m_mp, m_mm, m_mz, m_zp, m_zm, m_zz]
logDValue = sum(log_of_factorial_of_prediction_list_stats) - sum(
math.log(math.factorial(three_by_three_contingency_table)))
# Only need to compute the score if the D-value is non-neglible
if logDValue <= logepsDMax:
continue
weight_contrib = math.exp(logDValue - logDMax)
weights[2] = weights[2] + weight_contrib
# Rather than just calculate the DValue by taking the exponential of logDvalue, we first subtract a constant. This means when we take the
# exponential we get the DValue divided by a different constant, but this cancels out when we take ratios later when computing the p-value
if m_pp + m_mm - (m_pm + m_mp) >= hypothesisScore:
weights[1] = weights[1] + weight_contrib
# The loop below does exactly the same thing as the one above; it covers the cases that are not done in the loop above. A lower bound for possible
# values of n++ is 0 To avoid n_pp become negative or double counting when n_pp = 0, we add the following if statement
if (n_pp > 0):
for m_pp in range(n_pp - 1, 0):
# Iterate over all values of m_mp
for m_mp in range(0, min(r_m, n_p)):
# n+-
m_pm = r_p - m_pp
# n--
m_mm = r_m - m_mp
# n0+
m_zp = n_p - (m_pp + m_mp)
# n0-
m_zm = n_m - (m_pm + m_mm)
contingencyTableValues = [m_pp, m_pm, m_mp, m_mm, m_zp, m_zm]
m_pz = prediction_list_stats[1] - r_p
m_mz = prediction_list_stats[2] - r_m
m_zz = prediction_list_stats[3] - r_z
# None of these values can be less than zero
if min(contingencyTableValues) < 0:
continue
three_by_three_contingency_table = [m_pp, m_pm, m_pz, m_mp, m_mm, m_mz, m_zp, m_zm, m_zz]
logDValue = math.log(math.factorial(log_of_factorial_of_prediction_list_stats)) - math.log(
math.factorial(three_by_three_contingency_table))
# Only need to compute the score if the D-value is non-neglible
if logDValue <= logepsDMax:
continue
# Rather than just calculate the DValue by taking the exponential of logDvalue, we first subtract a constant. This means when we take the
# exponential we get the DValue divided by a different constant, but this cancels out when we take ratios later when computing the p-value
weight_contrib = math.exp(logDValue - logDMax)
weights[2] = weights[2] + weight_contrib
if (m_pp + m_mm - (m_pm + m_mp) >= hypothesisScore):
weights[1] = weights[1] + weight_contrib
return weights