diff --git a/metadpy/bayesian.py b/metadpy/bayesian.py index 95b3606..4d89023 100644 --- a/metadpy/bayesian.py +++ b/metadpy/bayesian.py @@ -275,12 +275,17 @@ def hmetad( # Group level elif (within is None) & (between is None) & (subject is not None): - # pymcData = preprocess_group( - # data, subject, stimuli, accuracy, confidence, nRatings - # ) + pymcData = preprocess_group( + data, subject, stimuli, accuracy, confidence, nRatings + ) + + from metadpy.models.group_level_pymc import hmetad_groupLevel - raise ValueError( - "Invalid backend provided - This model is not implemented yet." + model_output = hmetad_groupLevel( + pymcData, + sample_model=sample_model, + num_chains=num_chains, + num_samples=num_samples, ) ################### @@ -409,90 +414,89 @@ def extractParameters( return data -# TODO: when implementing group level fitting, the following wrapper will be usefull. -# def preprocess_group( -# data: pd.DataFrame, -# subject: str, -# stimuli: str, -# accuracy: str, -# confidence: str, -# nRatings: int, -# ) -> Dict: -# """Preprocess group data. +def preprocess_group( + data: pd.DataFrame, + subject: str, + stimuli: str, + accuracy: str, + confidence: str, + nRatings: int, +) -> Dict: + """Preprocess group data. -# Parameters -# ---------- -# data : :py:class:`pandas.DataFrame` or None -# Dataframe. Note that this function can also directly be used as a -# Pandas method, in which case this argument is no longer needed. -# subject : string or None -# Name of column containing the subject identifier (only required if a -# within-subject or a between-subject factor is provided). -# stimuli : string or None -# Name of the column containing the stimuli. -# accuracy : string or None -# Name of the columns containing the accuracy. -# confidence : string or None -# Name of the column containing the confidence ratings. -# nRatings : int or None -# Number of discrete ratings. 
If a continuous rating scale was used, and -# the number of unique ratings does not match `nRatings`, will convert to -# discrete ratings using :py:func:`metadpy.utils.discreteRatings`. + Parameters + ---------- + data : :py:class:`pandas.DataFrame` or None + Dataframe. Note that this function can also directly be used as a + Pandas method, in which case this argument is no longer needed. + subject : string or None + Name of column containing the subject identifier (only required if a + within-subject or a between-subject factor is provided). + stimuli : string or None + Name of the column containing the stimuli. + accuracy : string or None + Name of the columns containing the accuracy. + confidence : string or None + Name of the column containing the confidence ratings. + nRatings : int or None + Number of discrete ratings. If a continuous rating scale was used, and + the number of unique ratings does not match `nRatings`, will convert to + discrete ratings using :py:func:`metadpy.utils.discreteRatings`. 
-# Return -# ------ -# pymcData : Dict + Return + ------ + pymcData : Dict -# """ -# pymcData = { -# "d1": [], -# "c1": [], -# "nSubj": data[subject].nunique(), -# "subID": np.arange(data[subject].nunique(), dtype="int"), -# "hits": [], -# "falsealarms": [], -# "s": [], -# "n": [], -# "counts": [], -# "nRatings": nRatings, -# "Tol": 1e-05, -# "cr": [], -# "m": [], -# } + """ + pymcData = { + "d1": [], + "c1": [], + "nSubj": data[subject].nunique(), + "subID": np.arange(data[subject].nunique(), dtype="int"), + "hits": [], + "falsealarms": [], + "s": [], + "n": [], + "counts": [], + "nRatings": nRatings, + "Tol": 1e-05, + "cr": [], + "m": [], + } -# for sub in data[subject].unique(): -# nR_S1, nR_S2 = trials2counts( -# data=data[data[subject] == sub], -# stimuli=stimuli, -# accuracy=accuracy, -# confidence=confidence, -# nRatings=nRatings, -# ) - -# this_data = extractParameters(nR_S1, nR_S2) -# pymcData["d1"].append(this_data["d1"]) -# pymcData["c1"].append(this_data["c1"]) -# pymcData["s"].append(this_data["S"]) -# pymcData["n"].append(this_data["N"]) -# pymcData["m"].append(this_data["M"]) -# pymcData["cr"].append(this_data["CR"]) -# pymcData["counts"].append(this_data["counts"]) -# pymcData["hits"].append(this_data["H"]) -# pymcData["falsealarms"].append(this_data["FA"]) - -# pymcData["d1"] = np.array(pymcData["d1"], dtype="float") -# pymcData["c1"] = np.array(pymcData["c1"], dtype="float") -# pymcData["s"] = np.array(pymcData["s"], dtype="int") -# pymcData["n"] = np.array(pymcData["n"], dtype="int") -# pymcData["m"] = np.array(pymcData["m"], dtype="int") -# pymcData["cr"] = np.array(pymcData["cr"], dtype="int") -# pymcData["counts"] = np.array(pymcData["counts"], dtype="int") -# pymcData["hits"] = np.array(pymcData["hits"], dtype="int") -# pymcData["falsealarms"] = np.array(pymcData["falsealarms"], dtype="int") -# pymcData["nSubj"] = data[subject].nunique() -# pymcData["subID"] = np.arange(pymcData["nSubj"], dtype="int") + for sub in data[subject].unique(): + 
nR_S1, nR_S2 = trials2counts( + data=data[data[subject] == sub], + stimuli=stimuli, + accuracy=accuracy, + confidence=confidence, + nRatings=nRatings, + ) -# return pymcData + this_data = extractParameters(nR_S1, nR_S2) + pymcData["d1"].append(this_data["d1"]) + pymcData["c1"].append(this_data["c1"]) + pymcData["s"].append(this_data["S"]) + pymcData["n"].append(this_data["N"]) + pymcData["m"].append(this_data["M"]) + pymcData["cr"].append(this_data["CR"]) + pymcData["counts"].append(this_data["counts"]) + pymcData["hits"].append(this_data["H"]) + pymcData["falsealarms"].append(this_data["FA"]) + + pymcData["d1"] = np.array(pymcData["d1"], dtype="float") + pymcData["c1"] = np.array(pymcData["c1"], dtype="float") + pymcData["s"] = np.array(pymcData["s"], dtype="int") + pymcData["n"] = np.array(pymcData["n"], dtype="int") + pymcData["m"] = np.array(pymcData["m"], dtype="int") + pymcData["cr"] = np.array(pymcData["cr"], dtype="int") + pymcData["counts"] = np.array(pymcData["counts"], dtype="int") + pymcData["hits"] = np.array(pymcData["hits"], dtype="int") + pymcData["falsealarms"] = np.array(pymcData["falsealarms"], dtype="int") + pymcData["nSubj"] = data[subject].nunique() + pymcData["subID"] = np.arange(pymcData["nSubj"], dtype="int") + + return pymcData # def preprocess_rm1way( diff --git a/metadpy/models/group_level_pymc.py b/metadpy/models/group_level_pymc.py new file mode 100644 index 0000000..2da4810 --- /dev/null +++ b/metadpy/models/group_level_pymc.py @@ -0,0 +1,227 @@ +# Author: Nicolas Legrand + +import pytensor.tensor as pt +import numpy as np +from pymc import Binomial, Deterministic, HalfNormal, Model, Multinomial, Normal, sample + + +def phi(x): + """Cumulative normal distribution.""" + return 0.5 + 0.5 * pt.erf(x / pt.sqrt(2)) + + +def hmetad_groupLevel( + data, sample_model=True, num_samples: int = 1000, num_chains: int = 4, **kwargs +): + """Hierarchical Bayesian modeling of meta-d' (group level). + + This is an internal function. 
The group level model must be called using + :py:func:`metadpy.bayesian.hmetad`. + + Parameters + ---------- + data : dict + Response data. + sample_model : boolean + If `False`, only the model is returned without sampling. + num_samples : int + The number of samples per chains to draw (defaults to `1000`). + num_chains : int + The number of chains (defaults to `4`). + **kwargs : keyword arguments + All keyword arguments are passed to `func::pymc.sampling.sample`. + + Returns + ------- + model : :py:class:`pymc.Model` instance + The pymc model. Encapsulates the variables and likelihood factors. + trace : :py:class:`pymc.backends.base.MultiTrace` or + :py:class:`arviz.InferenceData` + A `MultiTrace` or `ArviZ InferenceData` object that contains the samples. + + References + ---------- + .. [#] Fleming, S.M. (2017) HMeta-d: hierarchical Bayesian estimation of + metacognitive efficiency from confidence ratings, Neuroscience of Consciousness, + 3(1) nix007, https://doi.org/10.1093/nc/nix007 + + """ + nRatings = data["nRatings"] + nSubj = data["nSubj"] + + with Model() as model: + + # Group-level hyperpriors + mu_c1 = Normal("mu_c1", mu=0.0, sigma=1) + sigma_c1 = HalfNormal("sigma_c1", sigma=1) + + mu_d1 = Normal("mu_d1", mu=0.0, sigma=1) + sigma_d1 = HalfNormal("sigma_d1", sigma=1) + + mu_logMratio = Normal("mu_logMratio", mu=0.0, sigma=1) + sigma_logMratio = HalfNormal("sigma_logMratio", sigma=1) + + # Group-level hyperpriors for criteria + mu_cS1_hn = HalfNormal("mu_cS1_hn", sigma=1, shape=nRatings - 1) + sigma_cS1_hn = HalfNormal("sigma_cS1_hn", sigma=1) + + mu_cS2_hn = HalfNormal("mu_cS2_hn", sigma=1, shape=nRatings - 1) + sigma_cS2_hn = HalfNormal("sigma_cS2_hn", sigma=1) + + # Subject-level parameters - each drawn from group distribution + c1 = Normal("c1", mu=mu_c1, sigma=sigma_c1, shape=nSubj) + d1 = Normal("d1", mu=mu_d1, sigma=sigma_d1, shape=nSubj) + logMratio = Normal("logMratio", mu=mu_logMratio, sigma=sigma_logMratio, shape=nSubj) + + # Transform to 
meta_d + meta_d = Deterministic("meta_d", d1 * pt.exp(logMratio)) + + # Subject-level criteria + cS1_hn = HalfNormal("cS1_hn", sigma=sigma_cS1_hn, shape=(nSubj, nRatings - 1)) + cS2_hn = HalfNormal("cS2_hn", sigma=sigma_cS2_hn, shape=(nSubj, nRatings - 1)) + + # For each subject, fit the exact same model as the original subject-level model + for s in range(nSubj): + # Get subject-specific parameters + c1_s = c1[s] + d1_s = d1[s] + meta_d_s = meta_d[s] + + # Subject-specific criteria using the exact same pattern as original + cS1_s = Deterministic(f"cS1_{s}", pt.sort(-cS1_hn[s, :]) + (c1_s - data["Tol"])) + cS2_s = Deterministic(f"cS2_{s}", pt.sort(cS2_hn[s, :]) + (c1_s - data["Tol"])) + + # TYPE 1 SDT BINOMIAL MODEL (exact same as original) + h_s = phi(d1_s / 2 - c1_s) + f_s = phi(-d1_s / 2 - c1_s) + H_s = Binomial(f"H_{s}", data["s"][s], h_s, observed=data["hits"][s]) + FA_s = Binomial(f"FA_{s}", data["n"][s], f_s, observed=data["falsealarms"][s]) + + # TYPE 2 SDT MODEL - exact same calculations as original + # Means of SDT distributions + S2mu = pt.flatten(meta_d_s / 2, 1) + S1mu = pt.flatten(-meta_d_s / 2, 1) + + # Calculate normalisation constants + C_area_rS1 = phi(c1_s - S1mu) + I_area_rS1 = phi(c1_s - S2mu) + C_area_rS2 = 1 - phi(c1_s - S2mu) + I_area_rS2 = 1 - phi(c1_s - S1mu) + + # Get nC_rS1 probs - exact same pattern as original + nC_rS1 = phi(cS1_s - S1mu) / C_area_rS1 + nC_rS1 = Deterministic( + f"nC_rS1_{s}", + pt.concatenate( + ( + [ + phi(cS1_s[0] - S1mu) / C_area_rS1, + nC_rS1[1:] - nC_rS1[:-1], + ( + (phi(c1_s - S1mu) - phi(cS1_s[(nRatings - 2)] - S1mu)) + / C_area_rS1 + ), + ] + ), + axis=0, + ), + ) + + # Get nI_rS2 probs - exact same pattern as original + nI_rS2 = (1 - phi(cS2_s - S1mu)) / I_area_rS2 + nI_rS2 = Deterministic( + f"nI_rS2_{s}", + pt.concatenate( + ( + [ + ((1 - phi(c1_s - S1mu)) - (1 - phi(cS2_s[0] - S1mu))) / I_area_rS2, + nI_rS2[:-1] - (1 - phi(cS2_s[1:] - S1mu)) / I_area_rS2, + (1 - phi(cS2_s[nRatings - 2] - S1mu)) / 
I_area_rS2, + ] + ), + axis=0, + ), + ) + + # Get nI_rS1 probs - exact same pattern as original + nI_rS1 = (-phi(cS1_s - S2mu)) / I_area_rS1 + nI_rS1 = Deterministic( + f"nI_rS1_{s}", + pt.concatenate( + ( + [ + phi(cS1_s[0] - S2mu) / I_area_rS1, + nI_rS1[:-1] + (phi(cS1_s[1:] - S2mu)) / I_area_rS1, + (phi(c1_s - S2mu) - phi(cS1_s[(nRatings - 2)] - S2mu)) / I_area_rS1, + ] + ), + axis=0, + ), + ) + + # Get nC_rS2 probs - exact same pattern as original + nC_rS2 = (1 - phi(cS2_s - S2mu)) / C_area_rS2 + nC_rS2 = Deterministic( + f"nC_rS2_{s}", + pt.concatenate( + ( + [ + ((1 - phi(c1_s - S2mu)) - (1 - phi(cS2_s[0] - S2mu))) / C_area_rS2, + nC_rS2[:-1] - ((1 - phi(cS2_s[1:] - S2mu)) / C_area_rS2), + (1 - phi(cS2_s[nRatings - 2] - S2mu)) / C_area_rS2, + ] + ), + axis=0, + ), + ) + + # Avoid underflow of probabilities - exact same as original + nC_rS1 = pt.switch(nC_rS1 < data["Tol"], data["Tol"], nC_rS1) + nI_rS2 = pt.switch(nI_rS2 < data["Tol"], data["Tol"], nI_rS2) + nI_rS1 = pt.switch(nI_rS1 < data["Tol"], data["Tol"], nI_rS1) + nC_rS2 = pt.switch(nC_rS2 < data["Tol"], data["Tol"], nC_rS2) + + # TYPE 2 SDT MODEL Multinomial likelihood - exact same pattern as original + subject_counts = data["counts"][s, :] + + Multinomial( + f"CR_counts_{s}", + n=data["cr"][s], + p=nC_rS1, + shape=nRatings, + observed=subject_counts[:nRatings], + ) + Multinomial( + f"FA_counts_{s}", + n=data["falsealarms"][s], + p=nI_rS2, + shape=nRatings, + observed=subject_counts[nRatings : nRatings * 2], + ) + Multinomial( + f"M_counts_{s}", + n=data["m"][s], + p=nI_rS1, + shape=nRatings, + observed=subject_counts[nRatings * 2 : nRatings * 3], + ) + Multinomial( + f"H_counts_{s}", + n=data["hits"][s], + p=nC_rS2, + shape=nRatings, + observed=subject_counts[nRatings * 3 : nRatings * 4], + ) + + if sample_model is True: + trace = sample( + return_inferencedata=True, + chains=num_chains, + draws=num_samples, + **kwargs + ) + + return model, trace + + else: + return model, None \ No newline at end of file 
diff --git a/metadpy/tests/test_bayesian.py b/metadpy/tests/test_bayesian.py index 000a85b..50cc904 100644 --- a/metadpy/tests/test_bayesian.py +++ b/metadpy/tests/test_bayesian.py @@ -121,6 +121,64 @@ def test_hmetad(self): assert round(pymc_df["meta_d"].values[0], 2) - 1.58 < 0.01 assert round(pymc_df["m_ratio"].values[0], 2) - 1.03 < 0.01 + ################## + # Test group level + ################## + group_df = load_dataset("rm") + test_df = group_df[group_df['Subject'].isin([0, 1])] # Just 2 subjects for testing + + # Test model compilation only + model, _ = hmetad( + data=test_df, + subject='Subject', + nRatings=4, + stimuli='Stimuli', + accuracy='Accuracy', + confidence='Confidence', + sample_model=False + ) + assert isinstance(model, pm.Model) + + # Check that group-level variables exist + model_vars = list(model.named_vars.keys()) + assert 'mu_d1' in model_vars + assert 'mu_c1' in model_vars + assert 'mu_logMratio' in model_vars + assert 'sigma_d1' in model_vars + assert 'sigma_c1' in model_vars + assert 'sigma_logMratio' in model_vars + + # Check that subject-specific variables exist + assert 'c1' in model_vars # Should be shape (nSubj,) + assert 'd1' in model_vars # Should be shape (nSubj,) + assert 'meta_d' in model_vars # Should be shape (nSubj,) + + # Test with sampling (minimal for speed) + model, trace = hmetad( + data=test_df, + subject='Subject', + nRatings=4, + stimuli='Stimuli', + accuracy='Accuracy', + confidence='Confidence', + sample_model=True, + num_samples=10, # Very small for testing + num_chains=1, + tune=10 + ) + assert isinstance(model, pm.Model) + assert hasattr(trace, 'posterior') + + # Check that group parameters are in trace + group_vars = list(trace.posterior.data_vars.keys()) + assert 'mu_d1' in group_vars + assert 'mu_c1' in group_vars + assert 'mu_logMratio' in group_vars + + # Check dimensions + assert trace.posterior.mu_d1.dims == ('chain', 'draw') # Group-level scalar has chain, draw dims + assert trace.posterior.c1.dims == 
('chain', 'draw', 'c1_dim_0') # Vector has additional subject dim + if __name__ == "__main__": unittest.main(argv=["first-arg-is-ignored"], exit=False)