diff --git a/patsy/build.py b/patsy/build.py
index 470a83d..74b13aa 100644
--- a/patsy/build.py
+++ b/patsy/build.py
@@ -344,7 +344,7 @@ def test__subterm_column_names_iter_and__build_subterm():
                    mat3)
     assert np.allclose(mat3, 1)

-def _factors_memorize(factors, data_iter_maker, eval_env):
+def _factors_memorize(factors, data_iter_maker, eval_env, var_names):
     # First, start off the memorization process by setting up each factor's
     # state and finding out how many passes it will need:
     factor_states = {}
@@ -362,7 +362,7 @@ def _factors_memorize(factors, data_iter_maker, eval_env):
             memorize_needed.add(factor)
     which_pass = 0
     while memorize_needed:
-        for data in data_iter_maker():
+        for data in safe_data_maker(data_iter_maker, var_names):
             for factor in memorize_needed:
                 state = factor_states[factor]
                 factor.memorize_chunk(state, which_pass, data)
@@ -373,6 +373,18 @@ def _factors_memorize(factors, data_iter_maker, eval_env):
         which_pass += 1
     return factor_states

+
+def safe_data_maker(data_iter_maker, var_names):
+    """Call ``data_iter_maker(var_names)`` if it accepts the variable names
+    as a parameter; otherwise fall back to calling ``data_iter_maker()``.
+    """
+    var_names = list(var_names)
+    try:
+        return data_iter_maker(var_names)
+    except TypeError:
+        return data_iter_maker()
+
+
 def test__factors_memorize():
     class MockFactor(object):
         def __init__(self, requested_passes, token):
@@ -408,7 +420,7 @@ def __call__(self):
     f1 = MockFactor(1, "f1")
     f2a = MockFactor(2, "f2a")
     f2b = MockFactor(2, "f2b")
-    factor_states = _factors_memorize(set([f0, f1, f2a, f2b]), data, {})
+    factor_states = _factors_memorize(set([f0, f1, f2a, f2b]), data, {}, [])
     assert data.calls == 2
     mem_chunks0 = [("memorize_chunk", 0)] * data.CHUNKS
     mem_chunks1 = [("memorize_chunk", 1)] * data.CHUNKS
@@ -434,11 +446,12 @@ def __call__(self):
     }
     assert factor_states == expected

-def _examine_factor_types(factors, factor_states, data_iter_maker, NA_action):
+def _examine_factor_types(factors, factor_states, data_iter_maker, NA_action,
+                          var_names):
     num_column_counts = {}
     cat_sniffers = {}
     examine_needed = set(factors)
-    for data in data_iter_maker():
+    for data in safe_data_maker(data_iter_maker, var_names):
         for factor in list(examine_needed):
             value = factor.eval(factor_states[factor], data)
             if factor in cat_sniffers or guess_categorical(value):
@@ -519,9 +532,10 @@ def next(self):
     }

     it = DataIterMaker()
+    var_names = []
     (num_column_counts, cat_levels_contrasts,
      ) = _examine_factor_types(factor_states.keys(), factor_states, it,
-                               NAAction())
+                               NAAction(), var_names)
     assert it.i == 2
     iterations = 0
     assert num_column_counts == {num_1dim: 1, num_1col: 1, num_4col: 4}
@@ -537,7 +551,7 @@ def next(self):
     no_read_necessary = [num_1dim, num_1col, num_4col, categ_1col, bool_1col]
     (num_column_counts, cat_levels_contrasts,
      ) = _examine_factor_types(no_read_necessary, factor_states, it,
-                               NAAction())
+                               NAAction(), var_names)
     assert it.i == 0
     assert num_column_counts == {num_1dim: 1, num_1col: 1, num_4col: 4}
     assert cat_levels_contrasts == {
@@ -562,7 +576,7 @@ def next(self):
     it = DataIterMaker()
     try:
         _examine_factor_types([illegal_factor], illegal_factor_states, it,
-                              NAAction())
+                              NAAction(), var_names)
     except PatsyError as e:
         assert e.origin is illegal_factor.origin
     else:
@@ -686,14 +700,18 @@ def design_matrix_builders(termlists, data_iter_maker, eval_env,
     for termlist in termlists:
         for term in termlist:
             all_factors.update(term.factors)
-    factor_states = _factors_memorize(all_factors, data_iter_maker, eval_env)
+    var_names = {i for f in all_factors
+                 for i in f.var_names(eval_env=eval_env)}
+    factor_states = _factors_memorize(all_factors, data_iter_maker, eval_env,
+                                      var_names)
     # Now all the factors have working eval methods, so we can evaluate them
     # on some data to find out what type of data they return.
     (num_column_counts,
      cat_levels_contrasts) = _examine_factor_types(all_factors,
                                                    factor_states,
                                                    data_iter_maker,
-                                                   NA_action)
+                                                   NA_action,
+                                                   var_names)
     # Now we need the factor infos, which encapsulate the knowledge of
     # how to turn any given factor into a chunk of data:
     factor_infos = {}
diff --git a/patsy/desc.py b/patsy/desc.py
index 8842b8b..40d0bbf 100644
--- a/patsy/desc.py
+++ b/patsy/desc.py
@@ -65,6 +65,30 @@ def name(self):
         else:
             return "Intercept"

+    def var_names(self, eval_env=0):
+        """Returns a set of variable names that are used in the :class:`Term`,
+        but not available in the current evaluation environment. These are
+        likely to be provided by data.
+
+        :arg eval_env: Either a :class:`EvalEnvironment` which will be used to
+          look up any variables referenced in the :class:`Term` that cannot be
+          found in the :class:`EvalEnvironment`, or else a depth represented
+          as an integer which will be passed to :meth:`EvalEnvironment.capture`.
+          ``eval_env=0`` means to use the context of the function calling
+          :meth:`var_names` for lookups. If calling this function from a
+          library, you probably want ``eval_env=1``, which means that variables
+          should be resolved in *your* caller's namespace.
+
+        :returns: A set of strings of the potential variable names.
+        """
+        if not eval_env:
+            eval_env = EvalEnvironment.capture(eval_env, reference=1)
+        if self.factors:
+            return {i for f in self.factors
+                    for i in f.var_names(eval_env=eval_env)}
+        else:
+            return set()
+
     __getstate__ = no_pickling

 INTERCEPT = Term([])
@@ -76,6 +100,9 @@ def __init__(self, name):
     def name(self):
         return self._name

+    def var_names(self, eval_env=0):
+        return {'{}_var'.format(self._name)}
+
 def test_Term():
     assert Term([1, 2, 1]).factors == (1, 2)
     assert Term([1, 2]) == Term([2, 1])
@@ -85,6 +112,9 @@ def test_Term():
     assert Term([f1, f2]).name() == "a:b"
     assert Term([f2, f1]).name() == "b:a"
     assert Term([]).name() == "Intercept"
+    assert Term([f1]).var_names() == {'a_var'}
+    assert Term([f1, f2]).var_names() == {'a_var', 'b_var'}
+    assert Term([]).var_names() == set()

     assert_no_pickling(Term([]))

@@ -148,7 +178,7 @@ def term_code(term):
                        if term != INTERCEPT]
         result += " + ".join(term_names)
         return result
-    
+
     @classmethod
     def from_formula(cls, tree_or_string):
         """Construct a :class:`ModelDesc` from a formula string.
diff --git a/patsy/design_info.py b/patsy/design_info.py
index 438a23c..69d460e 100644
--- a/patsy/design_info.py
+++ b/patsy/design_info.py
@@ -36,6 +36,7 @@
 from patsy.constraint import linear_constraint
 from patsy.contrasts import ContrastMatrix
 from patsy.desc import ModelDesc, Term
+from collections import OrderedDict

 class FactorInfo(object):
     """A FactorInfo object is a simple class that provides some metadata about
@@ -659,6 +660,86 @@ def subset(self, which_terms):
                           factor_infos=new_factor_infos,
                           term_codings=new_term_codings)

+    def var_names(self, eval_env=0):
+        """Returns a set of variable names that are used in the
+        :class:`DesignInfo`, but not available in the current evaluation
+        environment. These are likely to be provided by data.
+
+        :arg eval_env: Either a :class:`EvalEnvironment` which will be used to
+          look up any variables referenced in the :class:`DesignInfo` that
+          cannot be found in the :class:`EvalEnvironment`, or else a depth
+          represented as an integer which will be passed to
+          :meth:`EvalEnvironment.capture`. ``eval_env=0`` means to use the
+          context of the function calling :meth:`var_names` for lookups.
+          If calling this function from a library, you probably want
+          ``eval_env=1``, which means that variables should be resolved in
+          *your* caller's namespace.
+
+        :returns: A set of strings of the potential variable names.
+        """
+        if not eval_env:
+            from patsy.eval import EvalEnvironment
+            eval_env = EvalEnvironment.capture(eval_env, reference=1)
+        if self.terms:
+            return {i for t in self.terms for i in t.var_names(eval_env)}
+        else:
+            return set()
+
+    def partial(self, columns, product=False, eval_env=0):
+        """Returns a partial prediction array in which only the variables in
+        the dict ``columns`` are transformed per the :class:`DesignInfo`
+        transformations. Terms that are not influenced by ``columns`` are
+        returned as zero.
+
+        This is useful for performing a partial prediction on unseen data
+        and for viewing marginal differences in factors.
+
+        :arg columns: A dict mapping each column name for which marginal
+          predictions are desired to the values at which to predict it.
+
+        :arg product: When ``True``, the returned numpy array represents the
+          Cartesian product of the values in ``columns``.
+
+        :returns: A numpy array of the partial design matrix.
+        """
+        from .highlevel import dmatrix
+        from types import ModuleType
+
+        if not eval_env:
+            from patsy.eval import EvalEnvironment
+            eval_env = EvalEnvironment.capture(eval_env, reference=1)
+
+        # Keep only callables and modules from the eval_env namespaces.
+        namespaces = [{key: value} for ns in eval_env._namespaces
+                      for key, value in six.iteritems(ns)
+                      if callable(value) or isinstance(value, ModuleType)]
+        eval_env._namespaces = namespaces
+
+        if product:
+            columns = _column_product(columns)
+        rows = None
+        for col in columns:
+            if rows and rows != len(columns[col]):
+                raise ValueError('all columns must be of same length')
+            rows = len(columns[col])
+        parts = []
+        for term, subterm in six.iteritems(self.term_codings):
+            term_vars = term.var_names(eval_env)
+            present = True
+            for term_var in term_vars:
+                if term_var not in columns:
+                    present = False
+            if present and (term.name() != 'Intercept'):
+                # This seems like an inelegant way of keeping the Intercept
+                # out of the output
+                di = self.subset('0 + {}'.format(term.name()))
+                parts.append(dmatrix(di, columns))
+            else:
+                num_columns = sum(s.num_columns for s in subterm)
+                dm = np.zeros((rows, num_columns))
+                parts.append(dm)
+        return np.hstack(parts)
+
     @classmethod
     def from_array(cls, array_like, default_column_prefix="column"):
         """Find or construct a DesignInfo appropriate for a given array_like.
@@ -693,14 +774,21 @@ def from_array(cls, array_like, default_column_prefix="column"):

     __getstate__ = no_pickling

+
 def test_DesignInfo():
     from nose.tools import assert_raises
+    from patsy.eval import EvalEnvironment
+
     class _MockFactor(object):
         def __init__(self, name):
             self._name = name

         def name(self):
             return self._name
+
+        def var_names(self, eval_env=0):
+            return {'{}_var'.format(self._name)}
+
     f_x = _MockFactor("x")
     f_y = _MockFactor("y")
     t_x = Term([f_x])
@@ -735,6 +823,10 @@ def name(self):
     # smoke test
     repr(di)

+    assert di.var_names() == {'x_var', 'y_var'}
+    eval_env = EvalEnvironment.capture(0)
+    assert di.var_names(eval_env) == {'x_var', 'y_var'}
+
     assert_no_pickling(di)

     # One without term objects
@@ -756,6 +848,10 @@ def name(self):
     assert di.slice("a3") == slice(2, 3)
     assert di.slice("b") == slice(3, 4)

+    assert di.var_names() == set()
+    eval_env = EvalEnvironment.capture(0)
+    assert di.var_names(eval_env) == set()
+
     # Check intercept handling in describe()
     assert DesignInfo(["Intercept", "a", "b"]).describe() == "1 + a + b"

@@ -974,7 +1070,7 @@ def _format_float_column(precision, col):
         else:
             break
     return col_strs
-    
+
 def test__format_float_column():
     def t(precision, numbers, expected):
         got = _format_float_column(precision, np.asarray(numbers))
@@ -1099,7 +1195,7 @@ def max_width(col):
                            + np.sum(column_widths))
             print_numbers = (total_width < MAX_TOTAL_WIDTH)
         else:
-            print_numbers = False 
+            print_numbers = False

         p.begin_group(INDENT, "DesignMatrix with shape %s" % (self.shape,))
         p.breakable("\n" + " " * p.indentation)
@@ -1197,3 +1293,78 @@ def test_design_matrix():
     repr(DesignMatrix(np.zeros((1, 0))))
     repr(DesignMatrix(np.zeros((0, 1))))
     repr(DesignMatrix(np.zeros((0, 0))))
+
+
+def test_DesignInfo_partial():
+    from .highlevel import dmatrix
+    from numpy.testing import assert_allclose
+    from patsy.eval import EvalEnvironment
+    eval_env = EvalEnvironment.capture(0)
+    a = np.array(['a', 'b', 'a', 'b', 'a', 'a', 'b', 'a'])
+    b = np.array([1, 3, 2, 4, 1, 3, 1, 1])
+    c = np.array([4, 3, 2, 1, 6, 4, 2, 1])
+    dm = dmatrix('a + bs(b, df=3, degree=3) + np.log(c)')
+    x = np.zeros((3, 6))
+    x[1, 1] = 1
+    y = dm.design_info.partial({'a': ['a', 'b', 'a']})
+    assert_allclose(x, y)
+    y = dm.design_info.partial({'a': ['a', 'b', 'a']}, eval_env=eval_env)
+    assert_allclose(x, y)
+
+    x = np.zeros((2, 6))
+    x[1, 1] = 1
+    x[1, 5] = np.log(3)
+    p = OrderedDict([('a', ['a', 'b']), ('c', [1, 3])])
+    y = dm.design_info.partial(p)
+    assert_allclose(x, y)
+    y = dm.design_info.partial(p, eval_env=eval_env)
+    assert_allclose(x, y)
+
+    x = np.zeros((4, 6))
+    x[2, 1] = 1
+    x[3, 1] = 1
+    x[1, 5] = np.log(3)
+    x[3, 5] = np.log(3)
+    y = dm.design_info.partial(p, product=True)
+    assert_allclose(x, y)
+
+    dm = dmatrix('a * c')
+    y = dm.design_info.partial(p)
+    x = np.array([[0, 0, 1, 0], [0, 1, 3, 3]])
+    assert_allclose(x, y)
+
+    from nose.tools import assert_raises
+    assert_raises(ValueError, dm.design_info.partial, {'a': ['a', 'b'],
+                                                       'b': [1, 2, 3]})
+
+    def some_function(x):
+        return np.where(x > 2, 1, 2)
+
+    dm = dmatrix('1 + some_function(c)')
+    x = np.array([[0, 2],
+                  [0, 2],
+                  [0, 1]])
+    y = dm.design_info.partial({'c': np.array([1, 2, 3])})
+    assert_allclose(x, y)
+
+
+def _column_product(columns):
+    from itertools import product
+    cols = []
+    values = []
+    for col, value in six.iteritems(columns):
+        cols.append(col)
+        values.append(value)
+    values = [value for value in product(*values)]
+    values = [value for value in zip(*values)]
+    return OrderedDict([(col, list(value))
+                        for col, value in zip(cols, values)])
+
+
+def test_column_product():
+    x = OrderedDict([('a', [1, 2, 3]), ('b', ['a', 'b'])])
+    y = OrderedDict([('a', [1, 1, 2, 2, 3, 3]),
+                     ('b', ['a', 'b', 'a', 'b', 'a', 'b'])])
+    x = _column_product(x)
+    assert x['a'] == y['a']
+    assert x['b'] == y['b']
diff --git a/patsy/eval.py b/patsy/eval.py
index d4ed83f..bac2c65 100644
--- a/patsy/eval.py
+++ b/patsy/eval.py
@@ -448,6 +448,31 @@ def __init__(self, code, origin=None):
         self.code = normalize_token_spacing(code)
         self.origin = origin

+    def var_names(self, eval_env=0):
+        """Returns a set of variable names that are used in the
+        :class:`EvalFactor`, but not available in the current evaluation
+        environment. These are likely to be provided by data.
+
+        :arg eval_env: Either a :class:`EvalEnvironment` which will be used to
+          look up any variables referenced in the :class:`EvalFactor` that
+          cannot be found in the :class:`EvalEnvironment`, or else a depth
+          represented as an integer which will be passed to
+          :meth:`EvalEnvironment.capture`. ``eval_env=0`` means to use the
+          context of the function calling :meth:`var_names` for lookups.
+          If calling this function from a library, you probably want
+          ``eval_env=1``, which means that variables should be resolved in
+          *your* caller's namespace.
+
+        :returns: A set of strings of the potential variable names.
+        """
+        if not eval_env:
+            eval_env = EvalEnvironment.capture(eval_env, reference=1)
+        eval_env = eval_env.with_outer_namespace(_builtins_dict)
+        env_namespace = eval_env.namespace
+        names = set(name for name in ast_names(self.code)
+                    if name not in env_namespace)
+        return names
+
     def name(self):
         return self.code

@@ -691,6 +716,31 @@ def test_EvalFactor_end_to_end():
                         "y": np.array([10, 11, 100, 3])})
             == [254, 256, 355, 236])

+
+def test_EvalFactor_varnames():
+    e = EvalFactor('a + b')
+    assert e.var_names() == {'a', 'b'}
+    from patsy.state import stateful_transform
+
+    class bar(object):
+        pass
+
+    foo = stateful_transform(lambda: "FOO-OBJ")
+    zed = stateful_transform(lambda: "ZED-OBJ")
+    bah = stateful_transform(lambda: "BAH-OBJ")
+    eval_env = EvalEnvironment.capture(0)
+    e = EvalFactor('foo(a) + bar.qux(b) + zed(bah(c))+ d')
+    state = {}
+    eval_env = EvalEnvironment.capture(0)
+    passes = e.memorize_passes_needed(state, eval_env)
+    print(passes)
+    print(state)
+    assert passes == 2
+    for name in ["foo", "bah", "zed"]:
+        assert state["eval_env"].namespace[name] is locals()[name]
+    assert e.var_names(eval_env=eval_env) == {'a', 'b', 'c', 'd'}
+
+
 def annotated_tokens(code):
     prev_was_dot = False
     it = PushbackAdapter(python_tokenize(code))
diff --git a/patsy/test_build.py b/patsy/test_build.py
index c843f9f..5a24c37 100644
--- a/patsy/test_build.py
+++ b/patsy/test_build.py
@@ -31,7 +31,7 @@ def assert_full_rank(m):
     u, s, v = np.linalg.svd(m)
     rank = np.sum(s > 1e-10)
     assert rank == m.shape[1]
-    
+
 def test_assert_full_rank():
     assert_full_rank(np.eye(10))
     assert_full_rank([[1, 0], [1, 0], [1, 0], [1, 1]])
@@ -44,7 +44,7 @@ def test_assert_full_rank():
     # col1 + col2 = col3
     assert_raises(AssertionError, assert_full_rank,
                   [[1, 2, 3], [1, 5, 6], [1, 6, 7]])
-    
+
 def make_termlist(*entries):
     terms = []
     for entry in entries:
@@ -116,11 +116,11 @@ def test_simple():
                            [1, 0, x1[1], 0],
                            [0, 1, x1[2], x1[2]],
                            [0, 1, x1[3], x1[3]]])
-    
+
     m = make_matrix(data, 3, [["x1"], ["x2"], ["x2", "x1"]],
                     column_names=["x1", "x2", "x2:x1"])
     assert np.allclose(m, np.column_stack((x1, x2, x1 * x2)))
-    
+
 def test_R_bugs():
     data = balanced(a=2, b=2, c=2)
     data["x"] = np.linspace(0, 1, len(data["a"]))
@@ -253,7 +253,7 @@ def test_return_type():
     def iter_maker():
         yield data
     builder = design_matrix_builders([make_termlist("x")], iter_maker, 0)[0]
-    
+
     # Check explicitly passing return_type="matrix" works
     mat = build_design_matrices([builder], data, return_type="matrix")[0]
     assert isinstance(mat, DesignMatrix)
@@ -298,7 +298,7 @@ def iter_maker():
     assert mat.shape == (2, 3)
     # According to this (and only this) function, NaN == NaN.
     np.testing.assert_array_equal(mat, [[1.0, 0.0, 10.0], [0.0, 1.0, np.nan]])
-    
+
     # NA_action="raise"
     assert_raises(PatsyError,
                   build_design_matrices,
@@ -596,7 +596,7 @@ def iter_maker():
 def test_contrast():
     from patsy.contrasts import ContrastMatrix, Sum
     values = ["a1", "a3", "a1", "a2"]
-    
+
     # No intercept in model, full-rank coding of 'a'
     m = make_matrix({"a": C(values)}, 3, [["a"]],
                     column_names=["a[a1]", "a[a2]", "a[a3]"])
@@ -605,7 +605,7 @@ def test_contrast():
                        [0, 0, 1],
                        [1, 0, 0],
                        [0, 1, 0]])
-    
+
     for s in (Sum, Sum()):
         m = make_matrix({"a": C(values, s)}, 3, [["a"]],
                         column_names=["a[mean]", "a[S.a1]", "a[S.a2]"])
@@ -614,7 +614,7 @@ def test_contrast():
                            [1,-1, -1],
                            [1, 1, 0],
                            [1, 0, 1]])
-    
+
     m = make_matrix({"a": C(values, Sum(omit=0))}, 3, [["a"]],
                     column_names=["a[mean]", "a[S.a2]", "a[S.a3]"])
     # Output from R
@@ -631,7 +631,7 @@ def test_contrast():
                        [1, 0, 1],
                        [1, 0, 0],
                        [1, 1, 0]])
-    
+
     for s in (Sum, Sum()):
         m = make_matrix({"a": C(values, s)}, 3, [[], ["a"]],
                         column_names=["Intercept", "a[S.a1]", "a[S.a2]"])
@@ -640,7 +640,7 @@ def test_contrast():
                            [1,-1, -1],
                            [1, 1, 0],
                            [1, 0, 1]])
-    
+
     m = make_matrix({"a": C(values, Sum(omit=0))}, 3, [[], ["a"]],
                     column_names=["Intercept", "a[S.a2]", "a[S.a3]"])
     # Output from R
@@ -740,3 +740,31 @@ def t(which_terms, variables, columns):
     min_di_subset = min_di.subset(["c", "a"])
     assert min_di_subset.column_names == ["c", "a"]
     assert min_di_subset.terms is None
+
+
+def test_safe_data_maker():
+    from patsy.build import safe_data_maker
+    if not have_pandas:
+        return
+    from pandas.util.testing import assert_frame_equal
+    data = pandas.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9],
+                             'b': [4, 5, 6, 7, 8, 9, 1, 2, 3],
+                             'c': [7, 8, 9, 1, 2, 3, 4, 5, 6]})
+
+    def iter_maker():
+        yield data.iloc[:4]
+        yield data.iloc[4:]
+    d = safe_data_maker(iter_maker, ['a', 'b'])
+    d2 = next(d)
+    assert_frame_equal(d2, data.iloc[:4])
+    d2 = next(d)
+    assert_frame_equal(d2, data.iloc[4:])
+
+    def iter_maker(varnames):
+        yield data[varnames].iloc[:4]
+        yield data[varnames].iloc[4:]
+    d = safe_data_maker(iter_maker, ['a', 'b'])
+    d2 = next(d)
+    assert_frame_equal(d2, data[['a', 'b']].iloc[:4])
+    d2 = next(d)
+    assert_frame_equal(d2, data[['a', 'b']].iloc[4:])
diff --git a/patsy/user_util.py b/patsy/user_util.py
index b0aa7e8..bf8746e 100644
--- a/patsy/user_util.py
+++ b/patsy/user_util.py
@@ -183,6 +183,9 @@ def __init__(self, varname,
     def name(self):
         return self._varname

+    def var_names(self, eval_env=0):
+        return {self._varname}
+
     def __repr__(self):
         return "%s(%r)" % (self.__class__.__name__, self._varname)

@@ -220,6 +223,7 @@ def eval(self, memorize_state, data):
 def test_LookupFactor():
     l_a = LookupFactor("a")
     assert l_a.name() == "a"
+    assert l_a.var_names() == {'a'}
     assert l_a == LookupFactor("a")
     assert l_a != LookupFactor("b")
     assert hash(l_a) == hash(LookupFactor("a"))
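Usage sketch (not part of the patch): the snippet below shows how the ``DesignInfo.var_names()`` and ``DesignInfo.partial()`` APIs added above might be used. It is a minimal example following the patterns in ``test_DesignInfo_partial``; the DataFrame, its column names, and the printed values are invented for illustration, and pandas is assumed to be available.

import numpy as np
import pandas as pd
from patsy import dmatrix

# Variables live in a DataFrame rather than in the calling namespace, so
# var_names() reports them as names expected to come from data.
df = pd.DataFrame({"a": ["a1", "a2", "a1", "a2"],
                   "c": [4.0, 3.0, 2.0, 1.0]})
dm = dmatrix("a + np.log(c)", df)
di = dm.design_info

print(di.var_names())                # e.g. {'a', 'c'}

# Partial prediction: only the 'a' term is evaluated; the np.log(c)
# column comes back as zeros because 'c' is not supplied.
print(di.partial({"a": ["a1", "a2"]}))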