Merge branch 'release/v0.5.0'
mdbenito committed Feb 20, 2023
2 parents 3dedb5a + 52a6e61 commit e1d28ef
Showing 63 changed files with 3,758 additions and 1,888 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.4.0
current_version = 0.5.0
commit = False
tag = False
allow_dirty = False
2 changes: 1 addition & 1 deletion .github/workflows/publish.yaml
@@ -30,7 +30,7 @@ jobs:
id: get_branch_name
if: github.ref_type == 'tag'
run: |
export BRANCH_NAME=$(git log -1 --format='%D' $GITHUB_REF | | sed -e 's/.*origin\/\(.*\),.*/\1/')
export BRANCH_NAME=$(git log -1 --format='%D' $GITHUB_REF | sed -e 's/.*origin\/\(.*\),.*/\1/')
echo ::set-output name=branch_name::${BRANCH_NAME}
shell: bash
- name: Fail if tag is not on 'master' branch
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -7,7 +7,7 @@ repos:
- id: black-jupyter
language_version: python3
- repo: https://github.com/PyCQA/isort
rev: 5.10.1
rev: 5.12.0
hooks:
- id: isort
- repo: https://github.com/kynan/nbstripout
32 changes: 30 additions & 2 deletions CHANGELOG.md
@@ -1,5 +1,33 @@
# Changelog

## 0.5.0 - 💥 Fixes, nicer interfaces and... more breaking changes 😒

- Fixed parallel and antithetic Owen sampling for Shapley values. Simplified
and extended tests.
[PR #267](https://github.com/appliedAI-Initiative/pyDVL/pull/267)
- Added `Scorer` class for a cleaner interface. Fixed minor bugs around
Group-Testing Shapley, added more tests and switched to cvxpy for the solver.
[PR #264](https://github.com/appliedAI-Initiative/pyDVL/pull/264)
- Generalised stopping criteria for valuation algorithms. Improved classes
`ValuationResult` and `Status` with more operations. Some minor issues fixed.
[PR #252](https://github.com/appliedAI-Initiative/pyDVL/pull/250)
- Fixed a bug whereby `compute_shapley_values` would only spawn one process when
using `n_jobs=-1` and Monte Carlo methods.
[PR #270](https://github.com/appliedAI-Initiative/pyDVL/pull/270)
- Bugfix in `RayParallelBackend`: wrong semantics for `kwargs`.
[PR #268](https://github.com/appliedAI-Initiative/pyDVL/pull/268)
- Splitting of problem preparation and solution in Least-Core computation.
Umbrella function for LC methods.
[PR #257](https://github.com/appliedAI-Initiative/pyDVL/pull/257)
- Operations on `ValuationResult` and `Status` and some cleanup
[PR #248](https://github.com/appliedAI-Initiative/pyDVL/pull/248)
- **Bug fix and minor improvements**: Fixes bug in TMCS with remote Ray cluster,
raises an error for dummy sequential parallel backend with TMCS, clones model
inside `Utility` before fitting by default, with flag `clone_before_fit`
to disable it, catches all warnings in `Utility` when `show_warnings` is
`False`. Adds Miner and Gloves toy games utilities.
[PR #247](https://github.com/appliedAI-Initiative/pyDVL/pull/247)

## 0.4.0 - 🏭💥 New algorithms and more breaking changes

- GH action to mark issues as stale
@@ -11,8 +39,8 @@
- **Breaking change:** Introduces a class ValuationResult to gather and inspect
results from all valuation algorithms
[PR #214](https://github.com/appliedAI-Initiative/pyDVL/pull/214)
- Fixes bug in Influence calculation with multi-dimensional input and adds
new example notebook
- Fixes bug in Influence calculation with multidimensional input and adds new
example notebook
[PR #195](https://github.com/appliedAI-Initiative/pyDVL/pull/195)
- **Breaking change**: Passes the input to `MapReduceJob` at initialization,
removes `chunkify_inputs` argument from `MapReduceJob`, removes `n_runs`
4 changes: 2 additions & 2 deletions README.md
@@ -99,7 +99,7 @@ Data Shapley values:
```python
import numpy as np
from pydvl.utils import Dataset, Utility
from pydvl.value.shapley import compute_shapley_values
from pydvl.value import *
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

@@ -111,7 +111,7 @@ dataset = Dataset(X_train, y_train, X_test, y_test)
model = LinearRegression()
utility = Utility(model, dataset)
values = compute_shapley_values(
u=utility, n_iterations=100, mode="truncated_montecarlo"
u=utility, mode="truncated_montecarlo", done=MaxUpdates(100)
)
```

3 changes: 0 additions & 3 deletions build_scripts/update_docs.py
@@ -24,9 +24,6 @@ def module_template(module_qualname: str):
:undoc-members:
----
Module members
==============
.. footbibliography::
113 changes: 82 additions & 31 deletions docs/30-data-valuation.rst
@@ -118,6 +118,34 @@ is implemented, it is important not to reuse `Utility` objects for different
datasets. You can read more about :ref:`caching setup` in the installation guide
and the documentation of the :mod:`pydvl.utils.caching` module.

Using custom scorers
^^^^^^^^^^^^^^^^^^^^

The `scoring` argument of :class:`~pydvl.utils.utility.Utility` can be used to
specify a custom :class:`~pydvl.utils.utility.Scorer` object. This is a simple
wrapper for a callable that takes a model and test data and returns a score.

More importantly, the object provides information about the range of the score,
which some methods use to estimate the number of samples necessary, and about
the default value to use when the model fails to train.

.. note::
The most important property of a `Scorer` is its default value. Because many
models will fail to fit on small subsets of the data, it is important to
provide a sensible default value for the score.

It is possible to skip the construction of the :class:`~pydvl.utils.utility.Scorer`
when constructing the `Utility` object. The following two calls are equivalent:

.. code-block:: python

   utility = Utility(
       model, dataset, "explained_variance", score_range=(-np.inf, 1), default_score=0.0
   )
   utility = Utility(
       model, dataset, Scorer("explained_variance", range=(-np.inf, 1), default=0.0)
   )

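
The role of the default value can be sketched in plain Python. The class below
is an illustrative stand-in, not pyDVL's actual ``Scorer`` implementation; it
only shows the idea of wrapping a callable together with a score range and a
fallback value for when scoring fails:

```python
from typing import Callable, Tuple


class SimpleScorer:
    """Illustrative stand-in for a scorer with a known range and a default."""

    def __init__(self, score: Callable, range: Tuple[float, float], default: float):
        self.score = score
        self.range = range
        self.default = default

    def __call__(self, model, X, y) -> float:
        try:
            return self.score(model, X, y)
        except Exception:
            # e.g. the model could not be fitted on a tiny subset of the data
            return self.default


class UnfittedModel:
    def score(self, X, y):
        raise RuntimeError("model was never fitted")


scorer = SimpleScorer(
    lambda m, X, y: m.score(X, y), range=(-float("inf"), 1.0), default=0.0
)
print(scorer(UnfittedModel(), None, None))  # prints 0.0: the default kicks in
```

Without a sensible default, a single failed fit on a small subset would
invalidate the whole utility estimate.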
Learning the utility
^^^^^^^^^^^^^^^^^^^^

@@ -174,7 +202,7 @@ definitions, but other methods are typically preferable.
values = naive_loo(utility)
The return value of all valuation functions is an object of type
:class:`~pydvl.value.results.ValuationResult`. This can be iterated over,
:class:`~pydvl.value.result.ValuationResult`. This can be iterated over,
indexed with integers, slices and Iterables, as well as converted to a
`pandas DataFrame <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html>`_.

@@ -217,11 +245,11 @@ v_u(x_i) = \frac{1}{n} \sum_{S \subseteq D \setminus \{x_i\}}
values = compute_shapley_values(utility, mode="combinatorial_exact")
df = values.to_dataframe(column='value')
We convert the return value to a
We can convert the return value to a
`pandas DataFrame <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html>`_
and name the column with the results as `value`. Please refer to the
documentation in :mod:`pydvl.value.shapley` and
:class:`~pydvl.value.results.ValuationResult` for more information.
:class:`~pydvl.value.result.ValuationResult` for more information.

Monte Carlo Combinatorial Shapley
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -240,12 +268,19 @@ same pattern:
model = ...
data = Dataset(...)
utility = Utility(model, data)
values = compute_shapley_values(utility, mode="combinatorial_montecarlo")
values = compute_shapley_values(
utility, mode="combinatorial_montecarlo", done=MaxUpdates(1000)
)
df = values.to_dataframe(column='cmc')
The DataFrames returned by most Monte Carlo methods will contain approximate
standard errors as an additional column, in this case named `cmc_stderr`.

Note the use of the object :class:`~pydvl.value.stopping.MaxUpdates` as the
stopping condition. This is an instance of a
:class:`~pydvl.value.stopping.StoppingCriterion`. Other examples are
:class:`~pydvl.value.stopping.MaxTime` and
:class:`~pydvl.value.stopping.StandardError`.
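
Conceptually, a stopping criterion can be pictured as a callable that is
queried after each value update and reports convergence. The following toy
sketch illustrates the idea only; it is not pyDVL's actual interface:

```python
class ToyMaxUpdates:
    """Toy stopping criterion: report convergence after a fixed number of checks."""

    def __init__(self, n_updates: int):
        self.n_updates = n_updates
        self.count = 0

    def __call__(self) -> bool:
        self.count += 1
        return self.count >= self.n_updates


done = ToyMaxUpdates(100)
updates = 0
while not done():  # in a real algorithm each iteration refines the estimates
    updates += 1
print(updates)  # prints 99: the hundredth check reports convergence
```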


Owen sampling
^^^^^^^^^^^^^
@@ -281,6 +316,10 @@ sampling, and its variant *Antithetic Owen Sampling* in the documentation for th
function doing the work behind the scenes:
:func:`~pydvl.value.shapley.montecarlo.owen_sampling_shapley`.

Note that in this case we do not pass a
:class:`~pydvl.value.stopping.StoppingCriterion` to the function, but instead
the number of iterations and the maximum number of samples to use in the
integration.

Permutation Shapley
^^^^^^^^^^^^^^^^^^^
@@ -309,7 +348,7 @@ efficient enough to be useful in some applications.
data = Dataset(...)
utility = Utility(model, data)
values = compute_shapley_values(
u=utility, mode="truncated_montecarlo", n_iterations=100
u=utility, mode="truncated_montecarlo", done=MaxUpdates(1000)
)
@@ -358,14 +397,15 @@ $$
but we don't advocate its use because of the speed and memory cost. Despite
our best efforts, the number of samples required in practice for convergence
can be several orders of magnitude worse than with e.g. Truncated Monte Carlo.
Additionally, the CSP can sometimes turn out to be infeasible.

Usage follows the same pattern as every other Shapley method, but with the
addition of an ``eps`` parameter required for the solution of the CSP. It should
be the same value used to compute the minimum number of samples required. This
can be done with :func:`~pydvl.value.shapley.gt.num_samples_eps_delta`, but note
that the number returned will be huge! In practice, fewer samples can be enough,
but the actual number will strongly depend on the utility, in particular its
variance.
addition of an ``epsilon`` parameter required for the solution of the CSP. It
should be the same value used to compute the minimum number of samples required.
This can be done with :func:`~pydvl.value.shapley.gt.num_samples_eps_delta`, but
note that the number returned will be huge! In practice, fewer samples can be
enough, but the actual number will strongly depend on the utility, in particular
its variance.

.. code-block:: python
@@ -459,29 +499,18 @@ Monte Carlo Least Core
Because the number of subsets $S \subseteq D \setminus \{x_i\}$ is
$2^{ | D | - 1 }$, one typically must resort to approximations.

The simplest approximation consists of two relaxations of the Least Core
(:footcite:t:`yan_if_2021`):

- Further relaxing the coalitional rationality property by
a constant value $\epsilon > 0$:

$$
\sum_{x_i\in S} v_u(x_i) + e + \epsilon \geq u(S)
$$

- Using a fraction of all subsets instead of all possible subsets.

Combined, this gives us the $(\epsilon, \delta)$-*probably approximate
least core* that satisfies the following property:
The simplest approximation consists in using a fraction of all subsets for the
constraints. :footcite:t:`yan_if_2021` show that a quantity of order
$\mathcal{O}((n - \log \Delta ) / \delta^2)$ is enough to obtain a so-called
$\delta$-*approximate least core* with high probability. I.e. the following
property holds with probability $1-\Delta$ over the choice of subsets:

$$
P_{S\sim D}\left[\sum_{x_i\in S} v_u(x_i) + e^{*} + \epsilon \geq u(S)\right]
\geq 1 - \delta
\mathbb{P}_{S\sim D}\left[\sum_{x_i\in S} v_u(x_i) + e^{*} \geq u(S)\right]
\geq 1 - \delta,
$$

Where $e^{*}$ is the optimal least core subsidy.

With these relaxations, we obtain a polynomial running time.
where $e^{*}$ is the optimal least core subsidy.

.. code-block:: python
@@ -497,6 +526,28 @@

``n_iterations`` needs to be at least equal to the number of data points.
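
At its core, the method reduces to a linear program over a sample of subset
constraints. The following self-contained sketch (using SciPy's ``linprog``,
not pyDVL's implementation) illustrates this for a "miner" toy game in which
every pair of miners can carry one gold bar, i.e. u(S) = |S| // 2:

```python
import numpy as np
from scipy.optimize import linprog


def miner_utility(subset) -> float:
    """Toy 'miner' game: every pair of miners can carry one gold bar."""
    return len(subset) // 2


def monte_carlo_least_core(u, n: int, n_samples: int, seed: int = 42):
    """Estimate least core values from a random sample of subset constraints.

    Variables are (v_1, ..., v_n, e). We minimize the subsidy e subject to
    sum_{i in S} v_i + e >= u(S) for each sampled S, and sum_i v_i = u(D).
    """
    rng = np.random.default_rng(seed)
    subsets = [np.flatnonzero(rng.integers(0, 2, n)) for _ in range(n_samples)]

    c = np.zeros(n + 1)
    c[-1] = 1.0  # objective: minimize the subsidy e
    A_ub = np.zeros((n_samples, n + 1))
    b_ub = np.zeros(n_samples)
    for k, S in enumerate(subsets):
        A_ub[k, S] = -1.0
        A_ub[k, -1] = -1.0  # -(sum_{i in S} v_i) - e <= -u(S)
        b_ub[k] = -u(S)
    A_eq = np.ones((1, n + 1))
    A_eq[0, -1] = 0.0  # efficiency: values sum to u(D)
    b_eq = [u(range(n))]
    bounds = [(None, None)] * n + [(0, None)]  # v_i free, subsidy e >= 0
    res = linprog(c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq,
                  bounds=bounds, method="highs")
    return res.x[:n], res.x[-1]


values, subsidy = monte_carlo_least_core(miner_utility, n=4, n_samples=100)
print(values.sum(), subsidy)  # values sum to u(D) = 2, and the subsidy is ~0
```

For this symmetric game the least core assigns each miner the same value and
requires no subsidy, so the LP finds e close to zero.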

Because computing the Least Core values requires the solution of a linear and a
quadratic problem *after* computing all the utility values, we offer the
possibility of separating the two steps. This is useful when running multiple
experiments: use
:func:`~pydvl.value.least_core.montecarlo.mclc_prepare_problem` to prepare a
list of problems to solve, then solve them in parallel with
:func:`~pydvl.value.least_core.common.lc_solve_problems`.

.. code-block:: python

   from pydvl.utils import Dataset, Utility
   from pydvl.value.least_core import mclc_prepare_problem, lc_solve_problems

   model = ...
   dataset = Dataset(...)
   n_iterations = ...
   utility = Utility(model, dataset)
   n_experiments = 10
   problems = [mclc_prepare_problem(utility, n_iterations=n_iterations)
               for _ in range(n_experiments)]
   values = lc_solve_problems(problems)

Other methods
=============

@@ -528,7 +579,7 @@ nature of every (non-trivial) ML problem can have an effect:

pyDVL offers a dedicated :func:`function composition
<pydvl.utils.types.compose_score>` for scorer functions which can be used to
squash a score. The following is defined in module :mod:`~pydvl.utils.numeric`:
squash a score. The following is defined in module :mod:`~pydvl.utils.scorer`:

.. code-block:: python
9 changes: 5 additions & 4 deletions requirements-dev.txt
@@ -1,13 +1,14 @@
black[jupyter] == 22.10.0
isort == 5.10.1
isort == 5.12.0
jupyter
mypy == 0.982
nbconvert
nbconvert>=7.2.9
nbstripout == 0.6.1
bump2version
pre-commit == 2.20.0
pytest
pre-commit==3.0.4
pytest==7.2.1
pytest-cov
pytest-docker==0.12.0
pytest-mock
pytest-timeout
ray[default] >= 0.8
2 changes: 1 addition & 1 deletion requirements-notebooks.txt
@@ -1,4 +1,4 @@
torch==1.13.1
torchvision==0.14.1
datasets==2.6.1
Pillow==9.2.0
pillow==9.3.0
10 changes: 4 additions & 6 deletions setup.py
@@ -11,7 +11,7 @@
package_dir={"": "src"},
packages=find_packages(where="src"),
include_package_data=True,
version="0.4.0",
version="0.5.0",
description="The Python Data Valuation Library",
install_requires=[
line
@@ -20,9 +20,7 @@
],
setup_requires=["wheel"],
tests_require=["pytest"],
extras_require={
"influence": ["torch"],
},
extras_require={"influence": ["torch"]},
author="appliedAI Institute gGmbH",
long_description=long_description,
long_description_content_type="text/markdown",
@@ -41,8 +39,8 @@
"License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)",
],
project_urls={
"Source": "https://appliedAI-Initiative/pydvl",
"Documentation": "https://appliedai-initiative.github.io/pyDVL/",
"Source": "https://github.com/appliedAI-Initiative/pydvl",
"Documentation": "https://appliedai-initiative.github.io/pyDVL",
"TransferLab": "https://transferlab.appliedai.de",
},
)
2 changes: 1 addition & 1 deletion src/pydvl/__init__.py
@@ -1 +1 @@
__version__ = "0.4.0"
__version__ = "0.5.0"