From 32e517d9dfdbcb270549f6ea223f677e2082d07f Mon Sep 17 00:00:00 2001
From: Tom Kingsford
Date: Wed, 28 Jun 2023 19:10:29 +1200
Subject: [PATCH 1/2] fix: make vstack correctly handle stacking dataframes and series

---
 modAL/expected_error.py |  2 +-
 modAL/utils/data.py     | 43 ++++++++++++++++++++++++++++++-----------
 rtd_requirements.txt    |  3 ++-
 3 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/modAL/expected_error.py b/modAL/expected_error.py
index d7b3611..848ae7b 100644
--- a/modAL/expected_error.py
+++ b/modAL/expected_error.py
@@ -16,7 +16,7 @@
 
 
 def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = 'binary',
-                             p_subsample: np.float = 1.0, n_instances: int = 1,
+                             p_subsample: float = 1.0, n_instances: int = 1,
                              random_tie_break: bool = False) -> np.ndarray:
     """
     Expected error reduction query strategy.
diff --git a/modAL/utils/data.py b/modAL/utils/data.py
index 3e707ff..c75350e 100644
--- a/modAL/utils/data.py
+++ b/modAL/utils/data.py
@@ -6,7 +6,7 @@
 
 try:
     import torch
-except:
+except ImportError:
     pass
 
 
@@ -23,22 +23,37 @@ def data_vstack(blocks: Sequence[modALinput]) -> modALinput:
     Returns:
         New sequence of vertically stacked elements.
     """
+
+    if not blocks:
+        return blocks
+
+    types = {type(block) for block in blocks}
+
     if any([sp.issparse(b) for b in blocks]):
         return sp.vstack(blocks)
-    elif isinstance(blocks[0], pd.DataFrame):
-        return blocks[0].append(blocks[1:])
-    elif isinstance(blocks[0], np.ndarray):
+    elif types - {pd.DataFrame, pd.Series} == set():
+        def _block_to_df(block):
+            if isinstance(block, pd.DataFrame):
+                return block
+            elif isinstance(block, pd.Series):
+                # interpret series as a row
+                return block.to_frame().T
+            else:
+                raise TypeError(f"Expected DataFrame or Series but encountered {type(block)}")
+
+        return pd.concat([_block_to_df(block) for block in blocks])
+    elif types == {np.ndarray}:
         return np.concatenate(blocks)
-    elif isinstance(blocks[0], list):
+    elif types == {list}:
         return np.concatenate(blocks).tolist()
 
     try:
-        if torch.is_tensor(blocks[0]):
+        if all(torch.is_tensor(block) for block in blocks):
             return torch.cat(blocks)
     except:
         pass
 
-    raise TypeError("%s datatype is not supported" % type(blocks[0]))
+    raise TypeError("%s datatype(s) not supported" % types)
 
 
 def data_hstack(blocks: Sequence[modALinput]) -> modALinput:
@@ -51,13 +66,19 @@ def data_hstack(blocks: Sequence[modALinput]) -> modALinput:
     Returns:
         New sequence of horizontally stacked elements.
     """
+
+    if not blocks:
+        return blocks
+
+    types = {type(block) for block in blocks}
+
     if any([sp.issparse(b) for b in blocks]):
         return sp.hstack(blocks)
-    elif isinstance(blocks[0], pd.DataFrame):
+    elif types == {pd.DataFrame}:
         pd.concat(blocks, axis=1)
-    elif isinstance(blocks[0], np.ndarray):
+    elif types == {np.ndarray}:
         return np.hstack(blocks)
-    elif isinstance(blocks[0], list):
+    elif types == {list}:
         return np.hstack(blocks).tolist()
 
     try:
@@ -66,7 +87,7 @@ def data_hstack(blocks: Sequence[modALinput]) -> modALinput:
     except:
         pass
 
-    TypeError("%s datatype is not supported" % type(blocks[0]))
+    raise TypeError("%s datatype(s) not supported" % types)
 
 
 def add_row(X: modALinput, row: modALinput):
diff --git a/rtd_requirements.txt b/rtd_requirements.txt
index db0bd81..8108132 100644
--- a/rtd_requirements.txt
+++ b/rtd_requirements.txt
@@ -1,7 +1,8 @@
-numpy==1.20.0
+numpy
 scipy
 scikit-learn
 ipykernel
 nbsphinx
 pandas
 skorch
+torch
\ No newline at end of file

From 57b9154bceb4e78aa6e2bbcde65e5970ef53a9b3 Mon Sep 17 00:00:00 2001
From: Tom Kingsford
Date: Wed, 28 Jun 2023 19:39:04 +1200
Subject: [PATCH 2/2] fix: remove unsafely broad except clauses

---
 modAL/models/base.py | 26 ++++++++---------
 modAL/utils/data.py  | 67 ++++++++++++++++++--------------------------
 tests/core_tests.py  | 17 ++++-------
 3 files changed, 45 insertions(+), 65 deletions(-)

diff --git a/modAL/models/base.py b/modAL/models/base.py
index 57c8b81..703d1ef 100644
--- a/modAL/models/base.py
+++ b/modAL/models/base.py
@@ -171,14 +171,13 @@ def query(self, X_pool, *query_args, return_metrics: bool = False, **query_kwarg
             query_metrics: returns also the corresponding metrics, if return_metrics == True
         """
 
-        try:
-            query_result, query_metrics = self.query_strategy(
-                self, X_pool, *query_args, **query_kwargs)
-
-        except:
+        _query_strategy_result = self.query_strategy(
+            self, X_pool, *query_args, **query_kwargs)
+        if isinstance(_query_strategy_result, tuple) and len(_query_strategy_result) == 2:
+            query_result, query_metrics = _query_strategy_result
+        else:
+            query_result = _query_strategy_result
             query_metrics = None
-            query_result = self.query_strategy(
-                self, X_pool, *query_args, **query_kwargs)
 
         if return_metrics:
             if query_metrics is None:
@@ -313,14 +312,13 @@ def query(self, X_pool, return_metrics: bool = False, *query_args, **query_kwarg
             query_metrics: returns also the corresponding metrics, if return_metrics == True
         """
 
-        try:
-            query_result, query_metrics = self.query_strategy(
-                self, X_pool, *query_args, **query_kwargs)
-
-        except:
+        _query_strategy_result = self.query_strategy(
+            self, X_pool, *query_args, **query_kwargs)
+        if isinstance(_query_strategy_result, tuple) and len(_query_strategy_result) == 2:
+            query_result, query_metrics = _query_strategy_result
+        else:
+            query_result = _query_strategy_result
             query_metrics = None
-            query_result = self.query_strategy(
-                self, X_pool, *query_args, **query_kwargs)
 
         if return_metrics:
             if query_metrics is None:
diff --git a/modAL/utils/data.py b/modAL/utils/data.py
index c75350e..053e1e7 100644
--- a/modAL/utils/data.py
+++ b/modAL/utils/data.py
@@ -1,3 +1,4 @@
+import sys
 from typing import List, Sequence, Union
 
 import numpy as np
@@ -47,11 +48,8 @@ def _block_to_df(block):
     elif types == {list}:
         return np.concatenate(blocks).tolist()
 
-    try:
-        if all(torch.is_tensor(block) for block in blocks):
-            return torch.cat(blocks)
-    except:
-        pass
+    if 'torch' in sys.modules and all(torch.is_tensor(block) for block in blocks):
+        return torch.cat(blocks)
 
     raise TypeError("%s datatype(s) not supported" % types)
 
@@ -81,11 +79,8 @@ def data_hstack(blocks: Sequence[modALinput]) -> modALinput:
     elif types == {list}:
         return np.hstack(blocks).tolist()
 
-    try:
-        if torch.is_tensor(blocks[0]):
-            return torch.cat(blocks, dim=1)
-    except:
-        pass
+    if 'torch' in sys.modules and torch.is_tensor(blocks[0]):
+        return torch.cat(blocks, dim=1)
 
     raise TypeError("%s datatype(s) not supported" % types)
 
@@ -121,24 +116,26 @@ def retrieve_rows(
 
     try:
         return X[I]
-    except:
-        if sp.issparse(X):
-            # Out of the sparse matrix formats (sp.csc_matrix, sp.csr_matrix, sp.bsr_matrix,
-            # sp.lil_matrix, sp.dok_matrix, sp.coo_matrix, sp.dia_matrix), only sp.bsr_matrix, sp.coo_matrix
-            # and sp.dia_matrix don't support indexing and need to be converted to a sparse format
-            # that does support indexing. It seems conversion to CSR is currently most efficient.
-
-            sp_format = X.getformat()
-            return X.tocsr()[I].asformat(sp_format)
-        elif isinstance(X, pd.DataFrame):
-            return X.iloc[I]
-        elif isinstance(X, list):
-            return np.array(X)[I].tolist()
-        elif isinstance(X, dict):
-            X_return = {}
-            for key, value in X.items():
-                X_return[key] = retrieve_rows(value, I)
-            return X_return
+    except (KeyError, IndexError, TypeError):
+        pass
+
+    if sp.issparse(X):
+        # Out of the sparse matrix formats (sp.csc_matrix, sp.csr_matrix, sp.bsr_matrix,
+        # sp.lil_matrix, sp.dok_matrix, sp.coo_matrix, sp.dia_matrix), only sp.bsr_matrix, sp.coo_matrix
+        # and sp.dia_matrix don't support indexing and need to be converted to a sparse format
+        # that does support indexing. It seems conversion to CSR is currently most efficient.
+
+        sp_format = X.getformat()
+        return X.tocsr()[I].asformat(sp_format)
+    elif isinstance(X, pd.DataFrame):
+        return X.iloc[I]
+    elif isinstance(X, list):
+        return np.array(X)[I].tolist()
+    elif isinstance(X, dict):
+        X_return = {}
+        for key, value in X.items():
+            X_return[key] = retrieve_rows(value, I)
+        return X_return
 
     raise TypeError("%s datatype is not supported" % type(X))
 
@@ -160,12 +157,6 @@ def drop_rows(
     elif isinstance(X, list):
         return np.delete(X, I, axis=0).tolist()
 
-    try:
-        if torch.is_tensor(blocks[0]):
-            return torch.cat(blocks)
-    except:
-        X[[True if row not in I else False for row in range(X.size(0))]]
-
     raise TypeError("%s datatype is not supported" % type(X))
 
 
@@ -194,11 +185,9 @@ def data_shape(X: modALinput):
     """
     Returns the shape of the data set X
     """
-    try:
-        # scipy.sparse, torch, pandas and numpy all support .shape
+    if isinstance(X, list):
+        return np.array(X).shape
+    elif hasattr(X, "shape"):  # scipy.sparse, torch, pandas and numpy all support .shape
         return X.shape
-    except:
-        if isinstance(X, list):
-            return np.array(X).shape
 
     raise TypeError("%s datatype is not supported" % type(X))
diff --git a/tests/core_tests.py b/tests/core_tests.py
index e3113c4..b2d67f8 100644
--- a/tests/core_tests.py
+++ b/tests/core_tests.py
@@ -115,11 +115,8 @@ def dummy_function(X_in):
         else:
             true_result = n_functions*np.ones(shape=(n_samples, 1))
 
-        try:
-            np.testing.assert_almost_equal(
-                linear_combination(X_in), true_result)
-        except:
-            linear_combination(X_in)
+        np.testing.assert_almost_equal(
+            linear_combination(X_in), true_result)
 
     def test_product(self):
         for n_dim in range(1, 5):
@@ -476,15 +473,11 @@ def test_KL_max_disagreement(self):
 
         true_KL_disagreement = np.zeros(shape=(n_samples, ))
 
-        try:
-            np.testing.assert_array_almost_equal(
-                true_KL_disagreement,
-                modAL.disagreement.KL_max_disagreement(
-                    committee, np.random.rand(n_samples, 1))
-            )
-        except:
+        np.testing.assert_array_almost_equal(
+            true_KL_disagreement, modAL.disagreement.KL_max_disagreement(committee, np.random.rand(n_samples, 1))
+        )
 
         # 2. unfitted committee
         committee = mock.MockCommittee(fitted=False)