From 34272d8a090ce2b14d62f2c20c75bd5e016f086a Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 3 Jul 2023 21:52:28 -0400 Subject: [PATCH] fix warnings reported in #39 --- type_infer/helpers.py | 5 +++-- type_infer/infer.py | 6 ++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/type_infer/helpers.py b/type_infer/helpers.py index 93d81b4..3ae6bc1 100644 --- a/type_infer/helpers.py +++ b/type_infer/helpers.py @@ -94,7 +94,7 @@ def get_identifier_description(data: Iterable, column_name: str, data_dtype: dty unique_pct = nr_unique / len(data) spaces = [len(str(x).split(' ')) - 1 for x in data] - mean_spaces = np.mean(spaces) + mean_spaces = np.mean(spaces) if len(spaces) > 0 else 0.0 # Detect hash all_same_length = all(len(str(data[0])) == len(str(x)) for x in data) @@ -113,7 +113,8 @@ def get_identifier_description(data: Iterable, column_name: str, data_dtype: dty else: randomness_per_index.append(S / np.log(N)) - if np.mean(randomness_per_index) > 0.95: + mean_randomness = np.mean(randomness_per_index) if len(randomness_per_index) > 0 else 0 + if mean_randomness > 0.95: return 'Hash-like identifier' # Detect foreign key diff --git a/type_infer/infer.py b/type_infer/infer.py index 42912f6..8ffb57f 100644 --- a/type_infer/infer.py +++ b/type_infer/infer.py @@ -233,10 +233,12 @@ def get_column_data_type(data: Union[np.ndarray, list], full_data: pd.DataFrame, if all(isinstance(x, str) for x in data): can_be_tags = True + mean_lenghts = np.mean(lengths) if len(lengths) > 0 else 0 + # If more than 30% of the samples contain more than 1 category and there's more than 6 and less than 30 of them and they are shared between the various cells # noqa - if (can_be_tags and np.mean(lengths) > 1.3 and + if (can_be_tags and mean_lenghts > 1.3 and 6 <= len(unique_tokens) <= 30 and - len(unique_tokens) / np.mean(lengths) < (len(data) / 4)): + len(unique_tokens) / mean_lenghts < (len(data) / 4)): curr_dtype = dtype.tags # Categorical based on unique 
values