From 64fec8ccf99b53dff3b76eb93e26dd33fe29990e Mon Sep 17 00:00:00 2001 From: george Date: Mon, 8 Nov 2021 13:15:56 -0500 Subject: [PATCH 1/3] fix: dropping columns with no info --- lightwood/helpers/text.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/lightwood/helpers/text.py b/lightwood/helpers/text.py index 992db70d5..e64dcde48 100644 --- a/lightwood/helpers/text.py +++ b/lightwood/helpers/text.py @@ -210,7 +210,12 @@ def get_identifier_description_mp(arg_tup): def get_identifier_description(data, column_name, data_dtype): data = list(data) - unquie_pct = len(set(data)) / len(data) + nr_unique = len(set(data)) + + if nr_unique == 1: + return 'No Information' + + unquie_pct = nr_unique / len(data) spaces = [len(str(x).split(' ')) - 1 for x in data] mean_spaces = np.mean(spaces) From fda35569d162e27dbe5039a879316720b09cff12 Mon Sep 17 00:00:00 2001 From: george Date: Wed, 10 Nov 2021 21:36:45 -0500 Subject: [PATCH 2/3] fix: spelling --- lightwood/helpers/text.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lightwood/helpers/text.py b/lightwood/helpers/text.py index e64dcde48..13f7526f9 100644 --- a/lightwood/helpers/text.py +++ b/lightwood/helpers/text.py @@ -215,7 +215,7 @@ def get_identifier_description(data, column_name, data_dtype): if nr_unique == 1: return 'No Information' - unquie_pct = nr_unique / len(data) + unique_pct = nr_unique / len(data) spaces = [len(str(x).split(' ')) - 1 for x in data] mean_spaces = np.mean(spaces) @@ -223,7 +223,7 @@ def get_identifier_description(data, column_name, data_dtype): # Detect auto incrementing index # -- some cases where I guess people do want to use this for learning, so ignoring this check for now... # if data_dtype == dtype.integer: - # if get_pct_auto_increment(data) > 0.98 and unquie_pct > 0.99: + # if get_pct_auto_increment(data) > 0.98 and unique_pct > 0.99: # return 'Auto-incrementing identifier' # Detect hash @@ -249,7 +249,7 @@ def get_identifier_description(data, column_name, data_dtype): return 'Foreign key' if _is_identifier_name(column_name) or data_dtype in (dtype.categorical, dtype.binary): - if unquie_pct > 0.98: + if unique_pct > 0.98: if is_uuid: return 'UUID' else: @@ -257,7 +257,7 @@ def get_identifier_description(data, column_name, data_dtype): # Everything is unique and it's too short to be rich text if data_dtype in (dtype.categorical, dtype.short_text, dtype.rich_text) and \ - unquie_pct > 0.99999 and mean_spaces < 1: + unique_pct > 0.99999 and mean_spaces < 1: return 'Unknown identifier' return None From 93eb330b557e5aae0845f04efdc48cc9d8082683 Mon Sep 17 00:00:00 2001 From: george Date: Wed, 10 Nov 2021 21:37:31 -0500 Subject: [PATCH 3/3] feat: type hinting --- lightwood/helpers/text.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lightwood/helpers/text.py b/lightwood/helpers/text.py index 13f7526f9..f933f8582 100644 --- a/lightwood/helpers/text.py +++ b/lightwood/helpers/text.py @@ -13,6 +13,7 @@ import json import re import hashlib +from typing import Iterable import numpy as np import scipy.stats as st import langdetect @@ -208,7 +209,7 @@ def get_identifier_description_mp(arg_tup): return get_identifier_description(data, column_name, data_dtype) -def get_identifier_description(data, column_name, data_dtype): +def get_identifier_description(data: Iterable, column_name: str, data_dtype: dtype): data = list(data) nr_unique = len(set(data))