Skip to content

Commit

Permalink
Merge pull request #736 from mindsdb/drop_no_inf_cols
Browse files Browse the repository at this point in the history
Drop column with no information
  • Loading branch information
George3d6 authored Nov 11, 2021
2 parents 8b1aa85 + 93eb330 commit cc3b08c
Showing 1 changed file with 11 additions and 5 deletions.
16 changes: 11 additions & 5 deletions lightwood/helpers/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import json
import re
import hashlib
from typing import Iterable
import numpy as np
import scipy.stats as st
import langdetect
Expand Down Expand Up @@ -208,17 +209,22 @@ def get_identifier_description_mp(arg_tup):
return get_identifier_description(data, column_name, data_dtype)


def get_identifier_description(data, column_name, data_dtype):
def get_identifier_description(data: Iterable, column_name: str, data_dtype: dtype):
data = list(data)
unquie_pct = len(set(data)) / len(data)
nr_unique = len(set(data))

if nr_unique == 1:
return 'No Information'

unique_pct = nr_unique / len(data)

spaces = [len(str(x).split(' ')) - 1 for x in data]
mean_spaces = np.mean(spaces)

# Detect auto incrementing index
# -- In some cases people presumably do want to use such columns for learning, so this check is disabled for now.
# if data_dtype == dtype.integer:
# if get_pct_auto_increment(data) > 0.98 and unquie_pct > 0.99:
# if get_pct_auto_increment(data) > 0.98 and unique_pct > 0.99:
# return 'Auto-incrementing identifier'

# Detect hash
Expand All @@ -244,15 +250,15 @@ def get_identifier_description(data, column_name, data_dtype):
return 'Foreign key'

if _is_identifier_name(column_name) or data_dtype in (dtype.categorical, dtype.binary):
if unquie_pct > 0.98:
if unique_pct > 0.98:
if is_uuid:
return 'UUID'
else:
return 'Unknown identifier'

# Everything is unique and it's too short to be rich text
if data_dtype in (dtype.categorical, dtype.short_text, dtype.rich_text) and \
unquie_pct > 0.99999 and mean_spaces < 1:
unique_pct > 0.99999 and mean_spaces < 1:
return 'Unknown identifier'

return None
Expand Down

0 comments on commit cc3b08c

Please sign in to comment.