Skip to content

Commit

Permalink
fix warnings reported in #39
Browse files Browse the repository at this point in the history
  • Loading branch information
paxcema committed Jul 4, 2023
1 parent 9d7a09d commit 34272d8
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 4 deletions.
5 changes: 3 additions & 2 deletions type_infer/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def get_identifier_description(data: Iterable, column_name: str, data_dtype: dty
unique_pct = nr_unique / len(data)

spaces = [len(str(x).split(' ')) - 1 for x in data]
mean_spaces = np.mean(spaces)
mean_spaces = np.mean(spaces) if len(spaces) > 0 else 0.0

# Detect hash
all_same_length = all(len(str(data[0])) == len(str(x)) for x in data)
Expand All @@ -113,7 +113,8 @@ def get_identifier_description(data: Iterable, column_name: str, data_dtype: dty
else:
randomness_per_index.append(S / np.log(N))

if np.mean(randomness_per_index) > 0.95:
mean_randomness = np.mean(randomness_per_index) if len(randomness_per_index) > 0 else 0
if mean_randomness > 0.95:
return 'Hash-like identifier'

# Detect foreign key
Expand Down
6 changes: 4 additions & 2 deletions type_infer/infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,10 +233,12 @@ def get_column_data_type(data: Union[np.ndarray, list], full_data: pd.DataFrame,
if all(isinstance(x, str) for x in data):
can_be_tags = True

mean_lenghts = np.mean(lengths) if len(lengths) > 0 else 0

# If more than 30% of the samples contain more than 1 category, there are between 6 and 30 (inclusive) unique tokens, and they are shared between the various cells # noqa
if (can_be_tags and np.mean(lengths) > 1.3 and
if (can_be_tags and mean_lenghts > 1.3 and
6 <= len(unique_tokens) <= 30 and
len(unique_tokens) / np.mean(lengths) < (len(data) / 4)):
len(unique_tokens) / mean_lenghts < (len(data) / 4)):
curr_dtype = dtype.tags

# Categorical based on unique values
Expand Down

0 comments on commit 34272d8

Please sign in to comment.