Skip to content

Commit

Permalink
cleanup
Browse files (browse the repository at this point in the history)
  • Loading branch information
paxcema committed Dec 22, 2023
1 parent 4b495c4 commit c415465
Show file tree
Hide file tree
Showing 12 changed files with 37 additions and 31 deletions.
2 changes: 1 addition & 1 deletion tests/unit_tests/rule_based/test_infer_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def test_negative_integers(self):
self.assertEqual(dtyp, dtype.integer)

def test_negative_floats(self):
data = pd.DataFrame([-random.randint(-10, 10) for _ in range(100)] + [0.1], columns=['test_col'])
data = pd.DataFrame([float(-random.randint(-10, 10)) for _ in range(100)] + [0.1], columns=['test_col'])
engine = RuleBasedEngine()
dtyp, dist, ainfo, warn, info = engine.get_column_data_type(data['test_col'], data, 'test_col', 0.0)
self.assertEqual(dtyp, dtype.float)
12 changes: 12 additions & 0 deletions tests/unit_tests/rule_based/test_misc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import unittest

from type_infer.rule_based.helpers import tokenize_text


class TestDates(unittest.TestCase):
    """Checks for the `tokenize_text` helper from `type_infer.rule_based.helpers`."""

    def test_get_tokens(self):
        """Punctuation-only differences yield the same tokens; contractions are split."""
        sentences = ['hello, world!', ' !hello! world!!,..#', '#hello!world']
        for sent in sentences:
            # subTest names the failing sentence and keeps checking the remaining ones.
            with self.subTest(sentence=sent):
                self.assertEqual(list(tokenize_text(sent)), ['hello', 'world'])

        # Contractions are expected to expand into their component words.
        self.assertEqual(list(tokenize_text("don't wouldn't")), ['do', 'not', 'would', 'not'])
8 changes: 0 additions & 8 deletions tests/unit_tests/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from pathlib import Path

import type_infer
from type_infer.rule_based.helpers import tokenize_text


class TestDates(unittest.TestCase):
Expand All @@ -19,10 +18,3 @@ def test_versions_are_in_sync(self):
package_init_version = type_infer.__version__

self.assertEqual(package_init_version, pyproject_version)

def test_get_tokens(self):
    """Punctuation-only differences yield the same tokens; contractions are split."""
    sentences = ['hello, world!', ' !hello! world!!,..#', '#hello!world']
    for sent in sentences:
        # subTest names the failing sentence and keeps checking the remaining ones.
        with self.subTest(sentence=sent):
            self.assertEqual(list(tokenize_text(sent)), ['hello', 'world'])

    # Contractions are expected to expand into their component words.
    self.assertEqual(list(tokenize_text("don't wouldn't")), ['do', 'not', 'would', 'not'])
3 changes: 0 additions & 3 deletions type_infer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,10 @@
from type_infer import api
from type_infer import helpers

from type_infer.api import ENGINES

__version__ = '0.0.18'


__all__ = [
'__version__',
'base', 'dtype', 'api', 'helpers',
'ENGINES'
]
6 changes: 1 addition & 5 deletions type_infer/api.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,10 @@
from typing import Dict, Optional
import pandas as pd

from type_infer.base import TypeInformation
from type_infer.base import TypeInformation, ENGINES
from type_infer.rule_based.core import RuleBasedEngine


class ENGINES:
    """Namespace of string identifiers for type inference engines."""

    # Identifier of the rule-based engine
    # (presumably the value matched against the engine requested in the config - confirm).
    RULE_BASED = 'rule_based'


def infer_types(
data: pd.DataFrame,
config: Optional[Dict] = None
Expand Down
4 changes: 4 additions & 0 deletions type_infer/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,7 @@ def __init__(self, stable = True):
def infer(self, df) -> TypeInformation:
    """
    Given a dataframe, infer the types of each column and return a TypeInformation object.

    This implementation performs no inference itself: it always raises
    NotImplementedError, so a concrete engine has to supply its own `infer`.

    :param df: dataframe whose columns should be analyzed.
    :returns: a TypeInformation object (never reached in this implementation).
    :raises NotImplementedError: always.
    """
    raise NotImplementedError


class ENGINES:
    """Namespace of string identifiers for type inference engines."""

    # Identifier of the rule-based engine
    # (presumably the value matched against the engine requested in the config - confirm).
    RULE_BASED = 'rule_based'
9 changes: 9 additions & 0 deletions type_infer/bert/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from type_infer.base import BaseEngine


class BERType(BaseEngine):
    """Engine stub derived from BaseEngine; its `infer` is not implemented yet."""

    def __init__(self, stable=False):
        """
        Initialize the engine.

        :param stable: forwarded unchanged to `BaseEngine.__init__`; defaults to False here.
        """
        super().__init__(stable=stable)

    def infer(self, df):
        """
        Placeholder for type inference over `df`.

        :param df: dataframe to analyze (currently unused).
        :raises NotImplementedError: always, since no inference logic exists yet.
        """
        raise NotImplementedError
1 change: 0 additions & 1 deletion type_infer/bert/infer.py

This file was deleted.

3 changes: 2 additions & 1 deletion type_infer/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,5 @@ class dtype:
empty = "empty"
invalid = "invalid"

# TODO: introduce "modifiers"?

# TODO: modifier class + system
4 changes: 1 addition & 3 deletions type_infer/helpers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import os

import pandas as pd
import psutil
import random
import logging
Expand All @@ -9,12 +7,12 @@
from typing import Iterable

import numpy as np
import pandas as pd
from scipy.stats import norm


def initialize_log():
pid = os.getpid()

handler = colorlog.StreamHandler()
handler.setFormatter(colorlog.ColoredFormatter())

Expand Down
11 changes: 5 additions & 6 deletions type_infer/rule_based/core.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
import re
import imghdr
import sndhdr
from collections import Counter
from typing import List, Union
import multiprocessing as mp
from typing import List, Union
from collections import Counter

import pandas as pd
import numpy as np
import pandas as pd

from type_infer.dtype import dtype
from type_infer.base import BaseEngine, TypeInformation
from type_infer.helpers import log, seed, sample_data, get_nr_procs
from type_infer.helpers import log, seed, sample_data, get_nr_procs, is_nan_numeric, cast_string_to_python_type
from type_infer.rule_based.helpers import get_language_dist, analyze_sentences, get_identifier_description_mp
from type_infer.helpers import is_nan_numeric, cast_string_to_python_type


class RuleBasedEngine(BaseEngine):
Expand Down Expand Up @@ -284,7 +283,7 @@ def count_data_types_in_column(self, data):
return dtype_counts


def get_column_data_type(self, data: Union[np.ndarray, list], full_data: pd.DataFrame, col_name: str, pct_invalid: float):
def get_column_data_type(self, data: Union[pd.Series, np.ndarray, list], full_data: pd.DataFrame, col_name: str, pct_invalid: float):
"""
Provided the column data, define its data type and data subtype.
Expand Down
5 changes: 2 additions & 3 deletions type_infer/rule_based/helpers.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
import re
import nltk
import string
from typing import Iterable
from collections import Counter, defaultdict

import numpy as np
import scipy.stats as st
from langid.langid import LanguageIdentifier
from langid.langid import model as langid_model

from typing import Iterable
from collections import Counter, defaultdict

from type_infer.dtype import dtype


Expand Down

0 comments on commit c415465

Please sign in to comment.