Commit
Merge pull request #56 from mindsdb/staging
Release 0.0.16
paxcema authored Nov 29, 2023
2 parents 67f5341 + 2a2ff0c commit dc95382
Showing 9 changed files with 110 additions and 59 deletions.
19 changes: 0 additions & 19 deletions .github/workflows/add_to_bugs_project.yml

This file was deleted.

19 changes: 0 additions & 19 deletions .github/workflows/add_to_docs_project.yml

This file was deleted.

16 changes: 16 additions & 0 deletions .github/workflows/add_to_pr_review.yml
@@ -0,0 +1,16 @@
name: Add Pull Requests to PR review project

on:
  pull_request:
    types:
      - opened

jobs:
  add-to-project:
    name: Add issue to project
    runs-on: ubuntu-latest
    steps:
      - uses: actions/[email protected]
        with:
          project-url: https://github.com/orgs/mindsdb/projects/65
          github-token: ${{ secrets.ADD_TO_PROJECT_PAT }}
@@ -1,19 +1,14 @@
name: Add issue to roadmap project

on:
  issues:
    types:
      - opened

jobs:
  add-to-project:
    name: Add issue to roadmap project
    runs-on: ubuntu-latest
    steps:
      - uses: actions/[email protected]
        with:
-          # You can target a repository in a different organization
-          # to the issue
-          project-url: https://github.com/orgs/mindsdb/projects/54
-          github-token: ${{ secrets.ADD_TO_PROJECT_PAT }}
+          labeled: enhancement
+          project-url: https://github.com/orgs/mindsdb/projects/53
+          github-token: ${{ secrets.ADD_TO_PROJECT_PAT }}
2 changes: 1 addition & 1 deletion .github/workflows/docs.yml
@@ -22,7 +22,7 @@ jobs:
        run: |
          sudo apt install pandoc
          python -m pip install --upgrade pip
-          pip install install 'Sphinx==4.1.2' 'sphinx-autoapi==1.8.4' 'sphinx-autodoc-typehints==1.12.0' 'sphinx-code-include==1.1.1' 'sphinx-rtd-theme==0.5.2' 'sphinxcontrib-applehelp==1.0.2' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.0' 'sphinxcontrib-jsmath==1.0.1' 'sphinxcontrib-napoleon==0.7' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' autoapi nbsphinx myst_parser pandoc jupyter matplotlib imblearn fsspec
+          pip install install 'Sphinx==6.2.1' 'sphinx-autoapi==3.0.0' 'sphinx-autodoc-typehints' 'sphinx-code-include' 'sphinx-rtd-theme' 'sphinxcontrib-applehelp' 'sphinxcontrib-devhelp' 'sphinxcontrib-htmlhelp' 'sphinxcontrib-jsmath' 'sphinxcontrib-napoleon' 'sphinxcontrib-qthelp' 'sphinxcontrib-serializinghtml' autoapi nbsphinx myst_parser pandoc jupyter matplotlib imblearn fsspec
          pip install --no-cache-dir -e .
      - name: Make the docs
        run: |
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "type_infer"
version = "0.0.15"
version = "0.0.16"
description = "Automated type inference for Machine Learning pipelines."
authors = ["MindsDB Inc. <[email protected]>"]
license = "GPL-3.0"
38 changes: 38 additions & 0 deletions tests/unit_tests/test_dates.py
@@ -5,5 +5,43 @@


class TestDates(unittest.TestCase):

    def test_0_type_check_dates(self):
        """ Checks parsing of a string containing a date to dtype 'date'.
        """
        self.assertEqual(type_check_date('31/12/2010'), dtype.date)

    def test_1_type_check_datetime(self):
        """ Checks parsing of a string containing a date and time to dtype 'datetime'.
        """
        self.assertEqual(type_check_date('31/12/2010 23:15:41'), dtype.datetime)

    def test_2_type_check_timestamp_unix_seconds(self):
        """ Checks parsing of a number containing 1989-12-15T07:30:00 (as seconds
        since the Unix epoch) to dtype 'datetime'.
        """
        self.assertEqual(type_check_date(629721000.0), dtype.datetime)

    def test_3_type_check_timestamp_unix_milliseconds(self):
        """ Checks parsing of a number containing 1989-12-15T07:30:00 (as milliseconds
        since the Unix epoch) to dtype 'datetime'.
        """
        self.assertEqual(type_check_date(629721000000.0), dtype.datetime)

    def test_4_type_check_timestamp_unix_microseconds(self):
        """ Checks parsing of a number containing 1989-12-15T07:30:00 (as microseconds
        since the Unix epoch) to dtype 'datetime'.
        """
        self.assertEqual(type_check_date(629721000000000.0), dtype.datetime)

    def test_5_type_check_timestamp_unix_nanoseconds(self):
        """ Checks parsing of a number containing 1989-12-15T07:30:00 (as nanoseconds
        since the Unix epoch) to dtype 'datetime'.
        """
        self.assertEqual(type_check_date(629721000000000000.0), dtype.datetime)

    def test_6_type_check_timestamp_julian_days(self):
        """ Checks parsing of a number containing 1989-12-15T07:30:00 (as a Julian
        day number) to dtype 'datetime'.
        """
        self.assertEqual(type_check_date(2447875.81250), dtype.datetime)
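
For orientation, here is a minimal sketch, not part of this commit, of what the magic constants in the tests above mean to pandas. It assumes only that pandas is importable and uses the same `unit`/`origin` arguments that the new `type_check_date` relies on:

    import pandas as pd

    seconds = 629721000.0
    # The same instant expressed at four resolutions, as in tests 2-5:
    print(pd.to_datetime(seconds, unit='s'))         # seconds since the Unix epoch
    print(pd.to_datetime(seconds * 1e3, unit='ms'))  # milliseconds
    print(pd.to_datetime(seconds * 1e6, unit='us'))  # microseconds
    print(pd.to_datetime(seconds * 1e9, unit='ns'))  # nanoseconds
    # Julian day numbers instead count days from the Julian-period origin:
    print(pd.to_datetime(2447875.81250, unit='D', origin='julian'))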
2 changes: 1 addition & 1 deletion type_infer/__init__.py
@@ -4,7 +4,7 @@
from type_infer import helpers


-__version__ = '0.0.15'
+__version__ = '0.0.16'


__all__ = ['base', 'dtype', 'infer', 'helpers', '__version__']
62 changes: 51 additions & 11 deletions type_infer/infer.py
@@ -128,17 +128,57 @@ def type_check_sequence(element: object) -> str:


def type_check_date(element: object) -> str:
+    """
+    Check whether `element` corresponds to a date-like object.
+    """
+    # check if element represents a date (no hour/minute/second information)
+    is_date = False
+    # check if element represents a datetime (has hour/minute/second information)
+    is_datetime = False
+
+    # Check whether it makes sense to interpret the element as a Unix timestamp:
+    # when converted, it must represent a number compatible with a Unix timestamp
+    # (seconds since 1970-01-01T00:00:00). We also require the number to stay below
+    # the "epochalypse" point, where a signed 32-bit Unix timestamp overflows
+    # (2^31 seconds, i.e. 2038-01-19T03:14:08 UTC). Timestamps outside this range
+    # are likely to be unreliable and are therefore treated as ordinary numbers.
+    min_dt = pd.to_datetime('1970-01-01 00:00:00', utc=True)
+    max_dt = pd.to_datetime('2038-01-19 03:14:08', utc=True)
+    valid_units = {'ns': 'unix', 'us': 'unix', 'ms': 'unix', 's': 'unix',
+                   'D': 'julian'}
+    for unit, origin in valid_units.items():
+        try:
+            as_dt = pd.to_datetime(element, unit=unit, origin=origin,
+                                   errors='raise')
+            if min_dt < as_dt < max_dt:
+                is_datetime = True
+                break
+        except Exception:
+            pass
+
+    # Check if the element parses as a date-like string. Here we don't enforce a
+    # validity range as with Unix timestamps, because string dates usually express
+    # something more general than a count of seconds since an epoch.
    try:
-        dt = pd.to_datetime(element)
-
-        # Not accurate 100% for a single datetime str, but should work in aggregate
-        if dt.hour == 0 and dt.minute == 0 and dt.second == 0 and len(str(element)) <= 16:
-            return dtype.date
-        else:
-            return dtype.datetime
-
-    except ValueError:
-        return None
+        as_dt = pd.to_datetime(element, errors='raise')
+        is_datetime = True
+    except Exception:
+        pass
+
+    # Finally, if the element represents a datetime, check whether it carries only
+    # a date part (no time information).
+    if is_datetime:
+        # truncate to day resolution (drop hour/minute/second)
+        dt_d = as_dt.to_period('D').to_timestamp()
+        # If the truncated datetime equals the datetime itself, there was no
+        # hour/minute/second information to begin with. The 'tz_localize(None)'
+        # drops any time-zone info so the comparison is not affected by it.
+        is_date = dt_d == as_dt.tz_localize(None)
+
+    if is_date:
+        return dtype.date
+    if is_datetime:
+        return dtype.datetime
+
+    return None
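
As a quick sanity check, separate from the diff itself, the following sketch exercises the two ingredients of the rewritten helper: the upper bound on plausible Unix timestamps and the day-truncation trick that separates dates from datetimes. It assumes only pandas:

    import pandas as pd

    # 2038-01-19 03:14:08 UTC is exactly 2**31 seconds after the Unix epoch,
    # i.e. the point where a signed 32-bit timestamp overflows.
    assert pd.to_datetime(2**31, unit='s', utc=True) == pd.to_datetime('2038-01-19 03:14:08', utc=True)

    # Truncating to day resolution leaves a pure date unchanged...
    d = pd.Timestamp('2010-12-31')
    assert d.to_period('D').to_timestamp() == d

    # ...but changes a timestamp that carries time-of-day information.
    dt = pd.Timestamp('2010-12-31 23:15:41')
    assert dt.to_period('D').to_timestamp() != dt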


def count_data_types_in_column(data):
@@ -391,7 +431,7 @@ def infer_types(
    population_size = len(data)
    log.info(f'Analyzing a sample of {sample_size}')
    log.info(
-        f'from a total population of {population_size}, this is equivalent to {round(sample_size*100/population_size, 1)}% of your data.')  # noqa
+        f'from a total population of {population_size}, this is equivalent to {round(sample_size*100/population_size, 1)}% of your data.')  # noqa

    nr_procs = get_nr_procs(df=sample_df)
    pool_size = min(nr_procs, len(sample_df.columns.values))
