Commit
Merge pull request #56 from mindsdb/staging
Release 0.0.16
paxcema authored Nov 29, 2023
2 parents 67f5341 + 2a2ff0c commit dc95382
Showing 9 changed files with 110 additions and 59 deletions.
19 changes: 0 additions & 19 deletions .github/workflows/add_to_bugs_project.yml

This file was deleted.

19 changes: 0 additions & 19 deletions .github/workflows/add_to_docs_project.yml

This file was deleted.

16 changes: 16 additions & 0 deletions .github/workflows/add_to_pr_review.yml
@@ -0,0 +1,16 @@
name: Add Pull Requests to PR review project

on:
  pull_request:
    types:
      - opened

jobs:
  add-to-project:
    name: Add issue to project
    runs-on: ubuntu-latest
    steps:
      - uses: actions/[email protected]
        with:
          project-url: https://github.com/orgs/mindsdb/projects/65
          github-token: ${{ secrets.ADD_TO_PROJECT_PAT }}
@@ -1,19 +1,14 @@
name: Add issue to roadmap project

on:
  issues:
    types:
      - opened

jobs:
  add-to-project:
    name: Add issue to roadmap project
    runs-on: ubuntu-latest
    steps:
      - uses: actions/[email protected]
        with:
-          # You can target a repository in a different organization
-          # to the issue
-          project-url: https://github.com/orgs/mindsdb/projects/54
-          github-token: ${{ secrets.ADD_TO_PROJECT_PAT }}
+          labeled: enhancement
+          project-url: https://github.com/orgs/mindsdb/projects/53
+          github-token: ${{ secrets.ADD_TO_PROJECT_PAT }}
2 changes: 1 addition & 1 deletion .github/workflows/docs.yml
@@ -22,7 +22,7 @@ jobs:
        run: |
          sudo apt install pandoc
          python -m pip install --upgrade pip
-          pip install install 'Sphinx==4.1.2' 'sphinx-autoapi==1.8.4' 'sphinx-autodoc-typehints==1.12.0' 'sphinx-code-include==1.1.1' 'sphinx-rtd-theme==0.5.2' 'sphinxcontrib-applehelp==1.0.2' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.0' 'sphinxcontrib-jsmath==1.0.1' 'sphinxcontrib-napoleon==0.7' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' autoapi nbsphinx myst_parser pandoc jupyter matplotlib imblearn fsspec
+          pip install install 'Sphinx==6.2.1' 'sphinx-autoapi==3.0.0' 'sphinx-autodoc-typehints' 'sphinx-code-include' 'sphinx-rtd-theme' 'sphinxcontrib-applehelp' 'sphinxcontrib-devhelp' 'sphinxcontrib-htmlhelp' 'sphinxcontrib-jsmath' 'sphinxcontrib-napoleon' 'sphinxcontrib-qthelp' 'sphinxcontrib-serializinghtml' autoapi nbsphinx myst_parser pandoc jupyter matplotlib imblearn fsspec
          pip install --no-cache-dir -e .
      - name: Make the docs
        run: |
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "type_infer"
version = "0.0.15"
version = "0.0.16"
description = "Automated type inference for Machine Learning pipelines."
authors = ["MindsDB Inc. <[email protected]>"]
license = "GPL-3.0"
38 changes: 38 additions & 0 deletions tests/unit_tests/test_dates.py
@@ -5,5 +5,43 @@


class TestDates(unittest.TestCase):

    def test_0_type_check_dates(self):
        """ Checks parsing of a string containing a date to dtype 'date'.
        """
        self.assertEqual(type_check_date('31/12/2010'), dtype.date)

    def test_1_type_check_datetime(self):
        """ Checks parsing of a string containing a date and time to dtype 'datetime'.
        """
        self.assertEqual(type_check_date('31/12/2010 23:15:41'), dtype.datetime)

    def test_2_type_check_timestamp_unix_seconds(self):
        """ Checks parsing of a number containing 1989-12-15T07:30:00 (as seconds
        since the Unix epoch) to dtype 'datetime'.
        """
        self.assertEqual(type_check_date(629721000.0), dtype.datetime)

    def test_3_type_check_timestamp_unix_milliseconds(self):
        """ Checks parsing of a number containing 1989-12-15T07:30:00 (as milliseconds
        since the Unix epoch) to dtype 'datetime'.
        """
        self.assertEqual(type_check_date(629721000000.0), dtype.datetime)

    def test_4_type_check_timestamp_unix_microseconds(self):
        """ Checks parsing of a number containing 1989-12-15T07:30:00 (as microseconds
        since the Unix epoch) to dtype 'datetime'.
        """
        self.assertEqual(type_check_date(629721000000000.0), dtype.datetime)

    def test_5_type_check_timestamp_unix_nanoseconds(self):
        """ Checks parsing of a number containing 1989-12-15T07:30:00 (as nanoseconds
        since the Unix epoch) to dtype 'datetime'.
        """
        self.assertEqual(type_check_date(629721000000000000.0), dtype.datetime)

    def test_6_type_check_timestamp_julian_days(self):
        """ Checks parsing of a number containing 1989-12-15T07:30:00 (as a Julian
        day number) to dtype 'datetime'.
        """
        self.assertEqual(type_check_date(2447875.81250), dtype.datetime)
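
For orientation, here is a minimal sketch, not part of this commit, of what the magic constants in the tests above mean to pandas. It assumes only that pandas is importable and uses the same `unit`/`origin` arguments that the new `type_check_date` relies on:

    import pandas as pd

    seconds = 629721000.0
    # The same instant expressed at four resolutions, as in tests 2-5:
    print(pd.to_datetime(seconds, unit='s'))         # seconds since the Unix epoch
    print(pd.to_datetime(seconds * 1e3, unit='ms'))  # milliseconds
    print(pd.to_datetime(seconds * 1e6, unit='us'))  # microseconds
    print(pd.to_datetime(seconds * 1e9, unit='ns'))  # nanoseconds
    # Julian day numbers instead count days from the Julian-period origin:
    print(pd.to_datetime(2447875.81250, unit='D', origin='julian'))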
2 changes: 1 addition & 1 deletion type_infer/__init__.py
@@ -4,7 +4,7 @@
from type_infer import helpers


-__version__ = '0.0.15'
+__version__ = '0.0.16'


__all__ = ['base', 'dtype', 'infer', 'helpers', '__version__']
62 changes: 51 additions & 11 deletions type_infer/infer.py
@@ -128,17 +128,57 @@ def type_check_sequence(element: object) -> str:


def type_check_date(element: object) -> str:
+    """
+    Check whether `element` corresponds to a date-like object.
+    """
+    # check if element represents a date (no hour/minute/second information)
+    is_date = False
+    # check if element represents a datetime (has hour/minute/second information)
+    is_datetime = False
+
+    # Check whether it makes sense to interpret the element as a Unix timestamp:
+    # when converted, it must represent a number compatible with a Unix timestamp
+    # (seconds since 1970-01-01T00:00:00). We also require the number to stay below
+    # the "epochalypse" point, where a signed 32-bit Unix timestamp overflows
+    # (2^31 seconds, i.e. 2038-01-19T03:14:08 UTC). Timestamps outside this range
+    # are likely to be unreliable and are therefore treated as ordinary numbers.
+    min_dt = pd.to_datetime('1970-01-01 00:00:00', utc=True)
+    max_dt = pd.to_datetime('2038-01-19 03:14:08', utc=True)
+    valid_units = {'ns': 'unix', 'us': 'unix', 'ms': 'unix', 's': 'unix',
+                   'D': 'julian'}
+    for unit, origin in valid_units.items():
+        try:
+            as_dt = pd.to_datetime(element, unit=unit, origin=origin,
+                                   errors='raise')
+            if min_dt < as_dt < max_dt:
+                is_datetime = True
+                break
+        except Exception:
+            pass
+
+    # Check if the element parses as a date-like string. Here we don't enforce a
+    # validity range as with Unix timestamps, because string dates usually express
+    # something more general than a count of seconds since an epoch.
    try:
-        dt = pd.to_datetime(element)
-
-        # Not accurate 100% for a single datetime str, but should work in aggregate
-        if dt.hour == 0 and dt.minute == 0 and dt.second == 0 and len(str(element)) <= 16:
-            return dtype.date
-        else:
-            return dtype.datetime
-
-    except ValueError:
-        return None
+        as_dt = pd.to_datetime(element, errors='raise')
+        is_datetime = True
+    except Exception:
+        pass
+
+    # Finally, if the element represents a datetime, check whether it carries only
+    # a date part (no time information).
+    if is_datetime:
+        # truncate to day resolution (drop hour/minute/second)
+        dt_d = as_dt.to_period('D').to_timestamp()
+        # If the truncated datetime equals the datetime itself, there was no
+        # hour/minute/second information to begin with. The 'tz_localize(None)'
+        # drops any time-zone info so the comparison is not affected by it.
+        is_date = dt_d == as_dt.tz_localize(None)
+
+    if is_date:
+        return dtype.date
+    if is_datetime:
+        return dtype.datetime
+
+    return None
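
As a quick sanity check, separate from the diff itself, the following sketch exercises the two ingredients of the rewritten helper: the upper bound on plausible Unix timestamps and the day-truncation trick that separates dates from datetimes. It assumes only pandas:

    import pandas as pd

    # 2038-01-19 03:14:08 UTC is exactly 2**31 seconds after the Unix epoch,
    # i.e. the point where a signed 32-bit timestamp overflows.
    assert pd.to_datetime(2**31, unit='s', utc=True) == pd.to_datetime('2038-01-19 03:14:08', utc=True)

    # Truncating to day resolution leaves a pure date unchanged...
    d = pd.Timestamp('2010-12-31')
    assert d.to_period('D').to_timestamp() == d

    # ...but changes a timestamp that carries time-of-day information.
    dt = pd.Timestamp('2010-12-31 23:15:41')
    assert dt.to_period('D').to_timestamp() != dt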


def count_data_types_in_column(data):
@@ -391,7 +431,7 @@ def infer_types(
    population_size = len(data)
    log.info(f'Analyzing a sample of {sample_size}')
    log.info(
-        f'from a total population of {population_size}, this is equivalent to {round(sample_size*100/population_size, 1)}% of your data.')  # noqa
+        f'from a total population of {population_size}, this is equivalent to {round(sample_size*100/population_size, 1)}% of your data.')  # noqa

    nr_procs = get_nr_procs(df=sample_df)
    pool_size = min(nr_procs, len(sample_df.columns.values))
