From 5d99995a2c7a46ff3fe5e4317e68d4ce24f02872 Mon Sep 17 00:00:00 2001 From: Pedro Fluxa Date: Wed, 16 Aug 2023 16:09:31 +0000 Subject: [PATCH 01/16] added support for timestamps as time (seconds, ms, us, ns and days) since epoch (unix or Julian) --- tests/unit_tests/test_dates.py | 38 ++++++++++++++++++ type_infer/dtype.py | 2 + type_infer/infer.py | 70 +++++++++++++++++++++++++++++----- 3 files changed, 100 insertions(+), 10 deletions(-) diff --git a/tests/unit_tests/test_dates.py b/tests/unit_tests/test_dates.py index 14959ab..8a06563 100644 --- a/tests/unit_tests/test_dates.py +++ b/tests/unit_tests/test_dates.py @@ -5,5 +5,43 @@ class TestDates(unittest.TestCase): + def test_0_type_check_dates(self): + """ Checks parsing of string containing a date to dtype 'date'. + """ self.assertEqual(type_check_date('31/12/2010'), dtype.date) + + def test_1_type_check_datetime(self): + """ Checks parsing of string containing a date to dtype 'datetime'. + """ + self.assertEqual(type_check_date('31/12/2010 23:15:41'), dtype.datetime) + + def test_2_type_check_timestamp_unix_seconds(self): + """ Checks parsing a number containing 1989-12-15T07:30:00 (as seconds + since Unix epoch) to dtype 'timestamp'. + """ + self.assertEqual(type_check_date(629721000.0), dtype.timestamp) + + def test_3_type_check_timestamp_unix_miliseconds(self): + """ Checks parsing a number containing 1989-12-15T07:30:00 (as miliseconds + since Unix epoch) to dtype 'timestamp'. + """ + self.assertEqual(type_check_date(629721000000.0), dtype.timestamp) + + def test_4_type_check_timestamp_unix_microseconds(self): + """ Checks parsing a number containing 1989-12-15T07:30:00 (as microseconds + since Unix epoch) to dtype 'timestamp'. + """ + self.assertEqual(type_check_date(629721000000000.0), dtype.timestamp) + + def test_5_type_check_timestamp_unix_nanoseconds(self): + """ Checks parsing a number containing 1989-12-15T07:30:00 (as nanoseconds + since Unix epoch) to dtype 'timestamp'. + """ + self.assertEqual(type_check_date(629721000000000000.0), dtype.timestamp) + + def test_6_type_check_timestamp_julian_days(self): + """ Checks parsing a number containing 1989-12-15T07:30:00 (as days since + Julian calendar epoch) to dtype 'timestamp'. + """ + self.assertEqual(type_check_date(2447875.81250), dtype.timestamp) \ No newline at end of file diff --git a/type_infer/dtype.py b/type_infer/dtype.py index 9f05a2e..1f4f460 100644 --- a/type_infer/dtype.py +++ b/type_infer/dtype.py @@ -5,6 +5,7 @@ class dtype: - **Numerical**: Data that should be represented in the form of a number. Currently ``integer``, ``float``, and ``quantity`` are supported. - **Categorical**: Data that represents a class or label and is discrete. Currently ``binary``, ``categorical``, and ``tags`` are supported. - **Date/Time**: Time-series data that is temporal/sequential. Currently ``date``, and ``datetime`` are supported. + - **Timestamp**: Data that represents time in the form of the amount of nano/micro/milli-seconds, seconds after midnight 1970-01-01. Julian days are also supported. - **Text**: Data that can be considered as language information. Currently ``short_text``, and ``rich_text`` are supported. Short text has a small vocabulary (~ 100 words) and is generally a limited number of characters. Rich text is anything with greater complexity. - **Complex**: Data types that require custom techniques. Currently ``audio``, ``video`` and ``image`` are available, but highly experimental. - **Array**: Data in the form of a sequence where order must be preserved. ``tsarray`` dtypes are for "normal" columns that will be transformed to arrays at a row-level because they will be treated as time series. @@ -26,6 +27,7 @@ class dtype: # Dates and Times (time-series) date = "date" datetime = "datetime" + timestamp = "timestamp" # Text short_text = "short_text" diff --git a/type_infer/infer.py b/type_infer/infer.py index b54f1b0..226bed4 100644 --- a/type_infer/infer.py +++ b/type_infer/infer.py @@ -128,18 +128,68 @@ def type_check_sequence(element: object) -> str: def type_check_date(element: object) -> str: + """ + Check if element corresponds to a date-like object. + """ + # check if element represents a unix-timestamp + isTimestamp = False + # check if element represents a date (no hour/minute/seconds) + isDate = False + # check if element represents a datetime (has hour/minute/seconds) + isDatetime = False + + # check if it makes sense to convert element to unix time-stamp by + # evaluating if, when converted, the element represents a number + # that is compatible with a Unix timestamp (number of seconds since 1970-01-01T:00:00:00) + # note that we also check the number is not larger than the "epochalypse time", + # which is when the unix timestamp becomes larger than 2^32 - 1 seconds. We do + # this because timestamps outside this range are likely to be unreliable and hence + # rather treated as every-day numbers. try: - dt = pd.to_datetime(element) - - # Not accurate 100% for a single datetime str, but should work in aggregate - if dt.hour == 0 and dt.minute == 0 and dt.second == 0 and len(str(element)) <= 16: - return dtype.date + unt = '' + for unt in ['ns', 'us', 'ms', 's']: + dt = pd.to_datetime(element, unit=unt, origin='unix') + if ((dt > pd.to_datetime('1970-01-01T:00:00:00', utc=True)) and \ + (dt < pd.to_datetime('2038-01-19T03:14:08', utc=True))): + isTimestamp = True + break + # yes some kind of people still use Julian Days + dt = pd.to_datetime(element, unit='D', origin='julian') + if ((dt > pd.to_datetime('1970-01-01T:00:00:00', utc=True)) and \ + (dt < pd.to_datetime('2038-01-19T03:14:08', utc=True))): + isTimestamp = True + except Exception as error: + pass + # check if element represents a date-like object. + # here we don't check for a validity range like with unix-timestamps + # because dates as string usually represent something more general than + # just the number of seconds since an epoch. + try: + dt = pd.to_datetime(element, errors='raise') + # round element day (drop hour/minute/second) + dtd = dt.to_period('D').to_timestamp() + # if rounded datetime equals the datetime itself, it means there was not + # hour/minute/second information to begin with. Mind the 'localize' to + # avoid time-zone BS to kick in. + if dtd == dt.tz_localize(None): + isDate = True else: - return dtype.datetime - - except ValueError: - return None - + isDatetime = True + except Exception as error: + pass + + # because of the explicit 'unit' argument when checking for timestamps, + # element cannot be timestamp AND date/datetime. Similarly, it cannot + # be both date and datetime. + rtype = None + if isTimestamp: + rtype = dtype.timestamp + if isDatetime: + rtype = dtype.datetime + if isDate: + rtype = dtype.date + + return rtype def count_data_types_in_column(data): dtype_counts = Counter() From 3198216b4edba396259fca3284af36969fcebee5 Mon Sep 17 00:00:00 2001 From: Pedro Fluxa Date: Wed, 16 Aug 2023 19:34:44 +0000 Subject: [PATCH 02/16] minor refactor to comply with PyLint --- type_infer/infer.py | 75 +++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 34 deletions(-) diff --git a/type_infer/infer.py b/type_infer/infer.py index 226bed4..d426f4a 100644 --- a/type_infer/infer.py +++ b/type_infer/infer.py @@ -132,65 +132,72 @@ def type_check_date(element: object) -> str: Check if element corresponds to a date-like object. """ # check if element represents a unix-timestamp - isTimestamp = False + is_timestamp = False # check if element represents a date (no hour/minute/seconds) - isDate = False + is_date = False # check if element represents a datetime (has hour/minute/seconds) - isDatetime = False + is_datetime = False - # check if it makes sense to convert element to unix time-stamp by - # evaluating if, when converted, the element represents a number - # that is compatible with a Unix timestamp (number of seconds since 1970-01-01T:00:00:00) + # check if it makes sense to convert element to unix time-stamp by + # evaluating if, when converted, the element represents a number that + # is compatible with a Unix timestamp (number of seconds since 1970-01-01T:00:00:00) # note that we also check the number is not larger than the "epochalypse time", # which is when the unix timestamp becomes larger than 2^32 - 1 seconds. We do # this because timestamps outside this range are likely to be unreliable and hence # rather treated as every-day numbers. - try: - unt = '' - for unt in ['ns', 'us', 'ms', 's']: - dt = pd.to_datetime(element, unit=unt, origin='unix') - if ((dt > pd.to_datetime('1970-01-01T:00:00:00', utc=True)) and \ - (dt < pd.to_datetime('2038-01-19T03:14:08', utc=True))): - isTimestamp = True - break - # yes some kind of people still use Julian Days - dt = pd.to_datetime(element, unit='D', origin='julian') - if ((dt > pd.to_datetime('1970-01-01T:00:00:00', utc=True)) and \ - (dt < pd.to_datetime('2038-01-19T03:14:08', utc=True))): - isTimestamp = True - except Exception as error: - pass - # check if element represents a date-like object. + min_dt = pd.to_datetime('1970-01-01T:00:00:00', utc=True) + max_dt = pd.to_datetime('2038-01-19T:03:14:08', utc=True) + valid_units = ['ns', 'us', 'ms', 's', 'D'] + for unit in valid_units: + # Yes, some people still use Julian Days... + if unit == 'D': + try: + as_dt = pd.to_datetime(element, unit=unit, origin='julian', errors='raise') + if min_dt < as_dt < max_dt: + is_timestamp = True + break + except Exception: + pass + else: + try: + as_dt = pd.to_datetime(element, unit=unit, origin='unix', errors='raise') + if min_dt < as_dt < max_dt: + is_timestamp = True + break + except Exception: + pass + # check if element represents a date-like object. # here we don't check for a validity range like with unix-timestamps # because dates as string usually represent something more general than # just the number of seconds since an epoch. try: - dt = pd.to_datetime(element, errors='raise') + as_dt = pd.to_datetime(element, errors='raise') # round element day (drop hour/minute/second) - dtd = dt.to_period('D').to_timestamp() + dt_d = as_dt.to_period('D').to_timestamp() # if rounded datetime equals the datetime itself, it means there was not # hour/minute/second information to begin with. Mind the 'localize' to # avoid time-zone BS to kick in. - if dtd == dt.tz_localize(None): - isDate = True + if dt_d == as_dt.tz_localize(None): + is_date = True else: - isDatetime = True - except Exception as error: + is_datetime = True + except Exception: pass - + # because of the explicit 'unit' argument when checking for timestamps, # element cannot be timestamp AND date/datetime. Similarly, it cannot # be both date and datetime. rtype = None - if isTimestamp: + if is_timestamp: rtype = dtype.timestamp - if isDatetime: + if is_datetime: rtype = dtype.datetime - if isDate: + if is_date: rtype = dtype.date - + return rtype + def count_data_types_in_column(data): dtype_counts = Counter() @@ -441,7 +448,7 @@ def infer_types( population_size = len(data) log.info(f'Analyzing a sample of {sample_size}') log.info( - f'from a total population of {population_size}, this is equivalent to {round(sample_size*100/population_size, 1)}% of your data.') # noqa + f'from a total population of {population_size}, this is equivalent to {round(sample_size*100/population_size, 1)}% of your data.') # noqa nr_procs = get_nr_procs(df=sample_df) pool_size = min(nr_procs, len(sample_df.columns.values)) From 1a45efe66562241cb22e352a1a7d5d433c6262ba Mon Sep 17 00:00:00 2001 From: Pedro Fluxa Date: Wed, 16 Aug 2023 19:37:48 +0000 Subject: [PATCH 03/16] removed whitespaces --- tests/unit_tests/test_dates.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/unit_tests/test_dates.py b/tests/unit_tests/test_dates.py index 8a06563..2b79b5b 100644 --- a/tests/unit_tests/test_dates.py +++ b/tests/unit_tests/test_dates.py @@ -10,18 +10,18 @@ def test_0_type_check_dates(self): """ Checks parsing of string containing a date to dtype 'date'. """ self.assertEqual(type_check_date('31/12/2010'), dtype.date) - + def test_1_type_check_datetime(self): """ Checks parsing of string containing a date to dtype 'datetime'. """ self.assertEqual(type_check_date('31/12/2010 23:15:41'), dtype.datetime) - + def test_2_type_check_timestamp_unix_seconds(self): """ Checks parsing a number containing 1989-12-15T07:30:00 (as seconds since Unix epoch) to dtype 'timestamp'. """ self.assertEqual(type_check_date(629721000.0), dtype.timestamp) - + def test_3_type_check_timestamp_unix_miliseconds(self): """ Checks parsing a number containing 1989-12-15T07:30:00 (as miliseconds since Unix epoch) to dtype 'timestamp'. @@ -33,15 +33,15 @@ def test_4_type_check_timestamp_unix_microseconds(self): since Unix epoch) to dtype 'timestamp'. """ self.assertEqual(type_check_date(629721000000000.0), dtype.timestamp) - + def test_5_type_check_timestamp_unix_nanoseconds(self): """ Checks parsing a number containing 1989-12-15T07:30:00 (as nanoseconds since Unix epoch) to dtype 'timestamp'. """ self.assertEqual(type_check_date(629721000000000000.0), dtype.timestamp) - + def test_6_type_check_timestamp_julian_days(self): """ Checks parsing a number containing 1989-12-15T07:30:00 (as days since Julian calendar epoch) to dtype 'timestamp'. """ - self.assertEqual(type_check_date(2447875.81250), dtype.timestamp) \ No newline at end of file + self.assertEqual(type_check_date(2447875.81250), dtype.timestamp) From 9c4223a574bb9506d9936d266eb2c070378368a4 Mon Sep 17 00:00:00 2001 From: Pedro Fluxa Date: Wed, 16 Aug 2023 19:41:47 +0000 Subject: [PATCH 04/16] typo in min_dt and max_dt --- type_infer/infer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/type_infer/infer.py b/type_infer/infer.py index d426f4a..2998439 100644 --- a/type_infer/infer.py +++ b/type_infer/infer.py @@ -145,8 +145,8 @@ def type_check_date(element: object) -> str: # which is when the unix timestamp becomes larger than 2^32 - 1 seconds. We do # this because timestamps outside this range are likely to be unreliable and hence # rather treated as every-day numbers. - min_dt = pd.to_datetime('1970-01-01T:00:00:00', utc=True) - max_dt = pd.to_datetime('2038-01-19T:03:14:08', utc=True) + min_dt = pd.to_datetime('1970-01-01 00:00:00', utc=True) + max_dt = pd.to_datetime('2038-01-19 03:14:08', utc=True) valid_units = ['ns', 'us', 'ms', 's', 'D'] for unit in valid_units: # Yes, some people still use Julian Days... From 0b164440b541eac24b64280c4ad88fa9b64d8974 Mon Sep 17 00:00:00 2001 From: Pedro Fluxa Date: Wed, 16 Aug 2023 19:57:37 +0000 Subject: [PATCH 05/16] prioritize returning timestamps when found. --- type_infer/infer.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/type_infer/infer.py b/type_infer/infer.py index 2998439..4cc879d 100644 --- a/type_infer/infer.py +++ b/type_infer/infer.py @@ -166,6 +166,9 @@ def type_check_date(element: object) -> str: break except Exception: pass + if is_timestamp: + return dtype.timestamp + # check if element represents a date-like object. # here we don't check for a validity range like with unix-timestamps # because dates as string usually represent something more general than @@ -184,19 +187,12 @@ def type_check_date(element: object) -> str: except Exception: pass - # because of the explicit 'unit' argument when checking for timestamps, - # element cannot be timestamp AND date/datetime. Similarly, it cannot - # be both date and datetime. - rtype = None - if is_timestamp: - rtype = dtype.timestamp if is_datetime: - rtype = dtype.datetime + return dtype.datetime if is_date: - rtype = dtype.date - - return rtype + return dtype.date + return None def count_data_types_in_column(data): dtype_counts = Counter() From f5e02d0bca38a59142263f87f9ab57f01941a989 Mon Sep 17 00:00:00 2001 From: Pedro Fluxa Date: Wed, 16 Aug 2023 20:05:18 +0000 Subject: [PATCH 06/16] forced flake8 checks... --- type_infer/infer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/type_infer/infer.py b/type_infer/infer.py index 4cc879d..c3af676 100644 --- a/type_infer/infer.py +++ b/type_infer/infer.py @@ -194,6 +194,7 @@ def type_check_date(element: object) -> str: return None + def count_data_types_in_column(data): dtype_counts = Counter() From e2d236a88a8747bc487a334cc9044c405e66e81b Mon Sep 17 00:00:00 2001 From: Pedro Fluxa Date: Wed, 16 Aug 2023 20:22:25 +0000 Subject: [PATCH 07/16] datetime and timestamp are the same --- tests/unit_tests/test_dates.py | 20 ++++++++++---------- type_infer/dtype.py | 2 -- type_infer/infer.py | 28 +++++++++++----------------- 3 files changed, 21 insertions(+), 29 deletions(-) diff --git a/tests/unit_tests/test_dates.py b/tests/unit_tests/test_dates.py index 2b79b5b..6e8bfe7 100644 --- a/tests/unit_tests/test_dates.py +++ b/tests/unit_tests/test_dates.py @@ -18,30 +18,30 @@ def test_1_type_check_datetime(self): def test_2_type_check_timestamp_unix_seconds(self): """ Checks parsing a number containing 1989-12-15T07:30:00 (as seconds - since Unix epoch) to dtype 'timestamp'. + since Unix epoch) to dtype 'datetime'. """ - self.assertEqual(type_check_date(629721000.0), dtype.timestamp) + self.assertEqual(type_check_date(629721000.0), dtype.datetime) def test_3_type_check_timestamp_unix_miliseconds(self): """ Checks parsing a number containing 1989-12-15T07:30:00 (as miliseconds - since Unix epoch) to dtype 'timestamp'. + since Unix epoch) to dtype 'datetime'. """ - self.assertEqual(type_check_date(629721000000.0), dtype.timestamp) + self.assertEqual(type_check_date(629721000000.0), dtype.datetime) def test_4_type_check_timestamp_unix_microseconds(self): """ Checks parsing a number containing 1989-12-15T07:30:00 (as microseconds - since Unix epoch) to dtype 'timestamp'. + since Unix epoch) to dtype 'datetime'. """ - self.assertEqual(type_check_date(629721000000000.0), dtype.timestamp) + self.assertEqual(type_check_date(629721000000000.0), dtype.datetime) def test_5_type_check_timestamp_unix_nanoseconds(self): """ Checks parsing a number containing 1989-12-15T07:30:00 (as nanoseconds - since Unix epoch) to dtype 'timestamp'. + since Unix epoch) to dtype 'datetime'. """ - self.assertEqual(type_check_date(629721000000000000.0), dtype.timestamp) + self.assertEqual(type_check_date(629721000000000000.0), dtype.datetime) def test_6_type_check_timestamp_julian_days(self): """ Checks parsing a number containing 1989-12-15T07:30:00 (as days since - Julian calendar epoch) to dtype 'timestamp'. + Julian calendar epoch) to dtype 'datetime'. """ - self.assertEqual(type_check_date(2447875.81250), dtype.timestamp) + self.assertEqual(type_check_date(2447875.81250), dtype.datetime) diff --git a/type_infer/dtype.py b/type_infer/dtype.py index 1f4f460..9f05a2e 100644 --- a/type_infer/dtype.py +++ b/type_infer/dtype.py @@ -5,7 +5,6 @@ class dtype: - **Numerical**: Data that should be represented in the form of a number. Currently ``integer``, ``float``, and ``quantity`` are supported. - **Categorical**: Data that represents a class or label and is discrete. Currently ``binary``, ``categorical``, and ``tags`` are supported. - **Date/Time**: Time-series data that is temporal/sequential. Currently ``date``, and ``datetime`` are supported. - - **Timestamp**: Data that represents time in the form of the amount of nano/micro/milli-seconds, seconds after midnight 1970-01-01. Julian days are also supported. - **Text**: Data that can be considered as language information. Currently ``short_text``, and ``rich_text`` are supported. Short text has a small vocabulary (~ 100 words) and is generally a limited number of characters. Rich text is anything with greater complexity. - **Complex**: Data types that require custom techniques. Currently ``audio``, ``video`` and ``image`` are available, but highly experimental. - **Array**: Data in the form of a sequence where order must be preserved. ``tsarray`` dtypes are for "normal" columns that will be transformed to arrays at a row-level because they will be treated as time series. @@ -27,7 +26,6 @@ class dtype: # Dates and Times (time-series) date = "date" datetime = "datetime" - timestamp = "timestamp" # Text short_text = "short_text" diff --git a/type_infer/infer.py b/type_infer/infer.py index c3af676..836f9e1 100644 --- a/type_infer/infer.py +++ b/type_infer/infer.py @@ -131,13 +131,10 @@ def type_check_date(element: object) -> str: """ Check if element corresponds to a date-like object. """ - # check if element represents a unix-timestamp - is_timestamp = False # check if element represents a date (no hour/minute/seconds) is_date = False # check if element represents a datetime (has hour/minute/seconds) is_datetime = False - # check if it makes sense to convert element to unix time-stamp by # evaluating if, when converted, the element represents a number that # is compatible with a Unix timestamp (number of seconds since 1970-01-01T:00:00:00) @@ -154,7 +151,7 @@ def type_check_date(element: object) -> str: try: as_dt = pd.to_datetime(element, unit=unit, origin='julian', errors='raise') if min_dt < as_dt < max_dt: - is_timestamp = True + is_datetime = True break except Exception: pass @@ -162,35 +159,32 @@ def type_check_date(element: object) -> str: try: as_dt = pd.to_datetime(element, unit=unit, origin='unix', errors='raise') if min_dt < as_dt < max_dt: - is_timestamp = True + is_datetime = True break except Exception: pass - if is_timestamp: - return dtype.timestamp - # check if element represents a date-like object. # here we don't check for a validity range like with unix-timestamps # because dates as string usually represent something more general than # just the number of seconds since an epoch. try: as_dt = pd.to_datetime(element, errors='raise') + is_datetime = True + except Exception: + pass + # finally, if element is represents a datetime object, check if only + # date part is contained (no time information) + if is_datetime: # round element day (drop hour/minute/second) dt_d = as_dt.to_period('D').to_timestamp() # if rounded datetime equals the datetime itself, it means there was not # hour/minute/second information to begin with. Mind the 'localize' to # avoid time-zone BS to kick in. - if dt_d == as_dt.tz_localize(None): - is_date = True - else: - is_datetime = True - except Exception: - pass - - if is_datetime: - return dtype.datetime + is_date = dt_d == as_dt.tz_localize(None) if is_date: return dtype.date + if is_datetime: + return dtype.datetime return None From 7ecbf7ffc86bb61d73da221fdca98c16a530a522 Mon Sep 17 00:00:00 2001 From: Pedro Fluxa Date: Mon, 21 Aug 2023 15:52:07 +0000 Subject: [PATCH 08/16] minor change to de-duplicate code. --- type_infer/infer.py | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/type_infer/infer.py b/type_infer/infer.py index 836f9e1..b89ce9a 100644 --- a/type_infer/infer.py +++ b/type_infer/infer.py @@ -144,25 +144,17 @@ def type_check_date(element: object) -> str: # rather treated as every-day numbers. min_dt = pd.to_datetime('1970-01-01 00:00:00', utc=True) max_dt = pd.to_datetime('2038-01-19 03:14:08', utc=True) - valid_units = ['ns', 'us', 'ms', 's', 'D'] - for unit in valid_units: - # Yes, some people still use Julian Days... - if unit == 'D': - try: - as_dt = pd.to_datetime(element, unit=unit, origin='julian', errors='raise') - if min_dt < as_dt < max_dt: - is_datetime = True - break - except Exception: - pass - else: - try: - as_dt = pd.to_datetime(element, unit=unit, origin='unix', errors='raise') - if min_dt < as_dt < max_dt: - is_datetime = True - break - except Exception: - pass + valid_units = {'ns': 'unix', 'us': 'unix', 'ms': 'unix', 's': 'unix', + # Yes, some people still use Julian Days... + 'D': 'julian'} + for unit, origin in valid_units.items(): + try: + as_dt = pd.to_datetime(element, unit=unit, origin=origin, errors='raise') + if min_dt < as_dt < max_dt: + is_datetime = True + break + except Exception: + pass # check if element represents a date-like object. # here we don't check for a validity range like with unix-timestamps # because dates as string usually represent something more general than From 4f1fe29886d76d39a37db2917b9c81af14464103 Mon Sep 17 00:00:00 2001 From: Pedro Fluxa Date: Mon, 21 Aug 2023 15:52:39 +0000 Subject: [PATCH 09/16] fixed typo --- type_infer/infer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/type_infer/infer.py b/type_infer/infer.py index b89ce9a..33dd73b 100644 --- a/type_infer/infer.py +++ b/type_infer/infer.py @@ -145,7 +145,6 @@ def type_check_date(element: object) -> str: min_dt = pd.to_datetime('1970-01-01 00:00:00', utc=True) max_dt = pd.to_datetime('2038-01-19 03:14:08', utc=True) valid_units = {'ns': 'unix', 'us': 'unix', 'ms': 'unix', 's': 'unix', - # Yes, some people still use Julian Days... 'D': 'julian'} for unit, origin in valid_units.items(): try: From d6ed8fa41913585846d09047358962970d14dddb Mon Sep 17 00:00:00 2001 From: Pedro Fluxa Date: Mon, 21 Aug 2023 15:53:11 +0000 Subject: [PATCH 10/16] conform to pylint and flake8 --- type_infer/infer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/type_infer/infer.py b/type_infer/infer.py index 33dd73b..dd329f3 100644 --- a/type_infer/infer.py +++ b/type_infer/infer.py @@ -148,7 +148,8 @@ def type_check_date(element: object) -> str: 'D': 'julian'} for unit, origin in valid_units.items(): try: - as_dt = pd.to_datetime(element, unit=unit, origin=origin, errors='raise') + as_dt = pd.to_datetime(element, unit=unit, origin=origin, + errors='raise') if min_dt < as_dt < max_dt: is_datetime = True break From 6290c8595fc6b8ab434243cdac10bac94cd562ff Mon Sep 17 00:00:00 2001 From: Tom Hudson <34073127+tomhuds@users.noreply.github.com> Date: Tue, 26 Sep 2023 14:39:18 -0700 Subject: [PATCH 11/16] Delete .github/workflows/add_to_bugs_project.yml --- .github/workflows/add_to_bugs_project.yml | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 .github/workflows/add_to_bugs_project.yml diff --git a/.github/workflows/add_to_bugs_project.yml b/.github/workflows/add_to_bugs_project.yml deleted file mode 100644 index 1326053..0000000 --- a/.github/workflows/add_to_bugs_project.yml +++ /dev/null @@ -1,19 +0,0 @@ -name: Add issue to bugs project - -on: - issues: - types: - - opened - -jobs: - add-to-project: - name: Add issue to bugs project - runs-on: ubuntu-latest - steps: - - uses: actions/add-to-project@v0.4.0 - with: - # You can target a repository in a different organization - # to the issue - project-url: https://github.com/orgs/mindsdb/projects/53 - github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} - labeled: bug From 1f5d2713273090a617faacfe82c5d5cbbb2d9249 Mon Sep 17 00:00:00 2001 From: Tom Hudson <34073127+tomhuds@users.noreply.github.com> Date: Tue, 26 Sep 2023 14:39:33 -0700 Subject: [PATCH 12/16] Delete .github/workflows/add_to_docs_project.yml --- .github/workflows/add_to_docs_project.yml | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 .github/workflows/add_to_docs_project.yml diff --git a/.github/workflows/add_to_docs_project.yml b/.github/workflows/add_to_docs_project.yml deleted file mode 100644 index ac34b2e..0000000 --- a/.github/workflows/add_to_docs_project.yml +++ /dev/null @@ -1,19 +0,0 @@ -name: Add issue to docs project - -on: - issues: - types: - - opened - -jobs: - add-to-project: - name: Add issue to docs project - runs-on: ubuntu-latest - steps: - - uses: actions/add-to-project@v0.4.0 - with: - # You can target a repository in a different organization - # to the issue - project-url: https://github.com/orgs/mindsdb/projects/32 - github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} - labeled: documentation From ab9c8dc73ae98b45698dd67c1a8a1afde85513d1 Mon Sep 17 00:00:00 2001 From: Tom Hudson <34073127+tomhuds@users.noreply.github.com> Date: Tue, 26 Sep 2023 14:39:42 -0700 Subject: [PATCH 13/16] Delete .github/workflows/add_to_roadmap_project.yml --- .github/workflows/add_to_roadmap_project.yml | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 .github/workflows/add_to_roadmap_project.yml diff --git a/.github/workflows/add_to_roadmap_project.yml b/.github/workflows/add_to_roadmap_project.yml deleted file mode 100644 index 4aec947..0000000 --- a/.github/workflows/add_to_roadmap_project.yml +++ /dev/null @@ -1,19 +0,0 @@ -name: Add issue to roadmap project - -on: - issues: - types: - - opened - -jobs: - add-to-project: - name: Add issue to roadmap project - runs-on: ubuntu-latest - steps: - - uses: actions/add-to-project@v0.4.0 - with: - # You can target a repository in a different organization - # to the issue - project-url: https://github.com/orgs/mindsdb/projects/54 - github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} - labeled: enhancement From 20142b85018ed6c9ae5b02dab67da47112bc6e43 Mon Sep 17 00:00:00 2001 From: Tom Hudson <34073127+tomhuds@users.noreply.github.com> Date: Tue, 26 Sep 2023 14:39:56 -0700 Subject: [PATCH 14/16] Add files via upload --- .github/workflows/add_to_pr_review.yml | 16 ++++++++++++++++ .github/workflows/add_to_roadmap_project_v2.yml | 14 ++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 .github/workflows/add_to_pr_review.yml create mode 100644 .github/workflows/add_to_roadmap_project_v2.yml diff --git a/.github/workflows/add_to_pr_review.yml b/.github/workflows/add_to_pr_review.yml new file mode 100644 index 0000000..384f2be --- /dev/null +++ b/.github/workflows/add_to_pr_review.yml @@ -0,0 +1,16 @@ +name: Add Pull Requests to PR review project + +on: + pull_request: + types: + - opened + +jobs: + add-to-project: + name: Add issue to project + runs-on: ubuntu-latest + steps: + - uses: actions/add-to-project@v0.5.0 + with: + project-url: https://github.com/orgs/mindsdb/projects/65 + github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} diff --git a/.github/workflows/add_to_roadmap_project_v2.yml b/.github/workflows/add_to_roadmap_project_v2.yml new file mode 100644 index 0000000..240c700 --- /dev/null +++ b/.github/workflows/add_to_roadmap_project_v2.yml @@ -0,0 +1,14 @@ +name: Add issue to roadmap project +on: + issues: + types: + - opened +jobs: + add-to-project: + name: Add issue to roadmap project + runs-on: ubuntu-latest + steps: + - uses: actions/add-to-project@v0.4.0 + with: + project-url: https://github.com/orgs/mindsdb/projects/53 + github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} \ No newline at end of file From 01cec97ffc0e97876402d2545218af458b8a5b6f Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 27 Nov 2023 13:54:35 +0900 Subject: [PATCH 15/16] version bump: 0.0.16 --- pyproject.toml | 2 +- type_infer/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3cb3478..61d3bf9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "type_infer" -version = "0.0.15" +version = "0.0.16" description = "Automated type inference for Machine Learning pipelines." authors = ["MindsDB Inc. "] license = "GPL-3.0" diff --git a/type_infer/__init__.py b/type_infer/__init__.py index e163350..2e45234 100644 --- a/type_infer/__init__.py +++ b/type_infer/__init__.py @@ -4,7 +4,7 @@ from type_infer import helpers -__version__ = '0.0.15' +__version__ = '0.0.16' __all__ = ['base', 'dtype', 'infer', 'helpers', '__version__'] From 2a2ff0c5231c4843bb6c629b3d604dccbfa4b10e Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Wed, 29 Nov 2023 19:10:39 +0900 Subject: [PATCH 16/16] update docs deps --- .github/workflows/docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 226c7e3..cfbc836 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -22,7 +22,7 @@ jobs: run: | sudo apt install pandoc python -m pip install --upgrade pip - pip install install 'Sphinx==4.1.2' 'sphinx-autoapi==1.8.4' 'sphinx-autodoc-typehints==1.12.0' 'sphinx-code-include==1.1.1' 'sphinx-rtd-theme==0.5.2' 'sphinxcontrib-applehelp==1.0.2' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.0' 'sphinxcontrib-jsmath==1.0.1' 'sphinxcontrib-napoleon==0.7' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' autoapi nbsphinx myst_parser pandoc jupyter matplotlib imblearn fsspec + pip install install 'Sphinx==6.2.1' 'sphinx-autoapi==3.0.0' 'sphinx-autodoc-typehints' 'sphinx-code-include' 'sphinx-rtd-theme' 'sphinxcontrib-applehelp' 'sphinxcontrib-devhelp' 'sphinxcontrib-htmlhelp' 'sphinxcontrib-jsmath' 'sphinxcontrib-napoleon' 'sphinxcontrib-qthelp' 'sphinxcontrib-serializinghtml' autoapi nbsphinx myst_parser pandoc jupyter matplotlib imblearn fsspec pip install --no-cache-dir -e . - name: Make the docs run: |