6262 Timestamp ,
6363 isna ,
6464 to_datetime ,
65- to_timedelta ,
6665)
6766from pandas .core .frame import DataFrame
6867from pandas .core .indexes .base import Index
232231
233232
234233stata_epoch : Final = datetime (1960 , 1 , 1 )
234+ unix_epoch : Final = datetime (1970 , 1 , 1 )
235235
236236
237237def _stata_elapsed_date_to_datetime_vec (dates : Series , fmt : str ) -> Series :
@@ -256,7 +256,7 @@ def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series:
256256 >>> dates = pd.Series([52])
257257 >>> _stata_elapsed_date_to_datetime_vec(dates , "%tw")
258258 0 1961-01-01
259- dtype: datetime64[ns ]
259+ dtype: datetime64[s ]
260260
261261 Notes
262262 -----
@@ -280,76 +280,51 @@ def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series:
280280 date - ty
281281 years since 0000
282282 """
283- MIN_YEAR , MAX_YEAR = Timestamp .min .year , Timestamp .max .year
284- MAX_DAY_DELTA = (Timestamp .max - datetime (1960 , 1 , 1 )).days
285- MIN_DAY_DELTA = (Timestamp .min - datetime (1960 , 1 , 1 )).days
286- MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000
287- MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000
288283
289- def convert_year_month_safe (year , month ) -> Series :
290- """
291- Convert year and month to datetimes, using pandas vectorized versions
292- when the date range falls within the range supported by pandas.
293- Otherwise it falls back to a slower but more robust method
294- using datetime.
295- """
296- if year .max () < MAX_YEAR and year .min () > MIN_YEAR :
297- return to_datetime (100 * year + month , format = "%Y%m" )
298- else :
299- index = getattr (year , "index" , None )
300- return Series ([datetime (y , m , 1 ) for y , m in zip (year , month )], index = index )
301-
302- def convert_year_days_safe (year , days ) -> Series :
303- """
304- Converts year (e.g. 1999) and days since the start of the year to a
305- datetime or datetime64 Series
306- """
307- if year .max () < (MAX_YEAR - 1 ) and year .min () > MIN_YEAR :
308- return to_datetime (year , format = "%Y" ) + to_timedelta (days , unit = "d" )
309- else :
310- index = getattr (year , "index" , None )
311- value = [
312- datetime (y , 1 , 1 ) + timedelta (days = int (d )) for y , d in zip (year , days )
313- ]
314- return Series (value , index = index )
284+ if fmt .startswith (("%tc" , "tc" )):
285+ # Delta ms relative to base
286+ td = np .timedelta64 (stata_epoch - unix_epoch , "ms" )
287+ res = np .array (dates ._values , dtype = "M8[ms]" ) + td
288+ return Series (res , index = dates .index )
315289
316- def convert_delta_safe (base , deltas , unit ) -> Series :
317- """
318- Convert base dates and deltas to datetimes, using pandas vectorized
319- versions if the deltas satisfy restrictions required to be expressed
320- as dates in pandas.
321- """
322- index = getattr (deltas , "index" , None )
323- if unit == "d" :
324- if deltas .max () > MAX_DAY_DELTA or deltas .min () < MIN_DAY_DELTA :
325- values = [base + timedelta (days = int (d )) for d in deltas ]
326- return Series (values , index = index )
327- elif unit == "ms" :
328- if deltas .max () > MAX_MS_DELTA or deltas .min () < MIN_MS_DELTA :
329- values = [
330- base + timedelta (microseconds = (int (d ) * 1000 )) for d in deltas
331- ]
332- return Series (values , index = index )
333- else :
334- raise ValueError ("format not understood" )
335- base = to_datetime (base )
336- deltas = to_timedelta (deltas , unit = unit )
337- return base + deltas
290+ elif fmt .startswith (("%td" , "td" , "%d" , "d" )):
291+ # Delta days relative to base
292+ td = np .timedelta64 (stata_epoch - unix_epoch , "D" )
293+ res = np .array (dates ._values , dtype = "M8[D]" ) + td
294+ return Series (res , index = dates .index )
295+
296+ elif fmt .startswith (("%tm" , "tm" )):
297+ # Delta months relative to base
298+ ordinals = dates + (stata_epoch .year - unix_epoch .year ) * 12
299+ res = np .array (ordinals , dtype = "M8[M]" ).astype ("M8[s]" )
300+ return Series (res , index = dates .index )
301+
302+ elif fmt .startswith (("%tq" , "tq" )):
303+ # Delta quarters relative to base
304+ ordinals = dates + (stata_epoch .year - unix_epoch .year ) * 4
305+ res = np .array (ordinals , dtype = "M8[3M]" ).astype ("M8[s]" )
306+ return Series (res , index = dates .index )
307+
308+ elif fmt .startswith (("%th" , "th" )):
309+ # Delta half-years relative to base
310+ ordinals = dates + (stata_epoch .year - unix_epoch .year ) * 2
311+ res = np .array (ordinals , dtype = "M8[6M]" ).astype ("M8[s]" )
312+ return Series (res , index = dates .index )
313+
314+ elif fmt .startswith (("%ty" , "ty" )):
315+ # Years -- not delta
316+ ordinals = dates - 1970
317+ res = np .array (ordinals , dtype = "M8[Y]" ).astype ("M8[s]" )
318+ return Series (res , index = dates .index )
338319
339- # TODO(non-nano): If/when pandas supports more than datetime64[ns], this
340- # should be improved to use correct range, e.g. datetime[Y] for yearly
341320 bad_locs = np .isnan (dates )
342321 has_bad_values = False
343322 if bad_locs .any ():
344323 has_bad_values = True
345324 dates ._values [bad_locs ] = 1.0 # Replace with NaT
346325 dates = dates .astype (np .int64 )
347326
348- if fmt .startswith (("%tc" , "tc" )): # Delta ms relative to base
349- base = stata_epoch
350- ms = dates
351- conv_dates = convert_delta_safe (base , ms , "ms" )
352- elif fmt .startswith (("%tC" , "tC" )):
327+ if fmt .startswith (("%tC" , "tC" )):
353328 warnings .warn (
354329 "Encountered %tC format. Leaving in Stata Internal Format." ,
355330 stacklevel = find_stack_level (),
@@ -358,33 +333,18 @@ def convert_delta_safe(base, deltas, unit) -> Series:
358333 if has_bad_values :
359334 conv_dates [bad_locs ] = NaT
360335 return conv_dates
361- # Delta days relative to base
362- elif fmt .startswith (("%td" , "td" , "%d" , "d" )):
363- base = stata_epoch
364- days = dates
365- conv_dates = convert_delta_safe (base , days , "d" )
366336 # does not count leap days - 7 days is a week.
367337 # 52nd week may have more than 7 days
368338 elif fmt .startswith (("%tw" , "tw" )):
369339 year = stata_epoch .year + dates // 52
370340 days = (dates % 52 ) * 7
371- conv_dates = convert_year_days_safe (year , days )
372- elif fmt .startswith (("%tm" , "tm" )): # Delta months relative to base
373- year = stata_epoch .year + dates // 12
374- month = (dates % 12 ) + 1
375- conv_dates = convert_year_month_safe (year , month )
376- elif fmt .startswith (("%tq" , "tq" )): # Delta quarters relative to base
377- year = stata_epoch .year + dates // 4
378- quarter_month = (dates % 4 ) * 3 + 1
379- conv_dates = convert_year_month_safe (year , quarter_month )
380- elif fmt .startswith (("%th" , "th" )): # Delta half-years relative to base
381- year = stata_epoch .year + dates // 2
382- month = (dates % 2 ) * 6 + 1
383- conv_dates = convert_year_month_safe (year , month )
384- elif fmt .startswith (("%ty" , "ty" )): # Years -- not delta
385- year = dates
386- first_month = np .ones_like (dates )
387- conv_dates = convert_year_month_safe (year , first_month )
341+ per_y = (year - 1970 ).array .view ("Period[Y]" )
342+ per_d = per_y .asfreq ("D" , how = "S" )
343+ per_d_shifted = per_d + days ._values
344+ per_s = per_d_shifted .asfreq ("s" , how = "S" )
345+ conv_dates_arr = per_s .view ("M8[s]" )
346+ conv_dates = Series (conv_dates_arr , index = dates .index )
347+
388348 else :
389349 raise ValueError (f"Date fmt { fmt } not understood" )
390350
@@ -409,24 +369,26 @@ def _datetime_to_stata_elapsed_vec(dates: Series, fmt: str) -> Series:
409369 index = dates .index
410370 NS_PER_DAY = 24 * 3600 * 1000 * 1000 * 1000
411371 US_PER_DAY = NS_PER_DAY / 1000
372+ MS_PER_DAY = NS_PER_DAY / 1_000_000
412373
413374 def parse_dates_safe (
414375 dates : Series , delta : bool = False , year : bool = False , days : bool = False
415376 ):
416377 d = {}
417378 if lib .is_np_dtype (dates .dtype , "M" ):
418379 if delta :
419- time_delta = dates - Timestamp (stata_epoch ).as_unit ("ns" )
420- d ["delta" ] = time_delta ._values .view (np .int64 ) // 1000 # microseconds
380+ time_delta = dates .dt .as_unit ("ms" ) - Timestamp (stata_epoch ).as_unit (
381+ "ms"
382+ )
383+ d ["delta" ] = time_delta ._values .view (np .int64 )
421384 if days or year :
422385 date_index = DatetimeIndex (dates )
423386 d ["year" ] = date_index ._data .year
424387 d ["month" ] = date_index ._data .month
425388 if days :
426- days_in_ns = dates ._values .view (np .int64 ) - to_datetime (
427- d ["year" ], format = "%Y"
428- )._values .view (np .int64 )
429- d ["days" ] = days_in_ns // NS_PER_DAY
389+ year_start = np .asarray (dates ).astype ("M8[Y]" ).astype (dates .dtype )
390+ diff = dates - year_start
391+ d ["days" ] = np .asarray (diff ).astype ("m8[D]" ).view ("int64" )
430392
431393 elif infer_dtype (dates , skipna = False ) == "datetime" :
432394 if delta :
@@ -466,7 +428,7 @@ def g(x: datetime) -> int:
466428
467429 if fmt in ["%tc" , "tc" ]:
468430 d = parse_dates_safe (dates , delta = True )
469- conv_dates = d .delta / 1000
431+ conv_dates = d .delta
470432 elif fmt in ["%tC" , "tC" ]:
471433 warnings .warn (
472434 "Stata Internal Format tC not supported." ,
@@ -475,7 +437,7 @@ def g(x: datetime) -> int:
475437 conv_dates = dates
476438 elif fmt in ["%td" , "td" ]:
477439 d = parse_dates_safe (dates , delta = True )
478- conv_dates = d .delta // US_PER_DAY
440+ conv_dates = d .delta // MS_PER_DAY
479441 elif fmt in ["%tw" , "tw" ]:
480442 d = parse_dates_safe (dates , year = True , days = True )
481443 conv_dates = 52 * (d .year - stata_epoch .year ) + d .days // 7
0 commit comments