
Commit a6d17f7

uros-db authored and zhengruifeng committed
[SPARK-53956][PYTHON] Support TIME in the try_make_timestamp function in PySpark
### What changes were proposed in this pull request?
Implement support for the TIME type in the `try_make_timestamp` function in the PySpark API.

### Why are the changes needed?
Expand API support for the `TryMakeTimestamp` expression.

### Does this PR introduce _any_ user-facing change?
Yes, the new function is now available in the PySpark API.

### How was this patch tested?
Added appropriate Python function tests and examples.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #52666 from uros-db/python-try-make_timestamp.

Authored-by: Uros Bojanic <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
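For context, a minimal sketch of the new DATE/TIME calling form, adapted from the doctest examples added in this patch (assumes an active SparkSession named `spark` on a build with TIME type support):

from datetime import date, time
import pyspark.sql.functions as sf

df = spark.range(1).select(
    sf.lit(date(2014, 12, 28)).alias("d"),
    sf.lit(time(6, 30, 45, 887000)).alias("t"),
)
# Combine a DATE column and a TIME column into a timestamp; invalid inputs yield NULL.
df.select(sf.try_make_timestamp(date=df.d, time=df.t)).show(truncate=False)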
1 parent 0be5f96 commit a6d17f7

File tree

3 files changed (+473, -39 lines)


python/pyspark/sql/connect/functions/builtin.py

Lines changed: 82 additions & 7 deletions
@@ -3967,23 +3967,98 @@ def make_timestamp(
 make_timestamp.__doc__ = pysparkfuncs.make_timestamp.__doc__
 
 
+@overload
 def try_make_timestamp(
     years: "ColumnOrName",
     months: "ColumnOrName",
     days: "ColumnOrName",
     hours: "ColumnOrName",
     mins: "ColumnOrName",
     secs: "ColumnOrName",
+) -> Column:
+    ...
+
+
+@overload
+def try_make_timestamp(
+    years: "ColumnOrName",
+    months: "ColumnOrName",
+    days: "ColumnOrName",
+    hours: "ColumnOrName",
+    mins: "ColumnOrName",
+    secs: "ColumnOrName",
+    timezone: "ColumnOrName",
+) -> Column:
+    ...
+
+
+@overload
+def try_make_timestamp(*, date: "ColumnOrName", time: "ColumnOrName") -> Column:
+    ...
+
+
+@overload
+def try_make_timestamp(
+    *, date: "ColumnOrName", time: "ColumnOrName", timezone: "ColumnOrName"
+) -> Column:
+    ...
+
+
+def try_make_timestamp(
+    years: Optional["ColumnOrName"] = None,
+    months: Optional["ColumnOrName"] = None,
+    days: Optional["ColumnOrName"] = None,
+    hours: Optional["ColumnOrName"] = None,
+    mins: Optional["ColumnOrName"] = None,
+    secs: Optional["ColumnOrName"] = None,
     timezone: Optional["ColumnOrName"] = None,
+    date: Optional["ColumnOrName"] = None,
+    time: Optional["ColumnOrName"] = None,
 ) -> Column:
-    if timezone is not None:
-        return _invoke_function_over_columns(
-            "try_make_timestamp", years, months, days, hours, mins, secs, timezone
-        )
+    if years is not None:
+        if any(arg is not None for arg in [date, time]):
+            raise PySparkValueError(
+                errorClass="CANNOT_SET_TOGETHER",
+                messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
+            )
+        if timezone is not None:
+            return _invoke_function_over_columns(
+                "try_make_timestamp",
+                cast("ColumnOrName", years),
+                cast("ColumnOrName", months),
+                cast("ColumnOrName", days),
+                cast("ColumnOrName", hours),
+                cast("ColumnOrName", mins),
+                cast("ColumnOrName", secs),
+                cast("ColumnOrName", timezone),
+            )
+        else:
+            return _invoke_function_over_columns(
+                "try_make_timestamp",
+                cast("ColumnOrName", years),
+                cast("ColumnOrName", months),
+                cast("ColumnOrName", days),
+                cast("ColumnOrName", hours),
+                cast("ColumnOrName", mins),
+                cast("ColumnOrName", secs),
+            )
     else:
-        return _invoke_function_over_columns(
-            "try_make_timestamp", years, months, days, hours, mins, secs
-        )
+        if any(arg is not None for arg in [years, months, days, hours, mins, secs]):
+            raise PySparkValueError(
+                errorClass="CANNOT_SET_TOGETHER",
+                messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
+            )
+        if timezone is not None:
+            return _invoke_function_over_columns(
+                "try_make_timestamp",
+                cast("ColumnOrName", date),
+                cast("ColumnOrName", time),
+                cast("ColumnOrName", timezone),
+            )
+        else:
+            return _invoke_function_over_columns(
+                "try_make_timestamp", cast("ColumnOrName", date), cast("ColumnOrName", time)
+            )
 
 
 try_make_timestamp.__doc__ = pysparkfuncs.try_make_timestamp.__doc__
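As an aside, a minimal sketch of the mutual-exclusion check introduced above (assumes an active SparkSession `spark`; the column names are illustrative): mixing the component arguments with date/time raises PySparkValueError with error class CANNOT_SET_TOGETHER before any query runs.

from datetime import date, time
import pyspark.sql.functions as sf
from pyspark.errors import PySparkValueError

df = spark.range(1).select(
    sf.lit(2014).alias("y"),
    sf.lit(date(2014, 12, 28)).alias("d"),
    sf.lit(time(6, 30, 45)).alias("t"),
)
try:
    # Component argument (years) mixed with the date/time pair: rejected client-side.
    df.select(sf.try_make_timestamp(years=df.y, date=df.d, time=df.t))
except PySparkValueError as e:
    print(e)  # expected to report CANNOT_SET_TOGETHER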

python/pyspark/sql/functions/builtin.py

Lines changed: 156 additions & 25 deletions
@@ -24888,42 +24888,105 @@ def make_timestamp(
     )
 
 
-@_try_remote_functions
+@overload
 def try_make_timestamp(
     years: "ColumnOrName",
     months: "ColumnOrName",
     days: "ColumnOrName",
     hours: "ColumnOrName",
     mins: "ColumnOrName",
     secs: "ColumnOrName",
+) -> Column:
+    ...
+
+
+@overload
+def try_make_timestamp(
+    years: "ColumnOrName",
+    months: "ColumnOrName",
+    days: "ColumnOrName",
+    hours: "ColumnOrName",
+    mins: "ColumnOrName",
+    secs: "ColumnOrName",
+    timezone: "ColumnOrName",
+) -> Column:
+    ...
+
+
+@overload
+def try_make_timestamp(*, date: "ColumnOrName", time: "ColumnOrName") -> Column:
+    ...
+
+
+@overload
+def try_make_timestamp(
+    *, date: "ColumnOrName", time: "ColumnOrName", timezone: "ColumnOrName"
+) -> Column:
+    ...
+
+
+@_try_remote_functions
+def try_make_timestamp(
+    years: Optional["ColumnOrName"] = None,
+    months: Optional["ColumnOrName"] = None,
+    days: Optional["ColumnOrName"] = None,
+    hours: Optional["ColumnOrName"] = None,
+    mins: Optional["ColumnOrName"] = None,
+    secs: Optional["ColumnOrName"] = None,
     timezone: Optional["ColumnOrName"] = None,
+    date: Optional["ColumnOrName"] = None,
+    time: Optional["ColumnOrName"] = None,
 ) -> Column:
     """
-    Try to create timestamp from years, months, days, hours, mins, secs and timezone fields.
+    Try to create timestamp from years, months, days, hours, mins, secs and (optional) timezone
+    fields. Alternatively, try to create timestamp from date, time, and (optional) timezone fields.
     The result data type is consistent with the value of configuration `spark.sql.timestampType`.
     The function returns NULL on invalid inputs.
 
     .. versionadded:: 4.0.0
 
+    .. versionchanged:: 4.1.0
+        Added support for creating timestamps from date and time.
+
     Parameters
     ----------
-    years : :class:`~pyspark.sql.Column` or column name
-        The year to represent, from 1 to 9999
-    months : :class:`~pyspark.sql.Column` or column name
-        The month-of-year to represent, from 1 (January) to 12 (December)
-    days : :class:`~pyspark.sql.Column` or column name
-        The day-of-month to represent, from 1 to 31
-    hours : :class:`~pyspark.sql.Column` or column name
-        The hour-of-day to represent, from 0 to 23
-    mins : :class:`~pyspark.sql.Column` or column name
-        The minute-of-hour to represent, from 0 to 59
-    secs : :class:`~pyspark.sql.Column` or column name
+    years : :class:`~pyspark.sql.Column` or column name, optional
+        The year to represent, from 1 to 9999.
+        Required when creating timestamps from individual components.
+        Must be used with months, days, hours, mins, and secs.
+    months : :class:`~pyspark.sql.Column` or column name, optional
+        The month-of-year to represent, from 1 (January) to 12 (December).
+        Required when creating timestamps from individual components.
+        Must be used with years, days, hours, mins, and secs.
+    days : :class:`~pyspark.sql.Column` or column name, optional
+        The day-of-month to represent, from 1 to 31.
+        Required when creating timestamps from individual components.
+        Must be used with years, months, hours, mins, and secs.
+    hours : :class:`~pyspark.sql.Column` or column name, optional
+        The hour-of-day to represent, from 0 to 23.
+        Required when creating timestamps from individual components.
+        Must be used with years, months, days, mins, and secs.
+    mins : :class:`~pyspark.sql.Column` or column name, optional
+        The minute-of-hour to represent, from 0 to 59.
+        Required when creating timestamps from individual components.
+        Must be used with years, months, days, hours, and secs.
+    secs : :class:`~pyspark.sql.Column` or column name, optional
         The second-of-minute and its micro-fraction to represent, from 0 to 60.
-        The value can be either an integer like 13 , or a fraction like 13.123.
+        The value can be either an integer like 13, or a fraction like 13.123.
         If the sec argument equals to 60, the seconds field is set
         to 0 and 1 minute is added to the final timestamp.
+        Required when creating timestamps from individual components.
+        Must be used with years, months, days, hours, and mins.
     timezone : :class:`~pyspark.sql.Column` or column name, optional
-        The time zone identifier. For example, CET, UTC and etc.
+        The time zone identifier. For example, CET, UTC, and etc.
+    date : :class:`~pyspark.sql.Column` or column name, optional
+        The date to represent, in valid DATE format.
+        Required when creating timestamps from date and time components.
+        Must be used with time parameter only.
+    time : :class:`~pyspark.sql.Column` or column name, optional
+        The time to represent, in valid TIME format.
+        Required when creating timestamps from date and time components.
+        Must be used with date parameter only.
 
     Returns
     -------
@@ -24945,7 +25008,7 @@ def try_make_timestamp(
     --------
     >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
 
-    Example 1: Make timestamp from years, months, days, hours, mins and secs.
+    Example 1: Make timestamp from years, months, days, hours, mins, secs, and timezone.
 
     >>> import pyspark.sql.functions as sf
     >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']],
@@ -24959,7 +25022,7 @@ def try_make_timestamp(
     |2014-12-27 21:30:45.887 |
     +----------------------------------------------------+
 
-    Example 2: Make timestamp without timezone.
+    Example 2: Make timestamp from years, months, days, hours, mins, and secs (without timezone).
 
     >>> import pyspark.sql.functions as sf
     >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']],
@@ -24972,7 +25035,6 @@ def try_make_timestamp(
     +----------------------------------------------------+
     |2014-12-28 06:30:45.887 |
     +----------------------------------------------------+
-    >>> spark.conf.unset("spark.sql.session.timeZone")
 
     Example 3: Make timestamp with invalid input.
 
@@ -24988,16 +25050,85 @@ def try_make_timestamp(
     |NULL |
     +----------------------------------------------------+
 
+    Example 4: Make timestamp from date, time, and timezone.
+
+    >>> import pyspark.sql.functions as sf
+    >>> from datetime import date, time
+    >>> df = spark.range(1).select(
+    ...     sf.lit(date(2014, 12, 28)).alias("date"),
+    ...     sf.lit(time(6, 30, 45, 887000)).alias("time"),
+    ...     sf.lit("CET").alias("tz")
+    ... )
+    >>> df.select(
+    ...     sf.try_make_timestamp(date=df.date, time=df.time, timezone=df.tz)
+    ... ).show(truncate=False)
+    +----------------------------------+
+    |try_make_timestamp(date, time, tz)|
+    +----------------------------------+
+    |2014-12-27 21:30:45.887 |
+    +----------------------------------+
+
+    Example 5: Make timestamp from date and time (without timezone).
+
+    >>> import pyspark.sql.functions as sf
+    >>> from datetime import date, time
+    >>> df = spark.range(1).select(
+    ...     sf.lit(date(2014, 12, 28)).alias("date"),
+    ...     sf.lit(time(6, 30, 45, 887000)).alias("time")
+    ... )
+    >>> df.select(sf.try_make_timestamp(date=df.date, time=df.time)).show(truncate=False)
+    +------------------------------+
+    |try_make_timestamp(date, time)|
+    +------------------------------+
+    |2014-12-28 06:30:45.887 |
+    +------------------------------+
+
     >>> spark.conf.unset("spark.sql.session.timeZone")
     """
-    if timezone is not None:
-        return _invoke_function_over_columns(
-            "try_make_timestamp", years, months, days, hours, mins, secs, timezone
-        )
+    if years is not None:
+        if any(arg is not None for arg in [date, time]):
+            raise PySparkValueError(
+                errorClass="CANNOT_SET_TOGETHER",
+                messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
+            )
+        if timezone is not None:
+            return _invoke_function_over_columns(
+                "try_make_timestamp",
+                cast("ColumnOrName", years),
+                cast("ColumnOrName", months),
+                cast("ColumnOrName", days),
+                cast("ColumnOrName", hours),
+                cast("ColumnOrName", mins),
+                cast("ColumnOrName", secs),
+                cast("ColumnOrName", timezone),
+            )
+        else:
+            return _invoke_function_over_columns(
+                "try_make_timestamp",
+                cast("ColumnOrName", years),
+                cast("ColumnOrName", months),
+                cast("ColumnOrName", days),
+                cast("ColumnOrName", hours),
+                cast("ColumnOrName", mins),
+                cast("ColumnOrName", secs),
+            )
     else:
-        return _invoke_function_over_columns(
-            "try_make_timestamp", years, months, days, hours, mins, secs
-        )
+        if any(arg is not None for arg in [years, months, days, hours, mins, secs]):
+            raise PySparkValueError(
+                errorClass="CANNOT_SET_TOGETHER",
+                messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
+            )
+        if timezone is not None:
+            return _invoke_function_over_columns(
+                "try_make_timestamp",
+                cast("ColumnOrName", date),
+                cast("ColumnOrName", time),
+                cast("ColumnOrName", timezone),
+            )
+        else:
+            return _invoke_function_over_columns(
+                "try_make_timestamp", cast("ColumnOrName", date), cast("ColumnOrName", time)
+            )
 
 
 @_try_remote_functions
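One usage note implied by the overloads above: the date/time form is keyword-only (the bare `*` in the signatures), so those arguments are passed by name; both Column objects and column-name strings are accepted, since the parameters are typed ColumnOrName. A minimal sketch, assuming an active SparkSession `spark`:

from datetime import date, time
import pyspark.sql.functions as sf

df = spark.range(1).select(
    sf.lit(date(2014, 12, 28)).alias("d"),
    sf.lit(time(6, 30, 45)).alias("t"),
)
# date/time must be keyword arguments; plain column names work as well as Column objects.
df.select(sf.try_make_timestamp(date="d", time="t")).show(truncate=False)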
