Skip to content

Commit 0e10341

Browse files
uros-db and zhengruifeng
authored and committed
[SPARK-53930][PYTHON] Support TIME in the make_timestamp function in PySpark
### What changes were proposed in this pull request?
Implement the support for TIME type in `make_timestamp` function in PySpark API.

### Why are the changes needed?
Expand API support for the `MakeTimestamp` expression.

### Does this PR introduce _any_ user-facing change?
Yes, the new function is now available in PySpark API.

### How was this patch tested?
Added appropriate Python functions tests and examples.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #52648 from uros-db/python-try_make_timestamp.

Authored-by: Uros Bojanic <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
1 parent a6d17f7 commit 0e10341

File tree

3 files changed

+479
-33
lines changed

3 files changed

+479
-33
lines changed

python/pyspark/sql/connect/functions/builtin.py

Lines changed: 82 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3945,23 +3945,98 @@ def make_time(hour: "ColumnOrName", minute: "ColumnOrName", second: "ColumnOrNam
39453945
make_time.__doc__ = pysparkfuncs.make_time.__doc__
39463946

39473947

3948+
@overload
def make_timestamp(
    years: "ColumnOrName",
    months: "ColumnOrName",
    days: "ColumnOrName",
    hours: "ColumnOrName",
    mins: "ColumnOrName",
    secs: "ColumnOrName",
) -> Column:
    ...


@overload
def make_timestamp(
    years: "ColumnOrName",
    months: "ColumnOrName",
    days: "ColumnOrName",
    hours: "ColumnOrName",
    mins: "ColumnOrName",
    secs: "ColumnOrName",
    timezone: "ColumnOrName",
) -> Column:
    ...


@overload
def make_timestamp(*, date: "ColumnOrName", time: "ColumnOrName") -> Column:
    ...


@overload
def make_timestamp(
    *, date: "ColumnOrName", time: "ColumnOrName", timezone: "ColumnOrName"
) -> Column:
    ...


def make_timestamp(
    years: Optional["ColumnOrName"] = None,
    months: Optional["ColumnOrName"] = None,
    days: Optional["ColumnOrName"] = None,
    hours: Optional["ColumnOrName"] = None,
    mins: Optional["ColumnOrName"] = None,
    secs: Optional["ColumnOrName"] = None,
    timezone: Optional["ColumnOrName"] = None,
    date: Optional["ColumnOrName"] = None,
    time: Optional["ColumnOrName"] = None,
) -> Column:
    # Two mutually exclusive calling conventions are supported:
    #   1. the six individual component columns (years..secs), or
    #   2. a (date, time) column pair.
    # An optional timezone column may accompany either form.
    def _mixed_form_error() -> PySparkValueError:
        # Raised when arguments from both calling conventions are mixed.
        return PySparkValueError(
            errorClass="CANNOT_SET_TOGETHER",
            messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
        )

    if years is not None:
        # Component form: no date/time arguments may leak in.
        if date is not None or time is not None:
            raise _mixed_form_error()
        columns = [years, months, days, hours, mins, secs]
    else:
        # Date/time form: no component arguments may leak in.
        if any(c is not None for c in (years, months, days, hours, mins, secs)):
            raise _mixed_form_error()
        columns = [date, time]
    if timezone is not None:
        columns.append(timezone)
    # The casts only narrow Optional["ColumnOrName"] for the type checker;
    # the validation above guarantees the selected columns are set by the caller.
    return _invoke_function_over_columns(
        "make_timestamp", *[cast("ColumnOrName", c) for c in columns]
    )


make_timestamp.__doc__ = pysparkfuncs.make_timestamp.__doc__

python/pyspark/sql/functions/builtin.py

Lines changed: 158 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -24790,43 +24790,106 @@ def make_time(hour: "ColumnOrName", minute: "ColumnOrName", second: "ColumnOrNam
2479024790
return _invoke_function_over_columns("make_time", hour, minute, second)
2479124791

2479224792

24793-
@_try_remote_functions
24793+
@overload
2479424794
def make_timestamp(
2479524795
years: "ColumnOrName",
2479624796
months: "ColumnOrName",
2479724797
days: "ColumnOrName",
2479824798
hours: "ColumnOrName",
2479924799
mins: "ColumnOrName",
2480024800
secs: "ColumnOrName",
24801+
) -> Column:
24802+
...
24803+
24804+
24805+
@overload
24806+
def make_timestamp(
24807+
years: "ColumnOrName",
24808+
months: "ColumnOrName",
24809+
days: "ColumnOrName",
24810+
hours: "ColumnOrName",
24811+
mins: "ColumnOrName",
24812+
secs: "ColumnOrName",
24813+
timezone: "ColumnOrName",
24814+
) -> Column:
24815+
...
24816+
24817+
24818+
@overload
24819+
def make_timestamp(*, date: "ColumnOrName", time: "ColumnOrName") -> Column:
24820+
...
24821+
24822+
24823+
@overload
24824+
def make_timestamp(
24825+
*, date: "ColumnOrName", time: "ColumnOrName", timezone: "ColumnOrName"
24826+
) -> Column:
24827+
...
24828+
24829+
24830+
@_try_remote_functions
24831+
def make_timestamp(
24832+
years: Optional["ColumnOrName"] = None,
24833+
months: Optional["ColumnOrName"] = None,
24834+
days: Optional["ColumnOrName"] = None,
24835+
hours: Optional["ColumnOrName"] = None,
24836+
mins: Optional["ColumnOrName"] = None,
24837+
secs: Optional["ColumnOrName"] = None,
2480124838
timezone: Optional["ColumnOrName"] = None,
24839+
date: Optional["ColumnOrName"] = None,
24840+
time: Optional["ColumnOrName"] = None,
2480224841
) -> Column:
2480324842
"""
24804-
Create timestamp from years, months, days, hours, mins, secs and timezone fields.
24843+
Create timestamp from years, months, days, hours, mins, secs, and (optional) timezone fields.
24844+
Alternatively, create timestamp from date, time, and (optional) timezone fields.
2480524845
The result data type is consistent with the value of configuration `spark.sql.timestampType`.
2480624846
If the configuration `spark.sql.ansi.enabled` is false, the function returns NULL
2480724847
on invalid inputs. Otherwise, it will throw an error instead.
2480824848

2480924849
.. versionadded:: 3.5.0
2481024850

24851+
.. versionchanged:: 4.1.0
24852+
Added support for creating timestamps from date and time.
24853+
2481124854
Parameters
2481224855
----------
24813-
years : :class:`~pyspark.sql.Column` or column name
24814-
The year to represent, from 1 to 9999
24815-
months : :class:`~pyspark.sql.Column` or column name
24816-
The month-of-year to represent, from 1 (January) to 12 (December)
24817-
days : :class:`~pyspark.sql.Column` or column name
24818-
The day-of-month to represent, from 1 to 31
24819-
hours : :class:`~pyspark.sql.Column` or column name
24820-
The hour-of-day to represent, from 0 to 23
24821-
mins : :class:`~pyspark.sql.Column` or column name
24822-
The minute-of-hour to represent, from 0 to 59
24823-
secs : :class:`~pyspark.sql.Column` or column name
24856+
years : :class:`~pyspark.sql.Column` or column name, optional
24857+
The year to represent, from 1 to 9999.
24858+
Required when creating timestamps from individual components.
24859+
Must be used with months, days, hours, mins, and secs.
24860+
months : :class:`~pyspark.sql.Column` or column name, optional
24861+
The month-of-year to represent, from 1 (January) to 12 (December).
24862+
Required when creating timestamps from individual components.
24863+
Must be used with years, days, hours, mins, and secs.
24864+
days : :class:`~pyspark.sql.Column` or column name, optional
24865+
The day-of-month to represent, from 1 to 31.
24866+
Required when creating timestamps from individual components.
24867+
Must be used with years, months, hours, mins, and secs.
24868+
hours : :class:`~pyspark.sql.Column` or column name, optional
24869+
The hour-of-day to represent, from 0 to 23.
24870+
Required when creating timestamps from individual components.
24871+
Must be used with years, months, days, mins, and secs.
24872+
mins : :class:`~pyspark.sql.Column` or column name, optional
24873+
The minute-of-hour to represent, from 0 to 59.
24874+
Required when creating timestamps from individual components.
24875+
Must be used with years, months, days, hours, and secs.
24876+
secs : :class:`~pyspark.sql.Column` or column name, optional
2482424877
The second-of-minute and its micro-fraction to represent, from 0 to 60.
24825-
The value can be either an integer like 13 , or a fraction like 13.123.
24878+
The value can be either an integer like 13, or a fraction like 13.123.
2482624879
If the sec argument equals to 60, the seconds field is set
2482724880
to 0 and 1 minute is added to the final timestamp.
24881+
Required when creating timestamps from individual components.
24882+
Must be used with years, months, days, hours, and mins.
2482824883
timezone : :class:`~pyspark.sql.Column` or column name, optional
24829-
The time zone identifier. For example, CET, UTC and etc.
24884+
The time zone identifier. For example, CET, UTC, and etc.
24885+
date : :class:`~pyspark.sql.Column` or column name, optional
24886+
The date to represent, in valid DATE format.
24887+
Required when creating timestamps from date and time components.
24888+
Must be used with time parameter only.
24889+
time : :class:`~pyspark.sql.Column` or column name, optional
24890+
The time to represent, in valid TIME format.
24891+
Required when creating timestamps from date and time components.
24892+
Must be used with date parameter only.
2483024893

2483124894
Returns
2483224895
-------
@@ -24848,7 +24911,7 @@ def make_timestamp(
2484824911
--------
2484924912
>>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
2485024913

24851-
Example 1: Make timestamp from years, months, days, hours, mins and secs.
24914+
Example 1: Make timestamp from years, months, days, hours, mins, secs, and timezone.
2485224915

2485324916
>>> import pyspark.sql.functions as sf
2485424917
>>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']],
@@ -24862,11 +24925,11 @@ def make_timestamp(
2486224925
|2014-12-27 21:30:45.887 |
2486324926
+----------------------------------------------------+
2486424927

24865-
Example 2: Make timestamp without timezone.
24928+
Example 2: Make timestamp from years, months, days, hours, mins, and secs (without timezone).
2486624929

2486724930
>>> import pyspark.sql.functions as sf
24868-
>>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']],
24869-
... ['year', 'month', 'day', 'hour', 'min', 'sec', 'tz'])
24931+
>>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887]],
24932+
... ['year', 'month', 'day', 'hour', 'min', 'sec'])
2487024933
>>> df.select(
2487124934
... sf.make_timestamp(df.year, df.month, df.day, 'hour', df.min, df.sec)
2487224935
... ).show(truncate=False)
@@ -24876,16 +24939,85 @@ def make_timestamp(
2487624939
|2014-12-28 06:30:45.887 |
2487724940
+------------------------------------------------+
2487824941

24942+
Example 3: Make timestamp from date, time, and timezone.
24943+
24944+
>>> import pyspark.sql.functions as sf
24945+
>>> from datetime import date, time
24946+
>>> df = spark.range(1).select(
24947+
... sf.lit(date(2014, 12, 28)).alias("date"),
24948+
... sf.lit(time(6, 30, 45, 887000)).alias("time"),
24949+
... sf.lit("CET").alias("tz")
24950+
... )
24951+
>>> df.select(
24952+
... sf.make_timestamp(date=df.date, time=df.time, timezone=df.tz)
24953+
... ).show(truncate=False)
24954+
+------------------------------+
24955+
|make_timestamp(date, time, tz)|
24956+
+------------------------------+
24957+
|2014-12-27 21:30:45.887 |
24958+
+------------------------------+
24959+
24960+
Example 4: Make timestamp from date and time (without timezone).
24961+
24962+
>>> import pyspark.sql.functions as sf
24963+
>>> from datetime import date, time
24964+
>>> df = spark.range(1).select(
24965+
... sf.lit(date(2014, 12, 28)).alias("date"),
24966+
... sf.lit(time(6, 30, 45, 887000)).alias("time")
24967+
... )
24968+
>>> df.select(sf.make_timestamp(date=df.date, time=df.time)).show(truncate=False)
24969+
+--------------------------+
24970+
|make_timestamp(date, time)|
24971+
+--------------------------+
24972+
|2014-12-28 06:30:45.887 |
24973+
+--------------------------+
24974+
2487924975
>>> spark.conf.unset("spark.sql.session.timeZone")
2488024976
"""
24881-
if timezone is not None:
24882-
return _invoke_function_over_columns(
24883-
"make_timestamp", years, months, days, hours, mins, secs, timezone
24884-
)
24977+
if years is not None:
24978+
if any(arg is not None for arg in [date, time]):
24979+
raise PySparkValueError(
24980+
errorClass="CANNOT_SET_TOGETHER",
24981+
messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
24982+
)
24983+
if timezone is not None:
24984+
return _invoke_function_over_columns(
24985+
"make_timestamp",
24986+
cast("ColumnOrName", years),
24987+
cast("ColumnOrName", months),
24988+
cast("ColumnOrName", days),
24989+
cast("ColumnOrName", hours),
24990+
cast("ColumnOrName", mins),
24991+
cast("ColumnOrName", secs),
24992+
cast("ColumnOrName", timezone),
24993+
)
24994+
else:
24995+
return _invoke_function_over_columns(
24996+
"make_timestamp",
24997+
cast("ColumnOrName", years),
24998+
cast("ColumnOrName", months),
24999+
cast("ColumnOrName", days),
25000+
cast("ColumnOrName", hours),
25001+
cast("ColumnOrName", mins),
25002+
cast("ColumnOrName", secs),
25003+
)
2488525004
else:
24886-
return _invoke_function_over_columns(
24887-
"make_timestamp", years, months, days, hours, mins, secs
24888-
)
25005+
if any(arg is not None for arg in [years, months, days, hours, mins, secs]):
25006+
raise PySparkValueError(
25007+
errorClass="CANNOT_SET_TOGETHER",
25008+
messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
25009+
)
25010+
if timezone is not None:
25011+
return _invoke_function_over_columns(
25012+
"make_timestamp",
25013+
cast("ColumnOrName", date),
25014+
cast("ColumnOrName", time),
25015+
cast("ColumnOrName", timezone),
25016+
)
25017+
else:
25018+
return _invoke_function_over_columns(
25019+
"make_timestamp", cast("ColumnOrName", date), cast("ColumnOrName", time)
25020+
)
2488925021

2489025022

2489125023
@overload

0 commit comments

Comments
 (0)