Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit adf9889

Browse files
junyazhang, gcf-owl-bot[bot], and TrevorBergeron
authored May 14, 2024
feat: to_datetime supports utc=False for string inputs (#579)
* feat: to_datetime supports utc=False for string inputs
* 🦉 Updates from OwlBot post-processor
  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
* add unit tests for timestamp inputs
* 🦉 Updates from OwlBot post-processor
  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
* simplify conversion ops
* address comments
* fix failed presubmit test

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
Co-authored-by: Trevor Bergeron <tbergeron@google.com>
1 parent cb36e46 commit adf9889

File tree

6 files changed

+217
-28
lines changed

6 files changed

+217
-28
lines changed
 

‎bigframes/core/compile/scalar_op_compiler.py

+28-11
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import functools
1818
import typing
1919

20+
import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops
2021
import ibis
2122
import ibis.common.exceptions
2223
import ibis.expr.datatypes as ibis_dtypes
@@ -737,7 +738,7 @@ def struct_field_op_impl(x: ibis_types.Value, op: ops.StructFieldOp):
737738
return struct_value[name].name(name)
738739

739740

740-
def numeric_to_datatime(x: ibis_types.Value, unit: str) -> ibis_types.TimestampValue:
741+
def numeric_to_datetime(x: ibis_types.Value, unit: str) -> ibis_types.TimestampValue:
741742
if not isinstance(x, ibis_types.IntegerValue) and not isinstance(
742743
x, ibis_types.FloatingValue
743744
):
@@ -779,7 +780,7 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp):
779780
# with pandas converting int64[pyarrow] to timestamp[us][pyarrow],
780781
# timestamp[us, tz=UTC][pyarrow], and time64[us][pyarrow].
781782
unit = "us"
782-
x_converted = numeric_to_datatime(x, unit)
783+
x_converted = numeric_to_datetime(x, unit)
783784
if to_type == ibis_dtypes.timestamp:
784785
return x_converted.cast(ibis_dtypes.Timestamp())
785786
elif to_type == ibis_dtypes.Timestamp(timezone="UTC"):
@@ -818,23 +819,39 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp):
818819
@scalar_op_compiler.register_unary_op(ops.ToDatetimeOp, pass_op=True)
819820
def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp):
820821
if x.type() == ibis_dtypes.str:
821-
x = x.to_timestamp(op.format) if op.format else timestamp(x)
822-
elif x.type() == ibis_dtypes.Timestamp(timezone="UTC"):
822+
return vendored_ibis_ops.SafeCastToDatetime(x).to_expr()
823+
else:
824+
# Numerical inputs.
823825
if op.format:
824-
raise NotImplementedError(
825-
f"Format parameter is not supported for Timestamp input types. {constants.FEEDBACK_LINK}"
826-
)
827-
return x
828-
elif x.type() != ibis_dtypes.timestamp:
826+
x = x.cast(ibis_dtypes.str).to_timestamp(op.format)
827+
else:
828+
# The default unit is set to "ns" (nanoseconds) for consistency
829+
# with pandas, where "ns" is the default unit for datetime operations.
830+
unit = op.unit or "ns"
831+
x = numeric_to_datetime(x, unit)
832+
833+
return x.cast(ibis_dtypes.Timestamp(None))
834+
835+
836+
@scalar_op_compiler.register_unary_op(ops.ToTimestampOp, pass_op=True)
837+
def to_timestamp_op_impl(x: ibis_types.Value, op: ops.ToTimestampOp):
838+
if x.type() == ibis_dtypes.str:
839+
x = (
840+
typing.cast(ibis_types.StringValue, x).to_timestamp(op.format)
841+
if op.format
842+
else timestamp(x)
843+
)
844+
else:
845+
# Numerical inputs.
829846
if op.format:
830847
x = x.cast(ibis_dtypes.str).to_timestamp(op.format)
831848
else:
832849
# The default unit is set to "ns" (nanoseconds) for consistency
833850
# with pandas, where "ns" is the default unit for datetime operations.
834851
unit = op.unit or "ns"
835-
x = numeric_to_datatime(x, unit)
852+
x = numeric_to_datetime(x, unit)
836853

837-
return x.cast(ibis_dtypes.Timestamp(timezone="UTC" if op.utc else None))
854+
return x.cast(ibis_dtypes.Timestamp(timezone="UTC"))
838855

839856

840857
@scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True)

‎bigframes/core/tools/datetimes.py

+58-14
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
import bigframes.constants as constants
2323
import bigframes.dataframe
24+
import bigframes.dtypes
2425
import bigframes.operations as ops
2526
import bigframes.series
2627

@@ -51,25 +52,68 @@ def to_datetime(
5152
f"to datetime is not implemented. {constants.FEEDBACK_LINK}"
5253
)
5354

54-
arg = bigframes.series.Series(arg)
55+
arg = bigframes.series.Series(arg)._cached()
5556

56-
if not utc and arg.dtype not in ("Int64", "Float64"): # type: ignore
57-
raise NotImplementedError(
58-
f"String and Timestamp requires utc=True. {constants.FEEDBACK_LINK}"
59-
)
60-
61-
if format and unit and arg.dtype in ("Int64", "Float64"): # type: ignore
57+
if format and unit and arg.dtype in (bigframes.dtypes.INT_DTYPE, bigframes.dtypes.FLOAT_DTYPE): # type: ignore
6258
raise ValueError("cannot specify both format and unit")
6359

64-
if unit and arg.dtype not in ("Int64", "Float64"): # type: ignore
60+
if unit and arg.dtype not in (bigframes.dtypes.INT_DTYPE, bigframes.dtypes.FLOAT_DTYPE): # type: ignore
6561
raise NotImplementedError(
6662
f"Unit parameter is not supported for non-numerical input types. {constants.FEEDBACK_LINK}"
6763
)
6864

69-
return arg._apply_unary_op( # type: ignore
70-
ops.ToDatetimeOp(
71-
utc=utc,
72-
format=format,
73-
unit=unit,
65+
if arg.dtype in (bigframes.dtypes.TIMESTAMP_DTYPE, bigframes.dtypes.DATETIME_DTYPE):
66+
to_type = (
67+
bigframes.dtypes.TIMESTAMP_DTYPE if utc else bigframes.dtypes.DATETIME_DTYPE
68+
)
69+
return arg._apply_unary_op(ops.AsTypeOp(to_type=to_type)) # type: ignore
70+
if (not utc) and arg.dtype == bigframes.dtypes.STRING_DTYPE:
71+
if format:
72+
raise NotImplementedError(
73+
f"Customized formats are not supported for string inputs when utc=False. Please set utc=True if possible. {constants.FEEDBACK_LINK}"
74+
)
75+
76+
assert unit is None
77+
as_datetime = arg._apply_unary_op( # type: ignore
78+
ops.ToDatetimeOp(
79+
format=format,
80+
unit=unit,
81+
)
82+
)
83+
failed_datetime_cast = arg.notnull() & as_datetime.isnull()
84+
is_utc = arg._apply_unary_op(
85+
ops.EndsWithOp(
86+
pat=("Z", "-00:00", "+00:00", "-0000", "+0000", "-00", "+00")
87+
)
88+
)
89+
90+
# Cast to DATETIME shall succeed if all inputs are tz-naive.
91+
if not failed_datetime_cast.any():
92+
return as_datetime
93+
94+
if is_utc.all():
95+
return arg._apply_unary_op( # type: ignore
96+
ops.ToTimestampOp(
97+
format=format,
98+
unit=unit,
99+
)
100+
)
101+
102+
raise NotImplementedError(
103+
f"Non-UTC string inputs are not supported when utc=False. Please set utc=True if possible. {constants.FEEDBACK_LINK}"
104+
)
105+
# If utc:
106+
elif utc:
107+
return arg._apply_unary_op( # type: ignore
108+
ops.ToTimestampOp(
109+
format=format,
110+
unit=unit,
111+
)
112+
)
113+
else:
114+
return arg._apply_unary_op( # type: ignore
115+
ops.ToDatetimeOp(
116+
format=format,
117+
unit=unit,
118+
)
74119
)
75-
)

‎bigframes/operations/__init__.py

+25-3
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import pandas as pd
2424
import pyarrow as pa
2525

26+
import bigframes.dtypes
2627
import bigframes.dtypes as dtypes
2728
import bigframes.operations.type as op_typing
2829

@@ -527,13 +528,34 @@ def output_type(self, *input_types):
527528
@dataclasses.dataclass(frozen=True)
528529
class ToDatetimeOp(UnaryOp):
529530
name: typing.ClassVar[str] = "to_datetime"
530-
utc: bool = False
531531
format: typing.Optional[str] = None
532532
unit: typing.Optional[str] = None
533533

534534
def output_type(self, *input_types):
535-
timezone = "UTC" if self.utc else None
536-
return pd.ArrowDtype(pa.timestamp("us", tz=timezone))
535+
if input_types[0] not in (
536+
bigframes.dtypes.FLOAT_DTYPE,
537+
bigframes.dtypes.INT_DTYPE,
538+
bigframes.dtypes.STRING_DTYPE,
539+
):
540+
raise TypeError("expected string or numeric input")
541+
return pd.ArrowDtype(pa.timestamp("us", tz=None))
542+
543+
544+
@dataclasses.dataclass(frozen=True)
545+
class ToTimestampOp(UnaryOp):
546+
name: typing.ClassVar[str] = "to_timestamp"
547+
format: typing.Optional[str] = None
548+
unit: typing.Optional[str] = None
549+
550+
def output_type(self, *input_types):
551+
# Must be numeric or string
552+
if input_types[0] not in (
553+
bigframes.dtypes.FLOAT_DTYPE,
554+
bigframes.dtypes.INT_DTYPE,
555+
bigframes.dtypes.STRING_DTYPE,
556+
):
557+
raise TypeError("expected string or numeric input")
558+
return pd.ArrowDtype(pa.timestamp("us", tz="UTC"))
537559

538560

539561
@dataclasses.dataclass(frozen=True)

‎tests/system/small/test_pandas.py

+96
Original file line numberDiff line numberDiff line change
@@ -634,3 +634,99 @@ def test_to_datetime_format_param(arg, utc, format):
634634
pd.testing.assert_series_equal(
635635
bf_result, pd_result, check_index_type=False, check_names=False
636636
)
637+
638+
639+
@pytest.mark.parametrize(
640+
("arg", "utc", "output_in_utc", "format"),
641+
[
642+
(
643+
["2014-08-15 08:15:12", "2011-08-15 08:15:12", "2015-08-15 08:15:12"],
644+
False,
645+
False,
646+
None,
647+
),
648+
(
649+
[
650+
"2008-12-25 05:30:00Z",
651+
"2008-12-25 05:30:00-00:00",
652+
"2008-12-25 05:30:00+00:00",
653+
"2008-12-25 05:30:00-0000",
654+
"2008-12-25 05:30:00+0000",
655+
"2008-12-25 05:30:00-00",
656+
"2008-12-25 05:30:00+00",
657+
],
658+
False,
659+
True,
660+
None,
661+
),
662+
(
663+
["2014-08-15 08:15:12", "2011-08-15 08:15:12", "2015-08-15 08:15:12"],
664+
True,
665+
True,
666+
"%Y-%m-%d %H:%M:%S",
667+
),
668+
(
669+
[
670+
"2014-08-15 08:15:12+05:00",
671+
"2011-08-15 08:15:12+05:00",
672+
"2015-08-15 08:15:12+05:00",
673+
],
674+
True,
675+
True,
676+
None,
677+
),
678+
],
679+
)
680+
def test_to_datetime_string_inputs(arg, utc, output_in_utc, format):
681+
bf_result = (
682+
bpd.to_datetime(arg, utc=utc, format=format)
683+
.to_pandas()
684+
.astype("datetime64[ns, UTC]" if output_in_utc else "datetime64[ns]")
685+
)
686+
pd_result = pd.Series(pd.to_datetime(arg, utc=utc, format=format)).dt.floor("us")
687+
pd.testing.assert_series_equal(
688+
bf_result, pd_result, check_index_type=False, check_names=False
689+
)
690+
691+
692+
@pytest.mark.parametrize(
693+
("arg", "utc", "output_in_utc"),
694+
[
695+
(
696+
[datetime(2023, 1, 1, 12, 0), datetime(2023, 2, 1, 12, 0)],
697+
False,
698+
False,
699+
),
700+
(
701+
[datetime(2023, 1, 1, 12, 0), datetime(2023, 2, 1, 12, 0)],
702+
True,
703+
True,
704+
),
705+
(
706+
[
707+
datetime(2023, 1, 1, 12, 0, tzinfo=pytz.timezone("UTC")),
708+
datetime(2023, 1, 1, 12, 0, tzinfo=pytz.timezone("UTC")),
709+
],
710+
True,
711+
True,
712+
),
713+
(
714+
[
715+
datetime(2023, 1, 1, 12, 0, tzinfo=pytz.timezone("America/New_York")),
716+
datetime(2023, 1, 1, 12, 0, tzinfo=pytz.timezone("UTC")),
717+
],
718+
True,
719+
True,
720+
),
721+
],
722+
)
723+
def test_to_datetime_timestamp_inputs(arg, utc, output_in_utc):
724+
bf_result = (
725+
bpd.to_datetime(arg, utc=utc)
726+
.to_pandas()
727+
.astype("datetime64[ns, UTC]" if output_in_utc else "datetime64[ns]")
728+
)
729+
pd_result = pd.Series(pd.to_datetime(arg, utc=utc)).dt.floor("us")
730+
pd.testing.assert_series_equal(
731+
bf_result, pd_result, check_index_type=False, check_names=False
732+
)

‎third_party/bigframes_vendored/ibis/backends/bigquery/registry.py

+6
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,11 @@ def _generate_array(translator, op: vendored_ibis_ops.GenerateArray):
3232
return f"GENERATE_ARRAY(0, {arg})"
3333

3434

35+
def _safe_cast_to_datetime(translator, op: vendored_ibis_ops.SafeCastToDatetime):
36+
arg = translator.translate(op.arg)
37+
return f"SAFE_CAST({arg} AS DATETIME)"
38+
39+
3540
def _quantile(translator, op: ibis_reductions.Quantile):
3641
arg = translator.translate(op.arg)
3742
quantile = translator.translate(op.quantile)
@@ -44,6 +49,7 @@ def _quantile(translator, op: ibis_reductions.Quantile):
4449
vendored_ibis_ops.LastNonNullValue: _last_non_null_value, # type:ignore
4550
vendored_ibis_ops.ToJsonString: _to_json_string, # type:ignore
4651
vendored_ibis_ops.GenerateArray: _generate_array, # type:ignore
52+
vendored_ibis_ops.SafeCastToDatetime: _safe_cast_to_datetime, # type:ignore
4753
ibis_reductions.Quantile: _quantile, # type:ignore
4854
}
4955

‎third_party/bigframes_vendored/ibis/expr/operations/generic.py

+4
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,7 @@
77

88
class GenerateArray(Unary):
99
dtype = dt.Array(dt.int64)
10+
11+
12+
class SafeCastToDatetime(Unary):
13+
dtype = dt.Timestamp(timezone=None)

0 commit comments

Comments
 (0)
Failed to load comments.