Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit f12c906

Browse files
authored May 30, 2024
feat: adds bigframes.bigquery.array_to_string to convert array elements to delimited strings (#731)
1 parent 9f0406e commit f12c906

File tree

5 files changed

+72
-3
lines changed

5 files changed

+72
-3
lines changed
 

‎bigframes/bigquery/__init__.py

+32-3
Original file line number | Diff line number | Diff line change
@@ -57,8 +57,7 @@ def array_length(series: series.Series) -> series.Series:
5757
dtype: Int64
5858
5959
Args:
60-
series (bigframes.series.Series):
61-
A Series with array columns.
60+
series (bigframes.series.Series): A Series with array columns.
6261
6362
Returns:
6463
bigframes.series.Series: A Series of integer values indicating
@@ -104,7 +103,7 @@ def array_agg(
104103
105104
Args:
106105
obj (groupby.SeriesGroupBy | groupby.DataFrameGroupBy):
107-
A GroupBy object to be applied the function.
106+
A GroupBy object to be applied the function.
108107
109108
Returns:
110109
bigframes.series.Series | bigframes.dataframe.DataFrame: A Series or
@@ -119,3 +118,33 @@ def array_agg(
119118
raise ValueError(
120119
f"Unsupported type {type(obj)} to apply `array_agg` function. {constants.FEEDBACK_LINK}"
121120
)
121+
122+
123+
def array_to_string(series: series.Series, delimiter: str) -> series.Series:
    """Joins the elements of each array in a Series into one delimited string.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> import numpy as np
        >>> bpd.options.display.progress_bar = None

        >>> s = bpd.Series([["H", "i", "!"], ["Hello", "World"], np.nan, [], ["Hi"]])
        >>> bbq.array_to_string(s, delimiter=", ")
        0         H, i, !
        1    Hello, World
        2
        3
        4              Hi
        dtype: string

    Args:
        series (bigframes.series.Series): A Series whose values are arrays
            of strings.
        delimiter (str): The separator placed between consecutive array
            elements.

    Returns:
        bigframes.series.Series: A Series containing the delimited strings.

    Raises:
        TypeError: If ``series`` does not hold arrays of strings (for
            example an integer array or a plain string Series).
    """
    # The op validates the input dtype itself (see ArrayToStringOp.output_type).
    join_op = ops.ArrayToStringOp(delimiter=delimiter)
    return series._apply_unary_op(join_op)

‎bigframes/core/compile/scalar_op_compiler.py

+6
Original file line number | Diff line number | Diff line change
@@ -885,6 +885,12 @@ def map_op_impl(x: ibis_types.Value, op: ops.MapOp):
885885
return case.else_(x).end()
886886

887887

888+
# Array Ops
889+
@scalar_op_compiler.register_unary_op(ops.ArrayToStringOp, pass_op=True)
def array_to_string_op_impl(x: ibis_types.Value, op: ops.ArrayToStringOp):
    """Compiles ``ArrayToStringOp`` into an Ibis array ``join`` expression.

    Args:
        x: The Ibis value to join; it is treated as an array value.
        op: The operation instance, which supplies the ``delimiter``.

    Returns:
        The expression produced by joining the elements of ``x`` with
        ``op.delimiter``.
    """
    # typing.cast only narrows the static type; it performs no runtime check.
    array_value = typing.cast(ibis_types.ArrayValue, x)
    return array_value.join(op.delimiter)
892+
893+
888894
### Binary Ops
889895
def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None):
890896
"""Wraps a binary operator to generate nulls of the expected type if either input is a null scalar."""

‎bigframes/dtypes.py

+8
Original file line number | Diff line number | Diff line change
@@ -134,6 +134,14 @@ def is_array_like(type: ExpressionType) -> bool:
134134
)
135135

136136

137+
def is_array_string_like(type: ExpressionType) -> bool:
    """Reports whether ``type`` is an Arrow-backed list whose items are strings.

    Args:
        type: The expression type to inspect.

    Returns:
        bool: True only when ``type`` is a ``pd.ArrowDtype`` wrapping a
        ``pa.ListType`` whose value type satisfies ``pa.types.is_string``;
        False otherwise.
    """
    # Anything that is not Arrow-backed cannot be a list type.
    if not isinstance(type, pd.ArrowDtype):
        return False

    arrow_type = type.pyarrow_dtype
    if not isinstance(arrow_type, pa.ListType):
        return False

    return pa.types.is_string(arrow_type.value_type)
143+
144+
137145
def is_struct_like(type: ExpressionType) -> bool:
138146
return isinstance(type, pd.ArrowDtype) and isinstance(
139147
type.pyarrow_dtype, pa.StructType

‎bigframes/operations/__init__.py

+13
Original file line number | Diff line number | Diff line change
@@ -580,6 +580,19 @@ def output_type(self, *input_types):
580580
return input_types[0]
581581

582582

583+
## Array Ops
584+
@dataclasses.dataclass(frozen=True)
class ArrayToStringOp(UnaryOp):
    """Unary operation that turns an array of strings into one delimited string."""

    name: typing.ClassVar[str] = "array_to_string"
    # Separator placed between consecutive array elements.
    delimiter: str

    def output_type(self, *input_types):
        """Returns the result dtype after validating the operand dtype.

        Args:
            *input_types: Operand types; only the first one is examined.

        Returns:
            ``dtypes.STRING_DTYPE`` when the first input type is an array of
            strings.

        Raises:
            TypeError: If the first input type is not an array of strings.
        """
        if dtypes.is_array_string_like(input_types[0]):
            return dtypes.STRING_DTYPE
        raise TypeError("Input type must be an array of string type.")
594+
595+
583596
# Binary Ops
584597
fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE)
585598
maximum_op = create_binary_op(name="maximum", type_signature=op_typing.COERCE)

‎tests/system/small/bigquery/test_array.py

+13
Original file line number | Diff line number | Diff line change
@@ -139,3 +139,16 @@ def test_array_agg_matches_after_explode():
139139
result.to_pandas(), # type: ignore
140140
df.to_pandas(),
141141
)
142+
143+
144+
@pytest.mark.parametrize(
    "data",
    [
        pytest.param([[1, 2], [3, 4], [5]], id="int_array"),
        pytest.param(["hello", "world"], id="string"),
    ],
)
def test_array_to_string_w_type_checks(data):
    """array_to_string rejects inputs that are not arrays of strings."""
    # Build the Series outside the `raises` block so that only the
    # array_to_string call itself is allowed to raise TypeError.
    invalid_input = bpd.Series(data)

    with pytest.raises(TypeError):
        bbq.array_to_string(invalid_input, delimiter=", ")

0 commit comments

Comments
 (0)
Failed to load comments.