Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 575a29e

Browse files
authoredAug 23, 2024
feat: implement bigframes.bigquery.json_extract_array (#910)
* feat: implement `bigframes.bigquery.json_extract_array` This is needed to implement support for array return types in remote functions. * actually return, make tests pass * add negative test case
1 parent e837f6e commit 575a29e

File tree

4 files changed

+86
-0
lines changed

4 files changed

+86
-0
lines changed
 

‎bigframes/bigquery/__init__.py

+32
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,38 @@ def json_extract(
239239
return series._apply_unary_op(ops.JSONExtract(json_path=json_path))
240240

241241

242+
def json_extract_array(
    series: series.Series,
    json_path: str = "$",
) -> series.Series:
    """Extracts a JSON array and converts it to a SQL array of JSON-formatted
    `STRING` or `JSON` values.

    This function uses single quotes and brackets to escape invalid JSONPath
    characters in JSON keys.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> bpd.options.display.progress_bar = None

        >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]'])
        >>> bbq.json_extract_array(s)
        0    ['1' '2' '3']
        1        ['4' '5']
        dtype: list<item: string>[pyarrow]

    Args:
        series (bigframes.series.Series):
            The Series containing JSON data (as native JSON objects or
            JSON-formatted strings).
        json_path (str):
            The JSON path identifying the array that you want to obtain from
            the input. Defaults to ``"$"``.

    Returns:
        bigframes.series.Series: A new Series in which each row is an array
        of the extracted JSON or JSON-formatted STRING elements.
    """
    # Build the op first so the path it carries is explicit at the call site.
    extract_op = ops.JSONExtractArray(json_path=json_path)
    return series._apply_unary_op(extract_op)
272+
273+
242274
# Search functions defined from
243275
# https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions
244276

‎bigframes/core/compile/scalar_op_compiler.py

+12
Original file line numberDiff line numberDiff line change
@@ -947,6 +947,11 @@ def json_extract_op_impl(x: ibis_types.Value, op: ops.JSONExtract):
947947
return json_extract(json_obj=x, json_path=op.json_path)
948948

949949

950+
@scalar_op_compiler.register_unary_op(ops.JSONExtractArray, pass_op=True)
def json_extract_array_op_impl(x: ibis_types.Value, op: ops.JSONExtractArray):
    """Compile an ``ops.JSONExtractArray`` op into a ``json_extract_array`` call.

    Args:
        x: The input value expression to extract the array from.
        op: The op instance; its ``json_path`` is forwarded to the call.

    Returns:
        The expression produced by ``json_extract_array`` for ``x``.
    """
    path = op.json_path
    return json_extract_array(json_obj=x, json_path=path)
953+
954+
950955
### Binary Ops
951956
def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None):
952957
"""Wraps a binary operator to generate nulls of the expected type if either input is a null scalar."""
@@ -1581,6 +1586,13 @@ def json_extract(
15811586
"""Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value."""
15821587

15831588

1589+
@ibis.udf.scalar.builtin(name="json_extract_array")
def json_extract_array(
    json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.str
) -> ibis_dtypes.Array[ibis_dtypes.String]:
    """Extract a JSON array as a SQL ARRAY of JSON-formatted STRINGs or JSON values.

    Registered as the builtin scalar function named ``json_extract_array``;
    this stub has no Python body beyond its docstring.
    """
1594+
1595+
15841596
@ibis.udf.scalar.builtin(name="ML.DISTANCE")
15851597
def vector_distance(vector1, vector2, type: str) -> ibis_dtypes.Float64:
15861598
"""Computes the distance between two vectors using specified type ("EUCLIDEAN", "MANHATTAN", or "COSINE")"""

‎bigframes/operations/__init__.py

+17
Original file line numberDiff line numberDiff line change
@@ -652,6 +652,23 @@ def output_type(self, *input_types):
652652
return input_type
653653

654654

655+
@dataclasses.dataclass(frozen=True)
class JSONExtractArray(UnaryOp):
    """Unary op that extracts a JSON array at ``json_path`` as a list of strings."""

    # Identifier for this op.
    name: typing.ClassVar[str] = "json_extract_array"
    # JSONPath selecting the array to extract from the input value.
    json_path: str

    def output_type(self, *input_types):
        """Return the result dtype for the given input dtype.

        Only the first entry of ``input_types`` is inspected.

        Returns:
            A pandas ``ArrowDtype`` wrapping an Arrow list of the string type.

        Raises:
            TypeError: If the input dtype is not JSON-like.
        """
        input_type = input_types[0]
        if dtypes.is_json_like(input_type):
            item_type = dtypes.bigframes_dtype_to_arrow_dtype(dtypes.STRING_DTYPE)
            return pd.ArrowDtype(pa.list_(item_type))

        raise TypeError(
            "Input type must be an valid JSON object or JSON-formatted string type."
            + f" Received type: {input_type}"
        )
670+
671+
655672
# Binary Ops
656673
fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE)
657674
maximum_op = create_binary_op(name="maximum", type_signature=op_typing.COERCE)

‎tests/system/small/bigquery/test_json.py

+25
Original file line numberDiff line numberDiff line change
@@ -139,3 +139,28 @@ def test_json_extract_from_string():
139139
def test_json_extract_w_invalid_series_type():
140140
with pytest.raises(TypeError):
141141
bbq.json_extract(bpd.Series([1, 2]), "$.a")
142+
143+
144+
def test_json_extract_array_from_json_strings():
    """Arrays nested under key ``a`` are extracted as lists of strings."""
    s = bpd.Series(['{"a": [1, 2, 3]}', '{"a": []}', '{"a": [4,5]}'])
    actual = bbq.json_extract_array(s, "$.a")
    expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]])

    # Materialize both sides in the same order as before, then compare.
    actual_pd = actual.to_pandas()
    expected_pd = expected.to_pandas()
    pd.testing.assert_series_equal(actual_pd, expected_pd)
152+
153+
154+
def test_json_extract_array_from_array_strings():
    """Top-level JSON arrays are extracted with the default JSON path."""
    s = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"])
    actual = bbq.json_extract_array(s)
    expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]])

    # Materialize both sides in the same order as before, then compare.
    actual_pd = actual.to_pandas()
    expected_pd = expected.to_pandas()
    pd.testing.assert_series_equal(actual_pd, expected_pd)
162+
163+
164+
def test_json_extract_array_w_invalid_series_type():
    """An integer Series is rejected with ``TypeError``."""
    with pytest.raises(TypeError):
        # Series construction stays inside the context, as in the original.
        bbq.json_extract_array(bpd.Series([1, 2]))

0 commit comments

Comments
 (0)
Failed to load comments.