Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 1b613e0

Browse files
authored Jun 29, 2024
feat: bigframes.bigquery.json_set (#782)
* feat: bigframes.bigquery.json_set * add tests * fix * fixes * update to binary operator for series supports * supports more than two pairs, defines as ibis udf * add more tests
1 parent 57d98b9 commit 1b613e0

File tree

6 files changed

+242
-2
lines changed

6 files changed

+242
-2
lines changed
 

‎bigframes/bigquery/__init__.py

+54
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@
3636
import bigframes.series as series
3737

3838

39+
# Array functions defined from
40+
# https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions
41+
42+
3943
def array_length(series: series.Series) -> series.Series:
4044
"""Compute the length of each array element in the Series.
4145
@@ -154,6 +158,56 @@ def array_to_string(series: series.Series, delimiter: str) -> series.Series:
154158
return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter))
155159

156160

161+
# JSON functions defined from
162+
# https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions
163+
164+
165+
def json_set(
    series: series.Series,
    json_path_value_pairs: typing.Sequence[typing.Tuple[str, typing.Any]],
) -> series.Series:
    """Return a Series of JSON values updated at the given JSON paths.

    The ``(json_path, json_value)`` pairs are applied one after another, in
    the order given, so a later pair operates on the result of the earlier
    ones.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> import numpy as np
        >>> bpd.options.display.progress_bar = None

        >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"]
        >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")])
        0    {"a":100,"b":"hi"}
        Name: data, dtype: string

    Args:
        series (bigframes.series.Series):
            The Series containing JSON data (as native JSON objects or
            JSON-formatted strings).
        json_path_value_pairs (Sequence[Tuple[str, typing.Any]]):
            Pairs of JSON path and the new value to insert/replace at that
            path.

    Returns:
        bigframes.series.Series: A new Series with the transformed JSON data.

    Raises:
        ValueError: If an element of ``json_path_value_pairs`` does not
            contain exactly two items.
    """
    # "create_if_missing" is intentionally not exposed: the SQLGlot parser
    # does not support the "create_if_missing => true" named-argument syntax.
    result = series
    for pair in json_path_value_pairs:
        if len(pair) != 2:
            raise ValueError(
                "Incorrect format: Expected (<json_path>, <json_value>), but found: "
                + f"{pair}"
            )

        path, value = pair
        # Each pair becomes its own binary op applied on top of the previous
        # result, which is how more than one pair is supported.
        result = result._apply_binary_op(
            value, ops.JSONSet(json_path=path), alignment="left"
        )
    return result
209+
210+
157211
def vector_search(
158212
base_table: str,
159213
column_to_search: str,

‎bigframes/core/compile/scalar_op_compiler.py

+32
Original file line numberDiff line numberDiff line change
@@ -894,6 +894,26 @@ def array_to_string_op_impl(x: ibis_types.Value, op: ops.ArrayToStringOp):
894894
return typing.cast(ibis_types.ArrayValue, x).join(op.delimiter)
895895

896896

897+
# JSON Ops
898+
@scalar_op_compiler.register_binary_op(ops.JSONSet, pass_op=True)
def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet):
    """Compile ``ops.JSONSet`` into a call to the ``json_set`` builtin.

    ``x`` is the JSON input, ``y`` the value to write and ``op.json_path`` the
    location to write it to. When ``x`` is already of JSON type the result of
    ``json_set`` is returned as-is; otherwise ``x`` is first passed through
    ``parse_json`` and the result is wrapped in ``ToJsonString``.
    """
    if x.type().is_json():
        return json_set(
            json_obj=x,
            json_path=op.json_path,
            json_value=y,
        ).to_expr()

    # Non-JSON input takes a round trip: parse to JSON, set the value, then
    # serialize back. Native JSON type support would make these less
    # efficient string conversions unnecessary.
    updated = json_set(
        json_obj=parse_json(x),
        json_path=op.json_path,
        json_value=y,
    )
    return vendored_ibis_ops.ToJsonString(updated).to_expr()
915+
916+
897917
### Binary Ops
898918
def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None):
899919
"""Wraps a binary operator to generate nulls of the expected type if either input is a null scalar."""
@@ -1469,3 +1489,15 @@ def float_floor(a: float) -> float:
14691489
def float_ceil(a: float) -> float:
14701490
"""Convert string to timestamp."""
14711491
return 0 # pragma: NO COVER
1492+
1493+
1494+
@ibis.udf.scalar.builtin(name="parse_json")
def parse_json(a: str) -> ibis_dtypes.JSON:
    """Converts a JSON-formatted STRING value to a JSON value.

    This is only a typed declaration registered through
    ``ibis.udf.scalar.builtin`` under the name ``parse_json``; the Python body
    is intentionally empty.

    Args:
        a: The JSON-formatted string to convert.

    Returns:
        A value typed as ``ibis_dtypes.JSON``.
    """
1497+
1498+
1499+
@ibis.udf.scalar.builtin(name="json_set")
def json_set(
    json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.str, json_value
) -> ibis_dtypes.JSON:
    """Produces a new SQL JSON value with the specified JSON data inserted or replaced.

    This is only a typed declaration registered through
    ``ibis.udf.scalar.builtin`` under the name ``json_set``; the Python body is
    intentionally empty.

    Args:
        json_obj: The JSON value to update.
        json_path: The JSON path at which the value is inserted or replaced.
        json_value: The value to write. It is left unannotated here.
            NOTE(review): presumably so any JSON-encodable type is accepted;
            confirm against the ibis builtin-UDF typing rules.

    Returns:
        A value typed as ``ibis_dtypes.JSON``.
    """

‎bigframes/dtypes.py

+11
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,17 @@ def is_struct_like(type: ExpressionType) -> bool:
240240
)
241241

242242

243+
def is_json_like(type: ExpressionType) -> bool:
    """Return whether ``type`` is accepted as JSON input.

    For now only ``STRING_DTYPE`` qualifies, i.e. JSON data is expected to be
    carried as strings until a JSON dtype is supported.
    """
    # TODO: Add JSON type support
    return type == STRING_DTYPE
246+
247+
248+
def is_json_encoding_type(type: ExpressionType) -> bool:
    """Return whether a value of ``type`` can be encoded as JSON.

    Every type other than ``GEO_DTYPE`` is treated as encodable.
    """
    # Types can be converted into JSON.
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_encodings
    return type != GEO_DTYPE
252+
253+
243254
def is_numeric(type: ExpressionType) -> bool:
244255
return type in NUMERIC_BIGFRAMES_TYPES_PERMISSIVE
245256

‎bigframes/operations/__init__.py

+24
Original file line numberDiff line numberDiff line change
@@ -707,6 +707,30 @@ def output_type(self, *input_types):
707707
strconcat_op = StrConcatOp()
708708

709709

710+
## JSON Ops
711+
@dataclasses.dataclass(frozen=True)
class JSONSet(BinaryOp):
    """Binary op that writes a value into JSON data at ``json_path``.

    The left operand is the JSON data and the right operand is the value to
    insert or replace.
    """

    name: typing.ClassVar[str] = "json_set"
    # JSON path identifying where the right operand is written.
    json_path: str

    def output_type(self, *input_types):
        """Validate the operand types and return the result type.

        Args:
            *input_types: The left (JSON data) and right (new value) operand
                types, in that order.

        Returns:
            The left operand's type, unchanged.

        Raises:
            TypeError: If the left type is not JSON-like, or the right type
                cannot be encoded as JSON.
        """
        left_type = input_types[0]
        right_type = input_types[1]
        if not dtypes.is_json_like(left_type):
            raise TypeError(
                "Input type must be a valid JSON object or JSON-formatted string type."
                + f" Received type: {left_type}"
            )
        if not dtypes.is_json_encoding_type(right_type):
            raise TypeError(
                "The value to be assigned must be a type that can be encoded as JSON."
                + f" Received type: {right_type}"
            )

        # After JSON type implementation, ONLY return JSON data.
        return left_type
732+
733+
710734
# Ternary Ops
711735
@dataclasses.dataclass(frozen=True)
712736
class WhereOp(TernaryOp):
+119
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import json
16+
17+
import geopandas as gpd # type: ignore
18+
import pandas as pd
19+
import pytest
20+
21+
import bigframes.bigquery as bbq
22+
import bigframes.pandas as bpd
23+
24+
25+
def _get_series_from_json(json_data):
    """Build a Series of JSON values, one row per element of ``json_data``.

    Each element is serialized with ``json.dumps`` into a ``JSON '...'``
    literal. The rows are indexed by their position in ``json_data`` and
    sorted by that index.
    """
    selects = []
    for row_id, data in enumerate(json_data):
        selects.append(f"SELECT {row_id} AS id, JSON '{json.dumps(data)}' AS data")
    sql = " UNION ALL ".join(selects)

    df = bpd.read_gbq(sql)
    df = df.set_index("id").sort_index()
    return df["data"]
34+
35+
36+
@pytest.mark.parametrize(
    ("json_path", "expected_json"),
    [
        pytest.param("$.a", [{"a": 10}], id="simple"),
        pytest.param("$.a.b.c", [{"a": {"b": {"c": 10, "d": []}}}], id="nested"),
    ],
)
def test_json_set_at_json_path(json_path, expected_json):
    """Setting a value at a top-level or nested path replaces what was there."""
    source = _get_series_from_json([{"a": {"b": {"c": "tester", "d": []}}}])
    pairs = [(json_path, 10)]
    result = bbq.json_set(source, json_path_value_pairs=pairs)

    wanted = _get_series_from_json(expected_json)
    pd.testing.assert_series_equal(result.to_pandas(), wanted.to_pandas())
52+
53+
54+
@pytest.mark.parametrize(
    ("json_value", "expected_json"),
    [
        pytest.param(10, [{"a": {"b": 10}}, {"a": {"b": 10}}], id="int"),
        pytest.param(0.333, [{"a": {"b": 0.333}}, {"a": {"b": 0.333}}], id="float"),
        pytest.param("eng", [{"a": {"b": "eng"}}, {"a": {"b": "eng"}}], id="string"),
        pytest.param([1, 2], [{"a": {"b": 1}}, {"a": {"b": 2}}], id="series"),
    ],
)
def test_json_set_at_json_value_type(json_value, expected_json):
    """Scalars are written to every row; a list is written row by row."""
    source = _get_series_from_json([{"a": {"b": "dev"}}, {"a": {"b": [1, 2]}}])
    pairs = [("$.a.b", json_value)]
    result = bbq.json_set(source, json_path_value_pairs=pairs)

    wanted = _get_series_from_json(expected_json)
    pd.testing.assert_series_equal(result.to_pandas(), wanted.to_pandas())
72+
73+
74+
def test_json_set_w_more_pairs():
    """Several pairs are applied in order; a later pair for "$.a" wins."""
    source = _get_series_from_json([{"a": 2}, {"b": 5}, {"c": 1}])
    pairs = [("$.a", 1), ("$.b", 2), ("$.a", [3, 4, 5])]
    result = bbq.json_set(source, json_path_value_pairs=pairs)

    wanted_rows = [
        {"a": 3, "b": 2},
        {"a": 4, "b": 2},
        {"a": 5, "b": 2, "c": 1},
    ]
    wanted = _get_series_from_json(wanted_rows)
    pd.testing.assert_series_equal(result.to_pandas(), wanted.to_pandas())
86+
87+
88+
@pytest.mark.parametrize(
    ("make_series", "make_json_path_value_pairs", "expected_error"),
    [
        pytest.param(
            lambda: _get_series_from_json([{"a": 10}]),
            lambda: [("$.a", 1, 100)],
            ValueError,
            id="invalid_json_path_value_pairs",
        ),
        pytest.param(
            lambda: _get_series_from_json([{"a": 10}]),
            lambda: [
                (
                    "$.a",
                    bpd.read_pandas(
                        gpd.GeoSeries.from_wkt(["POINT (1 2)", "POINT (2 1)"])
                    ),
                )
            ],
            TypeError,
            id="invalid_json_value_type",
        ),
        pytest.param(
            lambda: bpd.Series([1, 2]),
            lambda: [("$.a", 1)],
            TypeError,
            id="invalid_series_type",
        ),
    ],
)
def test_json_set_w_invalid(make_series, make_json_path_value_pairs, expected_error):
    """``json_set`` rejects malformed pairs, non-encodable values and non-JSON input.

    The inputs are supplied as zero-argument factories so that nothing is read
    from or uploaded to BigQuery while pytest is still collecting tests.

    ``pytest.raises`` is used instead of ``xfail(raises=...)`` so that the test
    fails, rather than being reported as an unexpected pass, if ``json_set``
    ever stops raising.
    """
    # Build the inputs outside the ``pytest.raises`` block so that a failure
    # while preparing them is not mistaken for the expected error.
    series = make_series()
    json_path_value_pairs = make_json_path_value_pairs()

    with pytest.raises(expected_error):
        bbq.json_set(series, json_path_value_pairs=json_path_value_pairs)

‎third_party/bigframes_vendored/ibis/expr/operations/json.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
from __future__ import annotations
33

44
import ibis.expr.datatypes as dt
5-
from ibis.expr.operations.core import Unary
5+
import ibis.expr.operations.core as ibis_ops_core
66

77

8-
class ToJsonString(Unary):
8+
class ToJsonString(ibis_ops_core.Unary):
    """Single-argument ibis operation whose result dtype is string.

    NOTE(review): presumably rendered as BigQuery's TO_JSON_STRING; the
    translation rule is not visible in this file, so confirm in the compiler.
    """

    dtype = dt.string

0 commit comments

Comments
 (0)
Failed to load comments.