Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit c94ead9

Browse files
authoredSep 12, 2024
chore: apply remote_function on the original series without reprojecting (#874)
* chore: apply `remote_function` on the original series This change tests application of remote function without reprojecting the original series. * add failing mask doctest as a system test for easier debugging * more comprehensive repr tests * more tests, move to small tests * rename "name" param * manipulate copy of the original udf * move the funciton copy after i/o types resolution * rename all params to avoid collisions, widely use bigframes_ prefix for consistency
1 parent 40113d8 commit c94ead9

File tree

4 files changed

+138
-9
lines changed

4 files changed

+138
-9
lines changed
 

‎bigframes/functions/_remote_function_session.py

+18
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from typing import Any, cast, Dict, Mapping, Optional, Sequence, TYPE_CHECKING, Union
2323
import warnings
2424

25+
import cloudpickle
2526
import google.api_core.exceptions
2627
from google.cloud import (
2728
bigquery,
@@ -458,6 +459,11 @@ def wrapper(func):
458459
session=session, # type: ignore
459460
)
460461

462+
# To respect the user code/environment let's use a copy of the
463+
# original udf, especially since we would be setting some properties
464+
# on it
465+
func = cloudpickle.loads(cloudpickle.dumps(func))
466+
461467
# In the unlikely case where the user is trying to re-deploy the same
462468
# function, cleanup the attributes we add below, first. This prevents
463469
# the pickle from having dependencies that might not otherwise be
@@ -499,6 +505,18 @@ def try_delattr(attr):
499505
cloud_function_memory_mib=cloud_function_memory_mib,
500506
)
501507

508+
# TODO(shobs): Find a better way to support udfs with param named "name".
509+
# This causes an issue in the ibis compilation.
510+
func.__signature__ = inspect.signature(func).replace( # type: ignore
511+
parameters=[
512+
inspect.Parameter(
513+
f"bigframes_{param.name}",
514+
param.kind,
515+
)
516+
for param in inspect.signature(func).parameters.values()
517+
]
518+
)
519+
502520
# TODO: Move ibis logic to compiler step
503521
node = ibis.udf.scalar.builtin(
504522
func,

‎bigframes/functions/remote_function.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -144,16 +144,21 @@ def read_gbq_function(
144144

145145
# The name "args" conflicts with the Ibis operator, so we use
146146
# non-standard names for the arguments here.
147-
def func(*ignored_args, **ignored_kwargs):
147+
def func(*bigframes_args, **bigframes_kwargs):
148148
f"""Remote function {str(routine_ref)}."""
149149
nonlocal node # type: ignore
150150

151-
expr = node(*ignored_args, **ignored_kwargs) # type: ignore
151+
expr = node(*bigframes_args, **bigframes_kwargs) # type: ignore
152152
return ibis_client.execute(expr)
153153

154154
func.__signature__ = inspect.signature(func).replace( # type: ignore
155155
parameters=[
156-
inspect.Parameter(name, inspect.Parameter.POSITIONAL_OR_KEYWORD)
156+
# TODO(shobs): Find a better way to support functions with param
157+
# named "name". This causes an issue in the ibis compilation.
158+
inspect.Parameter(
159+
f"bigframes_{name}",
160+
inspect.Parameter.POSITIONAL_OR_KEYWORD,
161+
)
157162
for name in ibis_signature.parameter_names
158163
]
159164
)

‎bigframes/series.py

+2-6
Original file line numberDiff line numberDiff line change
@@ -1481,12 +1481,8 @@ def apply(
14811481
ex.message += f"\n{_remote_function_recommendation_message}"
14821482
raise
14831483

1484-
# We are working with remote function at this point.
1485-
# Reproject as workaround to applying filter too late. This forces the
1486-
# filter to be applied before passing data to remote function,
1487-
# protecting from bad inputs causing errors.
1488-
reprojected_series = Series(self._block._force_reproject())
1489-
result_series = reprojected_series._apply_unary_op(
1484+
# We are working with remote function at this point
1485+
result_series = self._apply_unary_op(
14901486
ops.RemoteFunctionOp(func=func, apply_on_null=True)
14911487
)
14921488

‎tests/system/small/test_remote_function.py

+110
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import inspect
1516
import re
1617

1718
import google.api_core.exceptions
@@ -972,3 +973,112 @@ def echo_len(row):
972973
bigframes.exceptions.PreviewWarning, match="axis=1 scenario is in preview."
973974
):
974975
scalars_df[[column]].apply(echo_len_remote, axis=1)
976+
977+
978+
@pytest.mark.flaky(retries=2, delay=120)
979+
def test_remote_function_application_repr(session, dataset_id_permanent):
980+
# This function deliberately has a param with name "name", this is to test
981+
# a specific ibis' internal handling of object names
982+
def should_mask(name: str) -> bool:
983+
hash = 0
984+
for char_ in name:
985+
hash += ord(char_)
986+
return hash % 2 == 0
987+
988+
assert "name" in inspect.signature(should_mask).parameters
989+
990+
should_mask = session.remote_function(
991+
dataset=dataset_id_permanent, name=get_rf_name(should_mask)
992+
)(should_mask)
993+
994+
s = bigframes.series.Series(["Alice", "Bob", "Caroline"])
995+
996+
repr(s.apply(should_mask))
997+
repr(s.where(s.apply(should_mask)))
998+
repr(s.where(~s.apply(should_mask)))
999+
repr(s.mask(should_mask))
1000+
repr(s.mask(should_mask, "REDACTED"))
1001+
1002+
1003+
@pytest.mark.flaky(retries=2, delay=120)
1004+
def test_read_gbq_function_application_repr(session, dataset_id, scalars_df_index):
1005+
gbq_function = f"{dataset_id}.should_mask"
1006+
1007+
# This function deliberately has a param with name "name", this is to test
1008+
# a specific ibis' internal handling of object names
1009+
session.bqclient.query_and_wait(
1010+
f"CREATE OR REPLACE FUNCTION `{gbq_function}`(name STRING) RETURNS BOOL AS (MOD(LENGTH(name), 2) = 1)"
1011+
)
1012+
routine = session.bqclient.get_routine(gbq_function)
1013+
assert "name" in [arg.name for arg in routine.arguments]
1014+
1015+
# read the function and apply to dataframe
1016+
should_mask = session.read_gbq_function(gbq_function)
1017+
1018+
s = scalars_df_index["string_col"]
1019+
1020+
repr(s.apply(should_mask))
1021+
repr(s.where(s.apply(should_mask)))
1022+
repr(s.where(~s.apply(should_mask)))
1023+
repr(s.mask(should_mask))
1024+
repr(s.mask(should_mask, "REDACTED"))
1025+
1026+
1027+
@pytest.mark.flaky(retries=2, delay=120)
1028+
def test_remote_function_apply_after_filter(session, dataset_id_permanent, scalars_dfs):
1029+
1030+
# This function is deliberately written to not work with NA input
1031+
def plus_one(x: int) -> int:
1032+
return x + 1
1033+
1034+
scalars_df, scalars_pandas_df = scalars_dfs
1035+
int_col_name_with_nulls = "int64_col"
1036+
1037+
# make sure there are NA values in the test column
1038+
assert any([pd.isna(val) for val in scalars_df[int_col_name_with_nulls]])
1039+
1040+
# create a remote function
1041+
plus_one_remote = session.remote_function(
1042+
dataset=dataset_id_permanent, name=get_rf_name(plus_one)
1043+
)(plus_one)
1044+
1045+
# with nulls in the series the remote function application would fail
1046+
with pytest.raises(
1047+
google.api_core.exceptions.BadRequest, match="unsupported operand"
1048+
):
1049+
scalars_df[int_col_name_with_nulls].apply(plus_one_remote).to_pandas()
1050+
1051+
# after filtering out nulls the remote function application should works
1052+
# similar to pandas
1053+
pd_result = scalars_pandas_df[scalars_pandas_df[int_col_name_with_nulls].notnull()][
1054+
int_col_name_with_nulls
1055+
].apply(plus_one)
1056+
bf_result = (
1057+
scalars_df[scalars_df[int_col_name_with_nulls].notnull()][
1058+
int_col_name_with_nulls
1059+
]
1060+
.apply(plus_one_remote)
1061+
.to_pandas()
1062+
)
1063+
1064+
# ignore pandas "int64" vs bigframes "Int64" dtype difference
1065+
pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
1066+
1067+
1068+
@pytest.mark.flaky(retries=2, delay=120)
1069+
def test_remote_function_apply_assign_partial_ordering_mode(dataset_id_permanent):
1070+
session = bigframes.Session(bigframes.BigQueryOptions(ordering_mode="partial"))
1071+
1072+
df = session.read_gbq("bigquery-public-data.baseball.schedules")[
1073+
["duration_minutes"]
1074+
]
1075+
1076+
def plus_one(x: int) -> int:
1077+
return x + 1
1078+
1079+
plus_one = session.remote_function(
1080+
dataset=dataset_id_permanent, name=get_rf_name(plus_one)
1081+
)(plus_one)
1082+
1083+
df1 = df.assign(duration_cat=df["duration_minutes"].apply(plus_one))
1084+
repr(df1)

0 commit comments

Comments
 (0)
Failed to load comments.