|
12 | 12 | # See the License for the specific language governing permissions and
|
13 | 13 | # limitations under the License.
|
14 | 14 |
|
| 15 | +import inspect |
15 | 16 | import re
|
16 | 17 |
|
17 | 18 | import google.api_core.exceptions
|
@@ -972,3 +973,112 @@ def echo_len(row):
|
972 | 973 | bigframes.exceptions.PreviewWarning, match="axis=1 scenario is in preview."
|
973 | 974 | ):
|
974 | 975 | scalars_df[[column]].apply(echo_len_remote, axis=1)
|
| 976 | + |
| 977 | + |
@pytest.mark.flaky(retries=2, delay=120)
def test_remote_function_application_repr(session, dataset_id_permanent):
    """Smoke-test repr() of series derived from a remote function.

    The wrapped function deliberately has a param with name "name"; this is
    to test a specific ibis' internal handling of object names. The test
    makes no value assertions: it passes when none of the repr() calls
    raises.
    """

    def should_mask(name: str) -> bool:
        # True when the sum of the characters' code points is even. Written
        # as a single expression so no local shadows the builtin `hash`.
        return sum(ord(char_) for char_ in name) % 2 == 0

    # Guard the precondition this test relies on: the param is called "name".
    assert "name" in inspect.signature(should_mask).parameters

    # Keep the plain Python function and its remote wrapper under separate
    # names instead of rebinding `should_mask`.
    should_mask_remote = session.remote_function(
        dataset=dataset_id_permanent, name=get_rf_name(should_mask)
    )(should_mask)

    s = bigframes.series.Series(["Alice", "Bob", "Caroline"])

    # Each expression is built and rendered independently.
    repr(s.apply(should_mask_remote))
    repr(s.where(s.apply(should_mask_remote)))
    repr(s.where(~s.apply(should_mask_remote)))
    repr(s.mask(should_mask_remote))
    repr(s.mask(should_mask_remote, "REDACTED"))
| 1001 | + |
| 1002 | + |
@pytest.mark.flaky(retries=2, delay=120)
def test_read_gbq_function_application_repr(session, dataset_id, scalars_df_index):
    """Smoke-test repr() of series derived from a function read via read_gbq_function.

    The SQL function deliberately has a param with name "name"; this is to
    test a specific ibis' internal handling of object names. The test makes
    no value assertions: it passes when none of the repr() calls raises.
    """
    routine_id = f"{dataset_id}.should_mask"

    # (Re)create the SQL function with the parameter named "name".
    create_routine_sql = (
        f"CREATE OR REPLACE FUNCTION `{routine_id}`(name STRING) "
        "RETURNS BOOL AS (MOD(LENGTH(name), 2) = 1)"
    )
    session.bqclient.query_and_wait(create_routine_sql)

    # Confirm the created routine really exposes an argument called "name".
    created_routine = session.bqclient.get_routine(routine_id)
    argument_names = [argument.name for argument in created_routine.arguments]
    assert "name" in argument_names

    # Read the function back and apply it to a string column.
    mask_fn = session.read_gbq_function(routine_id)
    strings = scalars_df_index["string_col"]

    repr(strings.apply(mask_fn))
    repr(strings.where(strings.apply(mask_fn)))
    repr(strings.where(~strings.apply(mask_fn)))
    repr(strings.mask(mask_fn))
    repr(strings.mask(mask_fn, "REDACTED"))
| 1025 | + |
| 1026 | + |
@pytest.mark.flaky(retries=2, delay=120)
def test_remote_function_apply_after_filter(session, dataset_id_permanent, scalars_dfs):
    """Check that a remote function works once NA rows are filtered out.

    Applying the NA-intolerant function to a column containing nulls is
    expected to raise BadRequest; after filtering the nulls out, the result
    must match pandas applying the same function locally.
    """

    # This function is deliberately written to not work with NA input
    def plus_one(x: int) -> int:
        return x + 1

    scalars_df, scalars_pandas_df = scalars_dfs
    int_col_name_with_nulls = "int64_col"

    # Make sure there are NA values in the test column. A generator lets
    # any() stop at the first NA instead of building a full list first.
    assert any(pd.isna(val) for val in scalars_df[int_col_name_with_nulls])

    # create a remote function
    plus_one_remote = session.remote_function(
        dataset=dataset_id_permanent, name=get_rf_name(plus_one)
    )(plus_one)

    # with nulls in the series the remote function application would fail
    with pytest.raises(
        google.api_core.exceptions.BadRequest, match="unsupported operand"
    ):
        scalars_df[int_col_name_with_nulls].apply(plus_one_remote).to_pandas()

    # After filtering out nulls, the remote function application should work
    # the same way as in pandas.
    pd_not_null = scalars_pandas_df[int_col_name_with_nulls].notnull()
    pd_filtered = scalars_pandas_df[pd_not_null][int_col_name_with_nulls]
    pd_result = pd_filtered.apply(plus_one)

    bf_not_null = scalars_df[int_col_name_with_nulls].notnull()
    bf_filtered = scalars_df[bf_not_null][int_col_name_with_nulls]
    bf_result = bf_filtered.apply(plus_one_remote).to_pandas()

    # ignore pandas "int64" vs bigframes "Int64" dtype difference
    pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
| 1066 | + |
| 1067 | + |
@pytest.mark.flaky(retries=2, delay=120)
def test_remote_function_apply_assign_partial_ordering_mode(dataset_id_permanent):
    """Smoke-test apply + assign with a remote function in partial ordering mode.

    A dedicated session is created with ordering_mode="partial". The test
    makes no value assertions: it passes when repr() of the assigned
    dataframe does not raise.
    """
    partial_options = bigframes.BigQueryOptions(ordering_mode="partial")
    session = bigframes.Session(partial_options)

    schedules = session.read_gbq("bigquery-public-data.baseball.schedules")
    durations_df = schedules[["duration_minutes"]]

    def plus_one(x: int) -> int:
        return x + 1

    # Wrap the local function as a remote function in the permanent dataset.
    remote_decorator = session.remote_function(
        dataset=dataset_id_permanent, name=get_rf_name(plus_one)
    )
    plus_one_remote = remote_decorator(plus_one)

    # Add the remote function's output as a new column and render the result.
    duration_cat = durations_df["duration_minutes"].apply(plus_one_remote)
    with_category = durations_df.assign(duration_cat=duration_cat)
    repr(with_category)
0 commit comments