Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 208e081

Browse files
authoredFeb 12, 2024
feat: limited support of lambdas in Series.apply (#345)
BEGIN_COMMIT_OVERRIDE feat: limited support of lambdas in `Series.apply` (#345) END_COMMIT_OVERRIDE Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [x] Appropriate docs were updated https://screenshot.googleplex.com/6ZEiKXPz8LWMTRf Partially fixes internal issue 295964341 🦕
1 parent ffb0d15 commit 208e081

File tree

3 files changed

+270
-8
lines changed

3 files changed

+270
-8
lines changed
 

‎bigframes/series.py

+43-2
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,12 @@
5858
LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]]
5959

6060

61+
_remote_function_recommendation_message = (
62+
"Your functions could not be applied directly to the Series."
63+
" Try converting it to a remote function."
64+
)
65+
66+
6167
@log_adapter.class_logger
6268
class Series(bigframes.operations.base.SeriesMethods, vendored_pandas_series.Series):
6369
def __init__(self, *args, **kwargs):
@@ -1210,12 +1216,43 @@ def _groupby_values(
12101216
dropna=dropna,
12111217
)
12121218

1213-
def apply(self, func) -> Series:
1219+
def apply(
1220+
self, func, by_row: typing.Union[typing.Literal["compat"], bool] = "compat"
1221+
) -> Series:
12141222
# TODO(shobs, b/274645634): Support convert_dtype, args, **kwargs
12151223
# is actually a ternary op
12161224
# Reproject as workaround to applying filter too late. This forces the filter
12171225
# to be applied before passing data to remote function, protecting from bad
12181226
# inputs causing errors.
1227+
1228+
if by_row not in ["compat", False]:
1229+
raise ValueError("Param by_row must be one of 'compat' or False")
1230+
1231+
if not callable(func):
1232+
raise ValueError(
1233+
"Only a ufunc (a function that applies to the entire Series) or a remote function that only works on single values are supported."
1234+
)
1235+
1236+
if not hasattr(func, "bigframes_remote_function"):
1237+
# It is not a remote function
1238+
# Then it must be a vectorized function that applies to the Series
1239+
# as a whole
1240+
if by_row:
1241+
raise ValueError(
1242+
"A vectorized non-remote function can be provided only with by_row=False."
1243+
" For element-wise operation it must be a remote function."
1244+
)
1245+
1246+
try:
1247+
return func(self)
1248+
except Exception as ex:
1249+
# This could happen if any of the operators in func is not
1250+
# supported on a Series. Let's guide the customer to use a
1251+
# remote function instead
1252+
if hasattr(ex, "message"):
1253+
ex.message += f"\n{_remote_function_recommendation_message}"
1254+
raise
1255+
12191256
reprojected_series = Series(self._block._force_reproject())
12201257
return reprojected_series._apply_unary_op(
12211258
ops.RemoteFunctionOp(func=func, apply_on_null=True)
@@ -1325,7 +1362,11 @@ def duplicated(self, keep: str = "first") -> Series:
13251362

13261363
def mask(self, cond, other=None) -> Series:
13271364
if callable(cond):
1328-
cond = self.apply(cond)
1365+
if hasattr(cond, "bigframes_remote_function"):
1366+
cond = self.apply(cond)
1367+
else:
1368+
# For non-remote function assume that it is applicable on Series
1369+
cond = self.apply(cond, by_row=False)
13291370

13301371
if not isinstance(cond, Series):
13311372
raise TypeError(

‎tests/system/small/test_series.py

+151
Original file line numberDiff line numberDiff line change
@@ -2560,6 +2560,51 @@ def test_mask_custom_value(scalars_dfs):
25602560
assert_pandas_df_equal(bf_result, pd_result)
25612561

25622562

2563+
@pytest.mark.parametrize(
2564+
("lambda_",),
2565+
[
2566+
pytest.param(lambda x: x > 0),
2567+
pytest.param(
2568+
lambda x: True if x > 0 else False,
2569+
marks=pytest.mark.xfail(
2570+
raises=ValueError,
2571+
),
2572+
),
2573+
],
2574+
ids=[
2575+
"lambda_arithmatic",
2576+
"lambda_arbitrary",
2577+
],
2578+
)
2579+
def test_mask_lambda(scalars_dfs, lambda_):
2580+
scalars_df, scalars_pandas_df = scalars_dfs
2581+
2582+
bf_col = scalars_df["int64_col"]
2583+
bf_result = bf_col.mask(lambda_).to_pandas()
2584+
2585+
pd_col = scalars_pandas_df["int64_col"]
2586+
pd_result = pd_col.mask(lambda_)
2587+
2588+
# ignore dtype check, which are Int64 and object respectively
2589+
assert_series_equal(bf_result, pd_result, check_dtype=False)
2590+
2591+
2592+
def test_mask_simple_udf(scalars_dfs):
2593+
scalars_df, scalars_pandas_df = scalars_dfs
2594+
2595+
def foo(x):
2596+
return x < 1000000
2597+
2598+
bf_col = scalars_df["int64_col"]
2599+
bf_result = bf_col.mask(foo).to_pandas()
2600+
2601+
pd_col = scalars_pandas_df["int64_col"]
2602+
pd_result = pd_col.mask(foo)
2603+
2604+
# ignore dtype check, which are Int64 and object respectively
2605+
assert_series_equal(bf_result, pd_result, check_dtype=False)
2606+
2607+
25632608
@pytest.mark.parametrize(
25642609
("column", "to_type"),
25652610
[
@@ -3042,3 +3087,109 @@ def test_series_iter(
30423087
scalars_df_index["int64_too"], scalars_pandas_df_index["int64_too"]
30433088
):
30443089
assert bf_i == pd_i
3090+
3091+
3092+
@pytest.mark.parametrize(
3093+
(
3094+
"col",
3095+
"lambda_",
3096+
),
3097+
[
3098+
pytest.param("int64_col", lambda x: x * x + x + 1),
3099+
pytest.param("int64_col", lambda x: x % 2 == 1),
3100+
pytest.param("string_col", lambda x: x + "_suffix"),
3101+
],
3102+
ids=[
3103+
"lambda_int_int",
3104+
"lambda_int_bool",
3105+
"lambda_str_str",
3106+
],
3107+
)
3108+
def test_apply_lambda(scalars_dfs, col, lambda_):
3109+
scalars_df, scalars_pandas_df = scalars_dfs
3110+
3111+
bf_col = scalars_df[col]
3112+
3113+
# Can't be applied to BigFrames Series without by_row=False
3114+
with pytest.raises(ValueError, match="by_row=False"):
3115+
bf_col.apply(lambda_)
3116+
3117+
bf_result = bf_col.apply(lambda_, by_row=False).to_pandas()
3118+
3119+
pd_col = scalars_pandas_df[col]
3120+
pd_result = pd_col.apply(lambda_)
3121+
3122+
# ignore dtype check, which are Int64 and object respectively
3123+
assert_series_equal(bf_result, pd_result, check_dtype=False)
3124+
3125+
3126+
@pytest.mark.parametrize(
3127+
("ufunc",),
3128+
[
3129+
pytest.param(numpy.log),
3130+
pytest.param(numpy.sqrt),
3131+
pytest.param(numpy.sin),
3132+
],
3133+
ids=[
3134+
"log",
3135+
"sqrt",
3136+
"sin",
3137+
],
3138+
)
3139+
def test_apply_numpy_ufunc(scalars_dfs, ufunc):
3140+
scalars_df, scalars_pandas_df = scalars_dfs
3141+
3142+
bf_col = scalars_df["int64_col"]
3143+
3144+
# Can't be applied to BigFrames Series without by_row=False
3145+
with pytest.raises(ValueError, match="by_row=False"):
3146+
bf_col.apply(ufunc)
3147+
3148+
bf_result = bf_col.apply(ufunc, by_row=False).to_pandas()
3149+
3150+
pd_col = scalars_pandas_df["int64_col"]
3151+
pd_result = pd_col.apply(ufunc)
3152+
3153+
assert_series_equal(bf_result, pd_result)
3154+
3155+
3156+
def test_apply_simple_udf(scalars_dfs):
3157+
scalars_df, scalars_pandas_df = scalars_dfs
3158+
3159+
def foo(x):
3160+
return x * x + 2 * x + 3
3161+
3162+
bf_col = scalars_df["int64_col"]
3163+
3164+
# Can't be applied to BigFrames Series without by_row=False
3165+
with pytest.raises(ValueError, match="by_row=False"):
3166+
bf_col.apply(foo)
3167+
3168+
bf_result = bf_col.apply(foo, by_row=False).to_pandas()
3169+
3170+
pd_col = scalars_pandas_df["int64_col"]
3171+
pd_result = pd_col.apply(foo)
3172+
3173+
# ignore dtype check, which are Int64 and object respectively
3174+
assert_series_equal(bf_result, pd_result, check_dtype=False)
3175+
3176+
3177+
@pytest.mark.parametrize(
3178+
("col", "lambda_", "exception"),
3179+
[
3180+
pytest.param("int64_col", {1: 2, 3: 4}, ValueError),
3181+
pytest.param("int64_col", numpy.square, TypeError),
3182+
pytest.param("string_col", lambda x: x.capitalize(), AttributeError),
3183+
],
3184+
ids=[
3185+
"not_callable",
3186+
"numpy_ufunc",
3187+
"custom_lambda",
3188+
],
3189+
)
3190+
def test_apply_not_supported(scalars_dfs, col, lambda_, exception):
3191+
scalars_df, _ = scalars_dfs
3192+
3193+
bf_col = scalars_df[col]
3194+
with pytest.raises(exception):
3195+
bf_col.apply(lambda_, by_row=False)

‎third_party/bigframes_vendored/pandas/core/series.py

+76-6
Original file line numberDiff line numberDiff line change
@@ -1116,18 +1116,24 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series:
11161116
def apply(
11171117
self,
11181118
func,
1119+
by_row="compat",
11191120
) -> DataFrame | Series:
11201121
"""
11211122
Invoke function on values of a Series.
11221123
1124+
Can be ufunc (a NumPy function that applies to the entire Series) or a
1125+
Python function that only works on single values. If it is an arbitrary
1126+
python function then converting it into a `remote_function` is recommended.
1127+
11231128
**Examples:**
11241129
11251130
>>> import bigframes.pandas as bpd
11261131
>>> bpd.options.display.progress_bar = None
11271132
1128-
Let's use ``reuse=False`` flag to make sure a new ``remote_function``
1133+
To apply an arbitrary Python function, a `remote_function` is recommended.
1134+
Let's use ``reuse=False`` flag to make sure a new `remote_function`
11291135
is created every time we run the following code, but you can skip it
1130-
to potentially reuse a previously deployed ``remote_function`` from
1136+
to potentially reuse a previously deployed `remote_function` from
11311137
the same user defined function.
11321138
11331139
>>> @bpd.remote_function([int], float, reuse=False)
@@ -1152,9 +1158,9 @@ def apply(
11521158
4 2.0
11531159
dtype: Float64
11541160
1155-
You could turn a user defined function with external package
1156-
dependencies into a BigQuery DataFrames remote function. You would
1157-
provide the names of the packages via ``packages`` param.
1161+
To turn a user defined function with external package dependencies into
1162+
a `remote_function`, you would provide the names of the packages via
1163+
`packages` param.
11581164
11591165
>>> @bpd.remote_function(
11601166
... [str],
@@ -1176,11 +1182,48 @@ def apply(
11761182
>>> names = bpd.Series(["Alice", "Bob"])
11771183
>>> hashes = names.apply(get_hash)
11781184
1185+
Simple vectorized functions, lambdas or ufuncs can be applied directly
1186+
with `by_row=False`.
1187+
1188+
>>> nums = bpd.Series([1, 2, 3, 4])
1189+
>>> nums
1190+
0 1
1191+
1 2
1192+
2 3
1193+
3 4
1194+
dtype: Int64
1195+
>>> nums.apply(lambda x: x*x + 2*x + 1, by_row=False)
1196+
0 4
1197+
1 9
1198+
2 16
1199+
3 25
1200+
dtype: Int64
1201+
1202+
>>> def is_odd(num):
1203+
... return num % 2 == 1
1204+
>>> nums.apply(is_odd, by_row=False)
1205+
0 True
1206+
1 False
1207+
2 True
1208+
3 False
1209+
dtype: boolean
1210+
1211+
>>> nums.apply(np.log, by_row=False)
1212+
0 0.0
1213+
1 0.693147
1214+
2 1.098612
1215+
3 1.386294
1216+
dtype: Float64
1217+
11791218
Args:
11801219
func (function):
11811220
BigFrames DataFrames ``remote_function`` to apply. The function
11821221
should take a scalar and return a scalar. It will be applied to
11831222
every element in the ``Series``.
1223+
by_row (False or "compat", default "compat"):
1224+
If `"compat"`, func must be a remote function which will be
1225+
passed each element of the Series, like `Series.map`. If False,
1226+
the func will be passed the whole Series at once.
11841227
11851228
Returns:
11861229
bigframes.series.Series: A new Series with values representing the
@@ -2680,7 +2723,8 @@ def mask(self, cond, other):
26802723
dtype: Int64
26812724
26822725
You can mask the values in the Series based on a condition. The values
2683-
matching the condition would be masked.
2726+
matching the condition would be masked. The condition can be provided in
2727+
the form of a Series.
26842728
26852729
>>> s.mask(s % 2 == 0)
26862730
0 <NA>
@@ -2736,6 +2780,32 @@ def mask(self, cond, other):
27362780
2 Caroline
27372781
dtype: string
27382782
2783+
Simple vectorized (i.e. they only perform operations supported on a
2784+
Series) lambdas or python functions can be used directly.
2785+
2786+
>>> nums = bpd.Series([1, 2, 3, 4], name="nums")
2787+
>>> nums
2788+
0 1
2789+
1 2
2790+
2 3
2791+
3 4
2792+
Name: nums, dtype: Int64
2793+
>>> nums.mask(lambda x: (x+1) % 2 == 1)
2794+
0 1
2795+
1 <NA>
2796+
2 3
2797+
3 <NA>
2798+
Name: nums, dtype: Int64
2799+
2800+
>>> def is_odd(num):
2801+
... return num % 2 == 1
2802+
>>> nums.mask(is_odd)
2803+
0 <NA>
2804+
1 2
2805+
2 <NA>
2806+
3 4
2807+
Name: nums, dtype: Int64
2808+
27392809
Args:
27402810
cond (bool Series/DataFrame, array-like, or callable):
27412811
Where cond is False, keep the original value. Where True, replace

0 commit comments

Comments
 (0)
Failed to load comments.