
Commit ec10c4a

Authored Oct 17, 2023

feat: Support external packages in remote_function (#98)

* feat: Support external packages in `remote_function`
* Update code sample demonstrating external packages for `remote_function`
* GCF customization for hackathon

1 parent 752b01f · commit ec10c4a
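
The new `packages` argument is threaded from `bigframes.pandas.remote_function` through `Session.remote_function` down to the provisioning code, and each entry is written into the deployed Cloud Function's `requirements.txt`. A minimal usage sketch of the user-facing API (the connection name is illustrative, and the function mirrors the new system test below):

import bigframes.pandas as bpd

# `packages` entries are copied into requirements.txt as-is, so pip-style
# specifiers such as "pandas >= 2.0.0" are allowed.
@bpd.remote_function(
    [int],
    float,
    bigquery_connection="bigframes-rf-conn",  # illustrative, pre-created connection
    reuse=False,
    packages=["numpy", "pandas >= 2.0.0"],
)
def pd_np_sum(x):
    import numpy as np
    import pandas as pd

    return pd.Series([x, np.sqrt(np.abs(x))]).sum()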

File tree (5 files changed: +103 -19 lines)

* bigframes/pandas/__init__.py (+2)
* bigframes/remote_function.py (+37 -12)
* bigframes/session.py (+7 -1)
* samples/snippets/remote_function.py (+12 -6)
* tests/system/large/test_remote_function.py (+45)

bigframes/pandas/__init__.py (+2)
@@ -400,6 +400,7 @@ def remote_function(
     bigquery_connection: Optional[str] = None,
     reuse: bool = True,
     name: Optional[str] = None,
+    packages: Optional[Sequence[str]] = None,
 ):
     return global_session.with_default_session(
         bigframes.session.Session.remote_function,
@@ -409,6 +410,7 @@ def remote_function(
         bigquery_connection=bigquery_connection,
         reuse=reuse,
         name=name,
+        packages=packages,
     )

bigframes/remote_function.py (+37 -12)
@@ -100,9 +100,12 @@ def get_remote_function_locations(bq_location):
     return bq_location, cloud_function_region


-def _get_hash(def_):
+def _get_hash(def_, package_requirements=None):
     "Get hash (32 digits alphanumeric) of a function."
     def_repr = cloudpickle.dumps(def_, protocol=_pickle_protocol_version)
+    if package_requirements:
+        for p in sorted(package_requirements):
+            def_repr += p.encode()
     return hashlib.md5(def_repr).hexdigest()


@@ -129,18 +132,18 @@ class IbisSignature(NamedTuple):
     output_type: IbisDataType


-def get_cloud_function_name(def_, uniq_suffix=None):
+def get_cloud_function_name(def_, uniq_suffix=None, package_requirements=None):
     "Get a name for the cloud function for the given user defined function."
-    cf_name = _get_hash(def_)
+    cf_name = _get_hash(def_, package_requirements)
     cf_name = f"bigframes-{cf_name}"  # for identification
     if uniq_suffix:
         cf_name = f"{cf_name}-{uniq_suffix}"
     return cf_name


-def get_remote_function_name(def_, uniq_suffix=None):
+def get_remote_function_name(def_, uniq_suffix=None, package_requirements=None):
     "Get a name for the BQ remote function for the given user defined function."
-    bq_rf_name = _get_hash(def_)
+    bq_rf_name = _get_hash(def_, package_requirements)
     bq_rf_name = f"bigframes_{bq_rf_name}"  # for identification
     if uniq_suffix:
         bq_rf_name = f"{bq_rf_name}_{uniq_suffix}"
@@ -200,7 +203,8 @@ def create_bq_remote_function(
             RETURNS {bq_function_return_type}
             REMOTE WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{self._bq_connection_id}`
             OPTIONS (
-              endpoint = "{endpoint}"
+              endpoint = "{endpoint}",
+              max_batching_rows = 1000
             )"""

         logger.info(f"Creating BQ remote function: {create_function_ddl}")
@@ -320,11 +324,14 @@ def {handler_func_name}(request):

         return handler_func_name

-    def generate_cloud_function_code(self, def_, dir):
+    def generate_cloud_function_code(self, def_, dir, package_requirements=None):
         """Generate the cloud function code for a given user defined function."""

         # requirements.txt
         requirements = ["cloudpickle >= 2.1.0"]
+        if package_requirements:
+            requirements.extend(package_requirements)
+        requirements = sorted(requirements)
         requirements_txt = os.path.join(dir, "requirements.txt")
         with open(requirements_txt, "w") as f:
             f.write("\n".join(requirements))
@@ -333,12 +340,14 @@ def generate_cloud_function_code(self, def_, dir):
         entry_point = self.generate_cloud_function_main_code(def_, dir)
         return entry_point

-    def create_cloud_function(self, def_, cf_name):
+    def create_cloud_function(self, def_, cf_name, package_requirements=None):
         """Create a cloud function from the given user defined function."""

         # Build and deploy folder structure containing cloud function
         with tempfile.TemporaryDirectory() as dir:
-            entry_point = self.generate_cloud_function_code(def_, dir)
+            entry_point = self.generate_cloud_function_code(
+                def_, dir, package_requirements
+            )
             archive_path = shutil.make_archive(dir, "zip", dir)

             # We are creating cloud function source code from the currently running
@@ -392,6 +401,9 @@ def create_cloud_function(self, def_, cf_name):
             function.build_config.source.storage_source.object_ = (
                 upload_url_response.storage_source.object_
             )
+            function.service_config = functions_v2.ServiceConfig()
+            function.service_config.available_memory = "1024M"
+            function.service_config.timeout_seconds = 600
             create_function_request.function = function

             # Create the cloud function and wait for it to be ready to use
@@ -422,6 +434,7 @@ def provision_bq_remote_function(
         output_type,
         reuse,
         name,
+        package_requirements,
     ):
         """Provision a BigQuery remote function."""
         # If reuse of any existing function with the same name (indicated by the
@@ -435,19 +448,25 @@ def provision_bq_remote_function(

         # Derive the name of the cloud function underlying the intended BQ
         # remote function
-        cloud_function_name = get_cloud_function_name(def_, uniq_suffix)
+        cloud_function_name = get_cloud_function_name(
+            def_, uniq_suffix, package_requirements
+        )
         cf_endpoint = self.get_cloud_function_endpoint(cloud_function_name)

         # Create the cloud function if it does not exist
         if not cf_endpoint:
-            cf_endpoint = self.create_cloud_function(def_, cloud_function_name)
+            cf_endpoint = self.create_cloud_function(
+                def_, cloud_function_name, package_requirements
+            )
         else:
             logger.info(f"Cloud function {cloud_function_name} already exists.")

         # Derive the name of the remote function
         remote_function_name = name
         if not remote_function_name:
-            remote_function_name = get_remote_function_name(def_, uniq_suffix)
+            remote_function_name = get_remote_function_name(
+                def_, uniq_suffix, package_requirements
+            )
         rf_endpoint, rf_conn = self.get_remote_function_specs(remote_function_name)

         # Create the BQ remote function in following circumstances:
@@ -619,6 +638,7 @@ def remote_function(
     bigquery_connection: Optional[str] = None,
     reuse: bool = True,
     name: Optional[str] = None,
+    packages: Optional[Sequence[str]] = None,
 ):
     """Decorator to turn a user defined function into a BigQuery remote function.

@@ -710,6 +730,10 @@ def remote_function(
             caution, because two users working in the same project and dataset
             could overwrite each other's remote functions if they use the same
             persistent name.
+        packages (str[], Optional):
+            Explicit name of the external package dependencies. Each dependency
+            is added to the `requirements.txt` as is, and can be of the form
+            supported in https://pip.pypa.io/en/stable/reference/requirements-file-format/.

     """
     import bigframes.pandas as bpd
@@ -821,6 +845,7 @@ def wrapper(f):
             ibis_signature.output_type,
             reuse,
             name,
+            packages,
         )

         node = remote_function_node(dataset_ref.routine(rf_name), ibis_signature)
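
Two consequences of the changes above are worth spelling out: the package list is folded into the MD5 hash that names the Cloud Function and BQ routine (so the same function deployed with a different dependency set does not collide with, or silently reuse, an earlier deployment), and the list is merged with the baseline `cloudpickle` pin into a sorted `requirements.txt`. A standalone sketch of that logic, not bigframes code, using an illustrative function and package list:

import hashlib

import cloudpickle


def sketch_get_hash(def_, package_requirements=None):
    # Mirrors _get_hash (minus the explicit pickle protocol): the sorted
    # requirement strings are appended to the pickled function before
    # hashing, so a different package list yields a different name hash.
    def_repr = cloudpickle.dumps(def_)
    if package_requirements:
        for p in sorted(package_requirements):
            def_repr += p.encode()
    return hashlib.md5(def_repr).hexdigest()


def sketch_requirements_txt(package_requirements=None):
    # Mirrors generate_cloud_function_code: "cloudpickle >= 2.1.0" is always
    # present, user entries are appended verbatim, and the result is sorted.
    requirements = ["cloudpickle >= 2.1.0"]
    if package_requirements:
        requirements.extend(package_requirements)
    return "\n".join(sorted(requirements))


def my_func(x):  # illustrative user-defined function
    return x + 1


hash_plain = sketch_get_hash(my_func)
hash_with_pkgs = sketch_get_hash(my_func, ["numpy", "pandas >= 2.0.0"])
assert hash_plain != hash_with_pkgs  # different dependency sets, different names

print(sketch_requirements_txt(["numpy", "pandas >= 2.0.0"]))
# cloudpickle >= 2.1.0
# numpy
# pandas >= 2.0.0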

bigframes/session.py (+7 -1)
@@ -1413,6 +1413,7 @@ def remote_function(
         bigquery_connection: Optional[str] = None,
         reuse: bool = True,
         name: Optional[str] = None,
+        packages: Optional[Sequence[str]] = None,
     ):
         """Decorator to turn a user defined function into a BigQuery remote function. Check out
         the code samples at: https://cloud.google.com/bigquery/docs/remote-functions#bigquery-dataframes.
@@ -1467,7 +1468,7 @@ def remote_function(
                 Name of the BigQuery connection. You should either have the
                 connection already created in the `location` you have chosen, or
                 you should have the Project IAM Admin role to enable the service
-                to create the connection for you if you need it.If this parameter is
+                to create the connection for you if you need it. If this parameter is
                 not provided then the BigQuery connection from the session is used.
             reuse (bool, Optional):
                 Reuse the remote function if already exists.
@@ -1482,6 +1483,10 @@ def remote_function(
                 caution, because two users working in the same project and dataset
                 could overwrite each other's remote functions if they use the same
                 persistent name.
+            packages (str[], Optional):
+                Explicit name of the external package dependencies. Each dependency
+                is added to the `requirements.txt` as is, and can be of the form
+                supported in https://pip.pypa.io/en/stable/reference/requirements-file-format/.
         Returns:
             callable: A remote function object pointing to the cloud assets created
             in the background to support the remote execution. The cloud assets can be
@@ -1499,6 +1504,7 @@ def remote_function(
             bigquery_connection=bigquery_connection,
             reuse=reuse,
             name=name,
+            packages=packages,
         )

     def read_gbq_function(
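
Since the docstring defers to the pip requirements-file format, each `packages` entry may carry a version specifier or other pip-supported syntax. A few illustrative entries (names and pins chosen for illustration only):

# Each string below would be written to the generated requirements.txt verbatim.
packages = [
    "cryptography",        # no constraint: latest available version
    "pandas >= 2.0.0",     # minimum-version constraint
    "numpy == 1.26.4",     # exact pin
]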

samples/snippets/remote_function.py (+12 -6)
@@ -89,19 +89,25 @@ def get_bucket(num):
     # say we consider the `species`, `island` and `sex` of the penguins
     # sensitive information and want to redact that by replacing with their hash
     # code instead. Let's define another scalar custom function and decorate it
-    # as a remote function
+    # as a remote function. The custom function in this example has external
+    # package dependency, which can be specified via `packages` parameter.
     @bpd.remote_function(
-        [str], str, bigquery_connection="bigframes-rf-conn", reuse=False
+        [str],
+        str,
+        bigquery_connection="bigframes-rf-conn",
+        reuse=False,
+        packages=["cryptography"],
     )
     def get_hash(input):
-        import hashlib
+        from cryptography.fernet import Fernet

         # handle missing value
         if input is None:
             input = ""
-        encoded_input = input.encode()
-        hash = hashlib.md5(encoded_input)
-        return hash.hexdigest()
+
+        key = Fernet.generate_key()
+        f = Fernet(key)
+        return f.encrypt(input.encode()).decode()

     # We can use this remote function in another `pandas`-like API `map` that
     # can be applied on a DataFrame
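
The sample's closing comment refers to applying `get_hash` through the DataFrame `map` API; a short sketch of that follow-on usage, assuming `df` is the penguins DataFrame loaded earlier in the sample:

# Apply the remote function element-wise over the sensitive columns;
# `df` is assumed to be the penguins DataFrame from earlier in the sample.
df_redacted = df[["species", "island", "sex"]].map(get_hash)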

tests/system/large/test_remote_function.py (+45)
@@ -916,6 +916,51 @@ def square(x):
         )


+@pytest.mark.flaky(retries=2, delay=120)
+def test_remote_function_with_external_package_dependencies(
+    session, scalars_dfs, dataset_id, bq_cf_connection, functions_client
+):
+    try:
+
+        def pd_np_foo(x):
+            import numpy as mynp
+            import pandas as mypd
+
+            return mypd.Series([x, mynp.sqrt(mynp.abs(x))]).sum()
+
+        # Create the remote function with the name provided explicitly
+        pd_np_foo_remote = session.remote_function(
+            [int],
+            float,
+            dataset_id,
+            bq_cf_connection,
+            reuse=False,
+            packages=["numpy", "pandas >= 2.0.0"],
+        )(pd_np_foo)
+
+        # The behavior of the created remote function should be as expected
+        scalars_df, scalars_pandas_df = scalars_dfs
+
+        bf_int64_col = scalars_df["int64_too"]
+        bf_result_col = bf_int64_col.apply(pd_np_foo_remote)
+        bf_result = bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas()
+
+        pd_int64_col = scalars_pandas_df["int64_too"]
+        pd_result_col = pd_int64_col.apply(pd_np_foo)
+        pd_result = pd_int64_col.to_frame().assign(result=pd_result_col)
+
+        # pandas result is non-nullable type float64, make it Float64 before
+        # comparing for the purpose of this test
+        pd_result.result = pd_result.result.astype(pandas.Float64Dtype())
+
+        assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+    finally:
+        # clean up the gcp assets created for the remote function
+        cleanup_remote_function_assets(
+            session.bqclient, functions_client, pd_np_foo_remote
+        )
+
+
 @pytest.mark.flaky(retries=2, delay=120)
 def test_remote_function_with_explicit_name_reuse(
     session, scalars_dfs, dataset_id, bq_cf_connection, functions_client
