Comparing changes

  • base repository: googleapis/python-bigquery-dataframes, base: v1.18.0
  • head repository: googleapis/python-bigquery-dataframes, compare: v1.19.0
  • 14 commits
  • 33 files changed
  • 8 contributors

Commits on Sep 18, 2024

  1. test: move Claude3 tests to load test (#997)

    * test: move Claude3 tests to load test
    
    * add conftest
    GarrettWu authored Sep 18, 2024 · ccd3c03
  2. 4221632

Commits on Sep 19, 2024

  1. feat: Support bool and bytes types in describe(include='all') (#994)

    * feat: Support bool and bytes types in describe(include='all')
    
    * update aggregation unit tests
    
    * fix typo and remove unnecessary helper
    
    * remove unnecessary dep
    
    * fix wording
    sycai authored Sep 19, 2024 · cc48f58

Commits on Sep 20, 2024

  1. test: disable deprecated embedding model tests (#1006)

    * test: disable deprecated embedding models
    
    * delete tests
    GarrettWu authored Sep 20, 2024 · 8520873
  2. fix: fix miscasting issues with case_when (#1003) · 038139d

Commits on Sep 21, 2024

  1. bb04742

Commits on Sep 23, 2024

  1. deps: update ibis version in prerelease tests (#1012)

    * deps: update ibis version in prerelease tests
    
    * exclude remote function tests from prerelease
    shobsi authored Sep 23, 2024 · f89785f
  2. test: ensure all remote_function APIs work in partial ordering mode (#1000)
    
    * test: ensure all `remote_function` APIs work in partial ordering mode
    
    * remove force_reproject from more APIs
    
    * 🦉 Updates from OwlBot post-processor
    
    See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
    
    * temporarily skip multiindex test for axis=1
    
    ---------
    
    Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
    shobsi and gcf-owl-bot[bot] authored Sep 23, 2024 · 6b34244
  3. feat: add ml.model_selection.KFold class (#1001) · 952cab9
  4. chore: update benchmark readme for usage reference. (#1009)

    * chore: update benchmark readme for usage reference.
    
    * add s
    
    * update format
    Genesis929 authored Sep 23, 2024 · 7bcec88
  5. perf: join op discards child ordering in unordered mode (#923) · 1b5b0ee
  6. chore: remove pandas version limit from test `test_getitem_w_struct_array` (#1014)
    
    * chore: remove version limit from test
    
    * remove redundant import
    sycai authored Sep 23, 2024 · 35b458e

Commits on Sep 24, 2024

  1. feat: support ingress settings in remote_function (#1011)

    * feat: support ingress settings in `remote_function`
    
    * 🦉 Updates from OwlBot post-processor
    
    See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
    
    * propagate new param to bigframes.pandas module
    
    * fix mypy error
    
    ---------
    
    Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
    shobsi and gcf-owl-bot[bot] authored Sep 24, 2024 · 8e9919b
  2. chore(main): release 1.19.0 (#1002)

    Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com>
    release-please[bot] authored Sep 24, 2024 · a95493d
24 changes: 24 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,30 @@

[1]: https://pypi.org/project/bigframes/#history

## [1.19.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.18.0...v1.19.0) (2024-09-24)


### Features

* Add ml.model_selection.KFold class ([#1001](https://github.com/googleapis/python-bigquery-dataframes/issues/1001)) ([952cab9](https://github.com/googleapis/python-bigquery-dataframes/commit/952cab92e548b70d077b20bf10f5307751d2ae76))
* Support bool and bytes types in `describe(include='all')` ([#994](https://github.com/googleapis/python-bigquery-dataframes/issues/994)) ([cc48f58](https://github.com/googleapis/python-bigquery-dataframes/commit/cc48f58cbd94f8110ee863eb57d3fe8dc5a17778))
* Support ingress settings in `remote_function` ([#1011](https://github.com/googleapis/python-bigquery-dataframes/issues/1011)) ([8e9919b](https://github.com/googleapis/python-bigquery-dataframes/commit/8e9919b53899b6951a10d02643d1d0e53e15665f))


### Bug Fixes

* Fix miscasting issues with case_when ([#1003](https://github.com/googleapis/python-bigquery-dataframes/issues/1003)) ([038139d](https://github.com/googleapis/python-bigquery-dataframes/commit/038139dfa4fa89167c52c1cb559c2eb5fe2f0411))


### Performance Improvements

* Join op discards child ordering in unordered mode ([#923](https://github.com/googleapis/python-bigquery-dataframes/issues/923)) ([1b5b0ee](https://github.com/googleapis/python-bigquery-dataframes/commit/1b5b0eea92631b7dd1b688cf1da617fc7ce862dc))


### Dependencies

* Update ibis version in prerelease tests ([#1012](https://github.com/googleapis/python-bigquery-dataframes/issues/1012)) ([f89785f](https://github.com/googleapis/python-bigquery-dataframes/commit/f89785fcfc51c541253ca8c1e8baf80fbfaea3b6))

## [1.18.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.17.0...v1.18.0) (2024-09-18)


4 changes: 4 additions & 0 deletions bigframes/core/compile/compiled.py
@@ -441,6 +441,10 @@ def explode(self, offsets: typing.Sequence[int]) -> UnorderedIR:
columns=columns,
)

def as_ordered_ir(self) -> OrderedIR:
"""Convert to OrderedIr, but without any definite ordering."""
return OrderedIR(self._table, self._columns, predicates=self._predicates)

## Helpers
def _set_or_replace_by_id(
self, id: str, new_value: ibis_types.Value
29 changes: 21 additions & 8 deletions bigframes/core/compile/compiler.py
@@ -76,14 +76,27 @@ def _compile_node(
@_compile_node.register
def compile_join(self, node: nodes.JoinNode, ordered: bool = True):
if ordered:
left_ordered = self.compile_ordered_ir(node.left_child)
right_ordered = self.compile_ordered_ir(node.right_child)
return bigframes.core.compile.single_column.join_by_column_ordered(
left=left_ordered,
right=right_ordered,
type=node.type,
conditions=node.conditions,
)
# In general, joins are an ordering destroying operation.
# With ordering_mode = "partial", make this explicit. In
# this case, we don't need to provide a deterministic ordering.
if self.strict:
left_ordered = self.compile_ordered_ir(node.left_child)
right_ordered = self.compile_ordered_ir(node.right_child)
return bigframes.core.compile.single_column.join_by_column_ordered(
left=left_ordered,
right=right_ordered,
type=node.type,
conditions=node.conditions,
)
else:
left_unordered = self.compile_unordered_ir(node.left_child)
right_unordered = self.compile_unordered_ir(node.right_child)
return bigframes.core.compile.single_column.join_by_column_unordered(
left=left_unordered,
right=right_unordered,
type=node.type,
conditions=node.conditions,
).as_ordered_ir()
else:
left_unordered = self.compile_unordered_ir(node.left_child)
right_unordered = self.compile_unordered_ir(node.right_child)
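The dispatch in this diff can be sketched as a toy compiler. Everything below is a simplified stand-in (the node class, the string "IR", the flag plumbing are invented for illustration); only the branch structure and the names `compile_join`, `ordered`, and `strict` mirror the real change:

```python
# Hypothetical sketch of the join-compilation branches above. In partial
# ordering mode (strict=False), a join destroys ordering anyway, so the
# compiler takes the cheaper unordered path and wraps the result as an
# OrderedIR with no definite ordering (as_ordered_ir in the diff).
from dataclasses import dataclass


@dataclass
class JoinNode:
    left: str
    right: str


def compile_join(node: JoinNode, *, ordered: bool, strict: bool) -> str:
    if ordered and strict:
        # Strict mode must carry a deterministic ordering through the join.
        return f"ordered_join({node.left}, {node.right})"
    if ordered:
        # Partial ordering mode: compile unordered, then re-wrap.
        return f"as_ordered_ir(unordered_join({node.left}, {node.right}))"
    return f"unordered_join({node.left}, {node.right})"
```

The design point is that the ordered/unordered decision is no longer driven only by the caller's `ordered` flag but also by the session's ordering mode.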
9 changes: 7 additions & 2 deletions bigframes/core/expression.py
@@ -25,7 +25,9 @@
import bigframes.operations.aggregations as agg_ops


def const(value: typing.Hashable, dtype: dtypes.ExpressionType = None) -> Expression:
def const(
value: typing.Hashable, dtype: dtypes.ExpressionType = None
) -> ScalarConstantExpression:
return ScalarConstantExpression(value, dtype or dtypes.infer_literal_type(value))


@@ -141,6 +143,9 @@ class ScalarConstantExpression(Expression):
def is_const(self) -> bool:
return True

def rename(self, name_mapping: Mapping[str, str]) -> ScalarConstantExpression:
return self

def output_type(
self, input_types: dict[str, bigframes.dtypes.Dtype]
) -> dtypes.ExpressionType:
@@ -167,7 +172,7 @@ class UnboundVariableExpression(Expression):
def unbound_variables(self) -> typing.Tuple[str, ...]:
return (self.id,)

def rename(self, name_mapping: Mapping[str, str]) -> Expression:
def rename(self, name_mapping: Mapping[str, str]) -> UnboundVariableExpression:
if self.id in name_mapping:
return UnboundVariableExpression(name_mapping[self.id])
else:
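The return-type narrowing in this diff is a standard pattern for immutable expression trees: a constant contains no variables, so `rename` can return `self`, while an unbound variable substitutes through the mapping. A minimal self-contained sketch (the dataclass shapes are simplified stand-ins, not the real bigframes classes):

```python
# Sketch of the rename() narrowing: each subclass returns its own type,
# so callers keep precise types without casts.
from __future__ import annotations

from dataclasses import dataclass
from typing import Mapping


@dataclass(frozen=True)
class ScalarConstantExpression:
    value: object

    def rename(self, name_mapping: Mapping[str, str]) -> ScalarConstantExpression:
        # A constant references no variables; renaming is a no-op.
        return self


@dataclass(frozen=True)
class UnboundVariableExpression:
    id: str

    def rename(self, name_mapping: Mapping[str, str]) -> UnboundVariableExpression:
        if self.id in name_mapping:
            return UnboundVariableExpression(name_mapping[self.id])
        return self
```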
4 changes: 3 additions & 1 deletion bigframes/core/nodes.py
@@ -249,6 +249,7 @@ def order_ambiguous(self) -> bool:

@property
def explicitly_ordered(self) -> bool:
# Do not consider user pre-join ordering intent - they need to re-order post-join in unordered mode.
return False

def __hash__(self):
@@ -307,7 +308,8 @@ def order_ambiguous(self) -> bool:

@property
def explicitly_ordered(self) -> bool:
return all(child.explicitly_ordered for child in self.children)
# Consider concat as an ordered operations (even though input frames may not be ordered)
return True

def __hash__(self):
return self._node_hash
62 changes: 28 additions & 34 deletions bigframes/dataframe.py
@@ -2303,7 +2303,7 @@ def melt(
self._block.melt(id_col_ids, val_col_ids, var_name, value_name)
)

_NUMERICAL_DISCRIBE_AGGS = (
_NUMERIC_DESCRIBE_AGGS = (
"count",
"mean",
"std",
@@ -2313,41 +2313,53 @@ def melt(
"75%",
"max",
)
_NON_NUMERICAL_DESCRIBE_AGGS = ("count", "nunique")
_NON_NUMERIC_DESCRIBE_AGGS = ("count", "nunique")

def describe(self, include: None | Literal["all"] = None) -> DataFrame:

allowed_non_numeric_types = {
bigframes.dtypes.STRING_DTYPE,
bigframes.dtypes.BOOL_DTYPE,
bigframes.dtypes.BYTES_DTYPE,
}

if include is None:
numeric_df = self._drop_non_numeric(permissive=False)
if len(numeric_df.columns) == 0:
# Describe eligible non-numerical columns
result = self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS)
# Describe eligible non-numeric columns
result = self.select_dtypes(include=allowed_non_numeric_types).agg(
self._NON_NUMERIC_DESCRIBE_AGGS
)
else:
# Otherwise, only describe numerical columns
result = numeric_df.agg(self._NUMERICAL_DISCRIBE_AGGS)
# Otherwise, only describe numeric columns
result = numeric_df.agg(self._NUMERIC_DESCRIBE_AGGS)
return typing.cast(DataFrame, result)

elif include == "all":
numeric_result = typing.cast(
DataFrame,
self._drop_non_numeric(permissive=False).agg(
self._NUMERICAL_DISCRIBE_AGGS
self._NUMERIC_DESCRIBE_AGGS
),
)
string_result = typing.cast(

non_numeric_result = typing.cast(
DataFrame,
self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS),
self.select_dtypes(include=allowed_non_numeric_types).agg(
self._NON_NUMERIC_DESCRIBE_AGGS
),
)

if len(numeric_result.columns) == 0:
return string_result
elif len(string_result.columns) == 0:
return non_numeric_result
elif len(non_numeric_result.columns) == 0:
return numeric_result
else:
import bigframes.core.reshape as rs

# Use reindex after join to preserve the original column order.
return rs.concat(
[numeric_result, string_result], axis=1
[non_numeric_result, numeric_result], axis=1
)._reindex_columns(self.columns)

else:
@@ -2549,26 +2561,18 @@ def unstack(self, level: LevelsType = -1):
return DataFrame(pivot_block)

def _drop_non_numeric(self, permissive=True) -> DataFrame:
numerical_types = (
numeric_types = (
set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE)
if permissive
else set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE)
)
non_numeric_cols = [
col_id
for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
if dtype not in numerical_types
if dtype not in numeric_types
]
return DataFrame(self._block.drop_columns(non_numeric_cols))

def _drop_non_string(self) -> DataFrame:
string_cols = [
col_id
for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
if dtype == bigframes.dtypes.STRING_DTYPE
]
return DataFrame(self._block.select_columns(string_cols))

def _drop_non_bool(self) -> DataFrame:
non_bool_cols = [
col_id
@@ -3469,11 +3473,7 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame:
raise ValueError(f"na_action={na_action} not supported")

# TODO(shobs): Support **kwargs
# Reproject as workaround to applying filter too late. This forces the
# filter to be applied before passing data to remote function,
# protecting from bad inputs causing errors.
reprojected_df = DataFrame(self._block._force_reproject())
return reprojected_df._apply_unary_op(
return self._apply_unary_op(
ops.RemoteFunctionOp(func=func, apply_on_null=(na_action is None))
)

@@ -3568,13 +3568,7 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs):
)

series_list = [self[col] for col in self.columns]
# Reproject as workaround to applying filter too late. This forces the
# filter to be applied before passing data to remote function,
# protecting from bad inputs causing errors.
reprojected_series = bigframes.series.Series(
series_list[0]._block._force_reproject()
)
result_series = reprojected_series._apply_nary_op(
result_series = series_list[0]._apply_nary_op(
ops.NaryRemoteFunctionOp(func=func), series_list[1:]
)
result_series.name = None
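The `describe(include='all')` flow in the diff above splits columns into numeric and eligible non-numeric sets, aggregates each separately, then reindexes after concat to restore the caller's column order. A toy sketch with plain dicts (not bigframes; the dtype names and the helper are invented for illustration):

```python
# Toy model of the split-and-recombine logic: partition columns by dtype,
# then restore the original column order, mirroring the reindex-after-concat
# step in describe(include='all').
ALLOWED_NON_NUMERIC = {"string", "bool", "bytes"}
NUMERIC = {"int", "float"}


def describe_all_columns(columns: dict[str, str]) -> list[str]:
    """columns maps column name -> dtype; returns describable columns in input order."""
    numeric = [c for c, t in columns.items() if t in NUMERIC]
    non_numeric = [c for c, t in columns.items() if t in ALLOWED_NON_NUMERIC]
    # concat would place the non-numeric block first; reindexing by the
    # original column list restores the user's ordering.
    combined = set(non_numeric + numeric)
    return [c for c in columns if c in combined]
```

Note how `bool` and `bytes` now join `string` in the eligible non-numeric set, which is the feature added by #994.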
23 changes: 23 additions & 0 deletions bigframes/functions/_remote_function_client.py
@@ -23,6 +23,7 @@
import string
import sys
import tempfile
import types
from typing import cast, Tuple, TYPE_CHECKING

from bigframes_vendored import constants
@@ -43,6 +44,15 @@

logger = logging.getLogger(__name__)

# https://cloud.google.com/sdk/gcloud/reference/functions/deploy#--ingress-settings
_INGRESS_SETTINGS_MAP = types.MappingProxyType(
{
"all": functions_v2.ServiceConfig.IngressSettings.ALLOW_ALL,
"internal-only": functions_v2.ServiceConfig.IngressSettings.ALLOW_INTERNAL_ONLY,
"internal-and-gclb": functions_v2.ServiceConfig.IngressSettings.ALLOW_INTERNAL_AND_GCLB,
}
)


class RemoteFunctionClient:
# Wait time (in seconds) for an IAM binding to take effect after creation
@@ -228,6 +238,7 @@ def create_cloud_function(
is_row_processor=False,
vpc_connector=None,
memory_mib=1024,
ingress_settings="all",
):
"""Create a cloud function from the given user defined function.
@@ -324,6 +335,16 @@ def create_cloud_function(
function.service_config.service_account_email = (
self._cloud_function_service_account
)
if ingress_settings not in _INGRESS_SETTINGS_MAP:
raise ValueError(
"'{}' not one of the supported ingress settings values: {}".format(
ingress_settings, list(_INGRESS_SETTINGS_MAP)
)
)
function.service_config.ingress_settings = cast(
functions_v2.ServiceConfig.IngressSettings,
_INGRESS_SETTINGS_MAP[ingress_settings],
)
function.kms_key_name = self._cloud_function_kms_key_name
create_function_request.function = function

@@ -372,6 +393,7 @@ def provision_bq_remote_function(
is_row_processor,
cloud_function_vpc_connector,
cloud_function_memory_mib,
cloud_function_ingress_settings,
):
"""Provision a BigQuery remote function."""
# Augment user package requirements with any internal package
@@ -418,6 +440,7 @@ def provision_bq_remote_function(
is_row_processor=is_row_processor,
vpc_connector=cloud_function_vpc_connector,
memory_mib=cloud_function_memory_mib,
ingress_settings=cloud_function_ingress_settings,
)
else:
logger.info(f"Cloud function {cloud_function_name} already exists.")
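The validation added above maps user-facing strings to enum members through a read-only `types.MappingProxyType`. A stdlib-only sketch of the same pattern; the `IngressSettings` enum here is a stand-in for `functions_v2.ServiceConfig.IngressSettings`, and `resolve_ingress_settings` is a hypothetical helper name:

```python
# Read-only mapping from user-facing values to enum members, with the same
# ValueError shape as the diff above. MappingProxyType prevents callers from
# mutating the accepted-values table at runtime.
import enum
import types


class IngressSettings(enum.Enum):  # stand-in for the Cloud Functions enum
    ALLOW_ALL = 1
    ALLOW_INTERNAL_ONLY = 2
    ALLOW_INTERNAL_AND_GCLB = 3


_INGRESS_SETTINGS_MAP = types.MappingProxyType(
    {
        "all": IngressSettings.ALLOW_ALL,
        "internal-only": IngressSettings.ALLOW_INTERNAL_ONLY,
        "internal-and-gclb": IngressSettings.ALLOW_INTERNAL_AND_GCLB,
    }
)


def resolve_ingress_settings(value: str) -> IngressSettings:
    if value not in _INGRESS_SETTINGS_MAP:
        raise ValueError(
            "'{}' not one of the supported ingress settings values: {}".format(
                value, list(_INGRESS_SETTINGS_MAP)
            )
        )
    return _INGRESS_SETTINGS_MAP[value]
```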
21 changes: 20 additions & 1 deletion bigframes/functions/_remote_function_session.py
@@ -19,7 +19,17 @@
import inspect
import sys
import threading
from typing import Any, cast, Dict, Mapping, Optional, Sequence, TYPE_CHECKING, Union
from typing import (
Any,
cast,
Dict,
Literal,
Mapping,
Optional,
Sequence,
TYPE_CHECKING,
Union,
)
import warnings

import bigframes_vendored.constants as constants
@@ -110,6 +120,9 @@ def remote_function(
cloud_function_max_instances: Optional[int] = None,
cloud_function_vpc_connector: Optional[str] = None,
cloud_function_memory_mib: Optional[int] = 1024,
cloud_function_ingress_settings: Literal[
"all", "internal-only", "internal-and-gclb"
] = "all",
):
"""Decorator to turn a user defined function into a BigQuery remote function.
@@ -280,6 +293,11 @@ def remote_function(
default memory of cloud functions be allocated, pass `None`. See
for more details
https://cloud.google.com/functions/docs/configuring/memory.
cloud_function_ingress_settings (str, Optional):
Ingress settings controls dictating what traffic can reach the
function. By default `all` will be used. It must be one of:
`all`, `internal-only`, `internal-and-gclb`. See for more details
https://cloud.google.com/functions/docs/networking/network-settings#ingress_settings.
"""
# Some defaults may be used from the session if not provided otherwise
import bigframes.exceptions as bf_exceptions
@@ -504,6 +522,7 @@ def try_delattr(attr):
is_row_processor=is_row_processor,
cloud_function_vpc_connector=cloud_function_vpc_connector,
cloud_function_memory_mib=cloud_function_memory_mib,
cloud_function_ingress_settings=cloud_function_ingress_settings,
)

# TODO(shobs): Find a better way to support udfs with param named "name".
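The new parameter is typed as `typing.Literal["all", "internal-only", "internal-and-gclb"]`, which static checkers enforce; a runtime check can be derived from the same annotation via `typing.get_args` so the accepted values never drift from the type. A hedged sketch (the function below is hypothetical, not part of the diff):

```python
# Derive the runtime allow-list from the Literal annotation itself, so the
# type hint and the validation cannot disagree.
from typing import Literal, get_args

IngressLiteral = Literal["all", "internal-only", "internal-and-gclb"]


def check_ingress(cloud_function_ingress_settings: IngressLiteral = "all") -> str:
    allowed = get_args(IngressLiteral)
    if cloud_function_ingress_settings not in allowed:
        raise ValueError(f"must be one of {allowed}")
    return cloud_function_ingress_settings
```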