googleapis · Jan 3, 2025
diff --git a/‎bigframes/_config/compute_options.py
+10 b/‎bigframes/_config/compute_options.py
+10
diff --git a/‎bigframes/exceptions.py
+4 b/‎bigframes/exceptions.py
+4
diff --git a/‎bigframes/operations/semantics.py
+62-18 b/‎bigframes/operations/semantics.py
+62-18
@@ -66,13 +66,23 @@ class ComputeOptions:
             engine to handle. However this comes at the cost of increase cost and latency.
         extra_query_labels (Dict[str, Any], Options):
             Stores additional custom labels for query configuration.
+        semantic_ops_confirmation_threshold (int, optional):
+            Guards against unexpected processing of a large number of rows by semantic operators.
+            If the number of rows exceeds the threshold, the user will be asked to confirm
+            the operation before it proceeds. The default value is 0. Set the value to None
+            to turn off the guard.
+        semantic_ops_threshold_autofail (bool):
+            Guards against unexpected processing of a large number of rows by semantic operators.
+            When set to True, an operation that exceeds the threshold fails automatically without asking for user input.
     """
 
     maximum_bytes_billed: Optional[int] = None
     enable_multi_query_execution: bool = False
     extra_query_labels: Dict[str, Any] = dataclasses.field(
         default_factory=dict, init=False
     )
+    # Row-count threshold above which semantic operators require confirmation;
+    # None turns the guard off.
+    semantic_ops_confirmation_threshold: Optional[int] = 0
+    # Annotated like its siblings: if this class is a dataclass (it uses
+    # dataclasses.field above), an un-annotated name would be a plain class
+    # attribute rather than a field.
+    semantic_ops_threshold_autofail: bool = False
 
     def assign_extra_query_labels(self, **kwargs: Any) -> None:
         """
 
@@ -59,6 +59,10 @@ class QueryComplexityError(RuntimeError):
     """Query plan is too complex to execute."""
 
 
+class OperationAbortedError(RuntimeError):
+    """Operation was aborted before it ran, e.g. a confirmation check failed or was declined."""
+
+
 class TimeTravelDisabledWarning(Warning):
     """A query was reattempted without time travel."""
 
 
@@ -20,8 +20,8 @@
 
 import numpy as np
 
-import bigframes.core.guid as guid
-import bigframes.dtypes as dtypes
+from bigframes import dtypes, exceptions
+from bigframes.core import guid
 
 
 class Semantics:
@@ -53,6 +53,7 @@ def agg(
             >>> import bigframes.pandas as bpd
             >>> bpd.options.display.progress_bar = None
             >>> bpd.options.experiments.semantic_operators = True
+            >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
 
             >>> import bigframes.ml.llm as llm
             >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
@@ -115,6 +116,15 @@ def agg(
         self._validate_model(model)
         columns = self._parse_columns(instruction)
 
+        if max_agg_rows <= 1:
+            raise ValueError(
+                f"Invalid value for `max_agg_rows`: {max_agg_rows}."
+                "It must be greater than 1."
+            )
+
+        # Scale the row count by max_agg_rows / (max_agg_rows - 1) and truncate only
+        # the final product. Truncating the ratio first collapses it to 1 for any
+        # max_agg_rows > 2, which reduces the estimate to the plain row count.
+        work_estimate = int(len(self._df) * max_agg_rows / (max_agg_rows - 1))
+        self._confirm_operation(work_estimate)
+
         df: bigframes.dataframe.DataFrame = self._df.copy()
         for column in columns:
             if column not in self._df.columns:
@@ -135,12 +145,6 @@ def agg(
                 "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
             )
 
-        if max_agg_rows <= 1:
-            raise ValueError(
-                f"Invalid value for `max_agg_rows`: {max_agg_rows}."
-                "It must be greater than 1."
-            )
-
         user_instruction = self._format_instruction(instruction, columns)
 
         num_cluster = 1
@@ -243,6 +247,7 @@ def cluster_by(
             >>> import bigframes.pandas as bpd
             >>> bpd.options.display.progress_bar = None
             >>> bpd.options.experiments.semantic_operators = True
+            >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
 
             >>> import bigframes.ml.llm as llm
             >>> model = llm.TextEmbeddingGenerator()
@@ -296,6 +301,8 @@ def cluster_by(
                 "It must be greater than 1."
             )
 
+        self._confirm_operation(len(self._df))
+
         df: bigframes.dataframe.DataFrame = self._df.copy()
         embeddings_df = model.predict(df[column])
 
@@ -314,6 +321,7 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals
             >>> import bigframes.pandas as bpd
             >>> bpd.options.display.progress_bar = None
             >>> bpd.options.experiments.semantic_operators = True
+            >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
 
             >>> import bigframes.ml.llm as llm
             >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
@@ -367,6 +375,8 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals
                 "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
             )
 
+        self._confirm_operation(len(self._df))
+
         df: bigframes.dataframe.DataFrame = self._df[columns].copy()
         for column in columns:
             if df[column].dtype != dtypes.STRING_DTYPE:
@@ -403,6 +413,7 @@ def map(
             >>> import bigframes.pandas as bpd
             >>> bpd.options.display.progress_bar = None
             >>> bpd.options.experiments.semantic_operators = True
+            >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
 
             >>> import bigframes.ml.llm as llm
             >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
@@ -462,6 +473,8 @@ def map(
                 "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
             )
 
+        self._confirm_operation(len(self._df))
+
         df: bigframes.dataframe.DataFrame = self._df[columns].copy()
         for column in columns:
             if df[column].dtype != dtypes.STRING_DTYPE:
@@ -490,7 +503,6 @@ def join(
         other,
         instruction: str,
         model,
-        max_rows: int = 1000,
         ground_with_google_search: bool = False,
     ):
         """
@@ -502,6 +514,7 @@ def join(
             >>> import bigframes.pandas as bpd
             >>> bpd.options.display.progress_bar = None
             >>> bpd.options.experiments.semantic_operators = True
+            >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
 
             >>> import bigframes.ml.llm as llm
             >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
@@ -561,12 +574,8 @@ def join(
                 "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
             )
 
-        joined_table_rows = len(self._df) * len(other)
-
-        if joined_table_rows > max_rows:
-            raise ValueError(
-                f"Number of rows that need processing is {joined_table_rows}, which exceeds row limit {max_rows}."
-            )
+        work_estimate = len(self._df) * len(other)
+        self._confirm_operation(work_estimate)
 
         left_columns = []
         right_columns = []
@@ -645,6 +654,7 @@ def search(
 
             >>> import bigframes
             >>> bigframes.options.experiments.semantic_operators = True
+            >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
 
             >>> import bigframes.ml.llm as llm
             >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005")
@@ -680,6 +690,8 @@ def search(
         if search_column not in self._df.columns:
             raise ValueError(f"Column `{search_column}` not found")
 
+        self._confirm_operation(len(self._df))
+
         import bigframes.ml.llm as llm
 
         if not isinstance(model, llm.TextEmbeddingGenerator):
@@ -743,6 +755,7 @@ def top_k(
             >>> import bigframes.pandas as bpd
             >>> bpd.options.display.progress_bar = None
             >>> bpd.options.experiments.semantic_operators = True
+            >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
 
             >>> import bigframes.ml.llm as llm
             >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
@@ -803,6 +816,9 @@ def top_k(
                 "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
             )
 
+        work_estimate = int(len(self._df) * (len(self._df) - 1) / 2)
+        self._confirm_operation(work_estimate)
+
         df: bigframes.dataframe.DataFrame = self._df[columns].copy()
         column = columns[0]
         if df[column].dtype != dtypes.STRING_DTYPE:
@@ -940,9 +956,8 @@ def sim_join(
 
             >>> import bigframes.pandas as bpd
             >>> bpd.options.display.progress_bar = None
-
-            >>> import bigframes
-            >>> bigframes.options.experiments.semantic_operators = True
+            >>> bpd.options.experiments.semantic_operators = True
+            >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
 
             >>> import bigframes.ml.llm as llm
             >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005")
@@ -1001,6 +1016,9 @@ def sim_join(
         if top_k < 1:
             raise ValueError("top_k must be an integer greater than or equal to 1.")
 
+        work_estimate = len(self._df) * len(other)
+        self._confirm_operation(work_estimate)
+
         base_table_embedding_column = guid.generate_guid()
         base_table = self._attach_embedding(
             other, right_on, base_table_embedding_column, model
@@ -1072,3 +1090,29 @@ def _validate_model(model):
 
         if not isinstance(model, GeminiTextGenerator):
             raise TypeError("Model is not GeminiText Generator")
+
+    @staticmethod
+    def _confirm_operation(row_count: int) -> None:
+        """Ask for confirmation before processing a large number of rows.
+
+        Returns silently when the guard is disabled (the threshold is ``None``)
+        or when ``row_count`` does not exceed
+        ``bigframes.options.compute.semantic_ops_confirmation_threshold``.
+        Otherwise the operation either fails immediately (when
+        ``bigframes.options.compute.semantic_ops_threshold_autofail`` is set)
+        or the user is prompted on stdin. An empty reply, "y" or "yes"
+        (case-insensitive, surrounding whitespace ignored) lets the operation
+        proceed.
+
+        Args:
+            row_count (int):
+                Estimated number of rows the operation will process.
+
+        Raises:
+            bigframes.exceptions.OperationAbortedError:
+                If autofail is enabled and the estimate exceeds the threshold,
+                or if the user does not confirm.
+        """
+        import bigframes
+
+        compute_options = bigframes.options.compute
+        threshold = compute_options.semantic_ops_confirmation_threshold
+
+        # A threshold of None disables the guard entirely.
+        if threshold is None or row_count <= threshold:
+            return
+
+        if compute_options.semantic_ops_threshold_autofail:
+            raise exceptions.OperationAbortedError(
+                f"Operation was cancelled because your work estimate is {row_count} rows, which exceeds the threshold {threshold} rows."
+            )
+
+        # Separate the prompt out. In IDEs such as VS Code, leaving the prompt in
+        # the input function makes it less visible to the end user.
+        print(f"This operation will process about {row_count} rows.")
+        print(
+            "You can raise the confirmation threshold by setting `bigframes.options.compute.semantic_ops_confirmation_threshold` to a higher value. To completely turn off the confirmation check, set the threshold to `None`."
+        )
+        print("Proceed? [Y/n]")
+        # Strip before comparing so replies such as "y " or " yes" are accepted
+        # instead of aborting the operation.
+        reply = input().strip().casefold()
+        if reply not in {"y", "yes", ""}:
+            raise exceptions.OperationAbortedError("Operation was cancelled.")