Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 5ba4511

Browse files
sycai and gcf-owl-bot[bot] authored on Jan 3, 2025
feat: implement confirmation threshold for semantic operators (#1251)
* feat: implement confirmation threshold for semantic operators * fix format * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * add doc * raise exception when the user didn't confirm to proceed * fix prompt format * add sem ops autofail option * fix doc * use option_context to set options in tests * remove redundant code * fix tests * fix doctest --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent 7f8c972 commit 5ba4511

File tree

5 files changed

+881
-313
lines changed

5 files changed

+881
-313
lines changed
 

‎bigframes/_config/compute_options.py

+10
Original file line numberDiff line numberDiff line change
@@ -66,13 +66,23 @@ class ComputeOptions:
6666
engine to handle. However, this comes at the expense of increased cost and latency.
6767
extra_query_labels (Dict[str, Any], Optional):
6868
Stores additional custom labels for query configuration.
69+
semantic_ops_confirmation_threshold (int, optional):
70+
Guards against unexpected processing of a large number of rows by semantic operators.
71+
If the number of rows exceeds the threshold, the user will be asked to confirm
72+
that the operation should proceed. The default value is 0. Set the value to None
73+
to turn off the guard.
74+
semantic_ops_threshold_autofail (bool):
75+
Guards against unexpected processing of a large number of rows by semantic operators.
76+
When set to True, an operation that exceeds the confirmation threshold fails automatically instead of prompting the user for input. The default value is False.
6977
"""
7078

7179
maximum_bytes_billed: Optional[int] = None
7280
enable_multi_query_execution: bool = False
7381
extra_query_labels: Dict[str, Any] = dataclasses.field(
7482
default_factory=dict, init=False
7583
)
84+
semantic_ops_confirmation_threshold: Optional[int] = 0
85+
semantic_ops_threshold_autofail = False
7686

7787
def assign_extra_query_labels(self, **kwargs: Any) -> None:
7888
"""

‎bigframes/exceptions.py

+4
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,10 @@ class QueryComplexityError(RuntimeError):
5959
"""Query plan is too complex to execute."""
6060

6161

62+
class OperationAbortedError(RuntimeError):
63+
"""The operation was aborted, e.g. because a confirmation check was declined or failed."""
64+
65+
6266
class TimeTravelDisabledWarning(Warning):
6367
"""A query was reattempted without time travel."""
6468

‎bigframes/operations/semantics.py

+62-18
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@
2020

2121
import numpy as np
2222

23-
import bigframes.core.guid as guid
24-
import bigframes.dtypes as dtypes
23+
from bigframes import dtypes, exceptions
24+
from bigframes.core import guid
2525

2626

2727
class Semantics:
@@ -53,6 +53,7 @@ def agg(
5353
>>> import bigframes.pandas as bpd
5454
>>> bpd.options.display.progress_bar = None
5555
>>> bpd.options.experiments.semantic_operators = True
56+
>>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
5657
5758
>>> import bigframes.ml.llm as llm
5859
>>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
@@ -115,6 +116,15 @@ def agg(
115116
self._validate_model(model)
116117
columns = self._parse_columns(instruction)
117118

119+
if max_agg_rows <= 1:
120+
raise ValueError(
121+
f"Invalid value for `max_agg_rows`: {max_agg_rows}."
122+
"It must be greater than 1."
123+
)
124+
125+
work_estimate = len(self._df) * int(max_agg_rows / (max_agg_rows - 1))
126+
self._confirm_operation(work_estimate)
127+
118128
df: bigframes.dataframe.DataFrame = self._df.copy()
119129
for column in columns:
120130
if column not in self._df.columns:
@@ -135,12 +145,6 @@ def agg(
135145
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
136146
)
137147

138-
if max_agg_rows <= 1:
139-
raise ValueError(
140-
f"Invalid value for `max_agg_rows`: {max_agg_rows}."
141-
"It must be greater than 1."
142-
)
143-
144148
user_instruction = self._format_instruction(instruction, columns)
145149

146150
num_cluster = 1
@@ -243,6 +247,7 @@ def cluster_by(
243247
>>> import bigframes.pandas as bpd
244248
>>> bpd.options.display.progress_bar = None
245249
>>> bpd.options.experiments.semantic_operators = True
250+
>>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
246251
247252
>>> import bigframes.ml.llm as llm
248253
>>> model = llm.TextEmbeddingGenerator()
@@ -296,6 +301,8 @@ def cluster_by(
296301
"It must be greater than 1."
297302
)
298303

304+
self._confirm_operation(len(self._df))
305+
299306
df: bigframes.dataframe.DataFrame = self._df.copy()
300307
embeddings_df = model.predict(df[column])
301308

@@ -314,6 +321,7 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals
314321
>>> import bigframes.pandas as bpd
315322
>>> bpd.options.display.progress_bar = None
316323
>>> bpd.options.experiments.semantic_operators = True
324+
>>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
317325
318326
>>> import bigframes.ml.llm as llm
319327
>>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
@@ -367,6 +375,8 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals
367375
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
368376
)
369377

378+
self._confirm_operation(len(self._df))
379+
370380
df: bigframes.dataframe.DataFrame = self._df[columns].copy()
371381
for column in columns:
372382
if df[column].dtype != dtypes.STRING_DTYPE:
@@ -403,6 +413,7 @@ def map(
403413
>>> import bigframes.pandas as bpd
404414
>>> bpd.options.display.progress_bar = None
405415
>>> bpd.options.experiments.semantic_operators = True
416+
>>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
406417
407418
>>> import bigframes.ml.llm as llm
408419
>>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
@@ -462,6 +473,8 @@ def map(
462473
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
463474
)
464475

476+
self._confirm_operation(len(self._df))
477+
465478
df: bigframes.dataframe.DataFrame = self._df[columns].copy()
466479
for column in columns:
467480
if df[column].dtype != dtypes.STRING_DTYPE:
@@ -490,7 +503,6 @@ def join(
490503
other,
491504
instruction: str,
492505
model,
493-
max_rows: int = 1000,
494506
ground_with_google_search: bool = False,
495507
):
496508
"""
@@ -502,6 +514,7 @@ def join(
502514
>>> import bigframes.pandas as bpd
503515
>>> bpd.options.display.progress_bar = None
504516
>>> bpd.options.experiments.semantic_operators = True
517+
>>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
505518
506519
>>> import bigframes.ml.llm as llm
507520
>>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
@@ -561,12 +574,8 @@ def join(
561574
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
562575
)
563576

564-
joined_table_rows = len(self._df) * len(other)
565-
566-
if joined_table_rows > max_rows:
567-
raise ValueError(
568-
f"Number of rows that need processing is {joined_table_rows}, which exceeds row limit {max_rows}."
569-
)
577+
work_estimate = len(self._df) * len(other)
578+
self._confirm_operation(work_estimate)
570579

571580
left_columns = []
572581
right_columns = []
@@ -645,6 +654,7 @@ def search(
645654
646655
>>> import bigframes
647656
>>> bigframes.options.experiments.semantic_operators = True
657+
>>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
648658
649659
>>> import bigframes.ml.llm as llm
650660
>>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005")
@@ -680,6 +690,8 @@ def search(
680690
if search_column not in self._df.columns:
681691
raise ValueError(f"Column `{search_column}` not found")
682692

693+
self._confirm_operation(len(self._df))
694+
683695
import bigframes.ml.llm as llm
684696

685697
if not isinstance(model, llm.TextEmbeddingGenerator):
@@ -743,6 +755,7 @@ def top_k(
743755
>>> import bigframes.pandas as bpd
744756
>>> bpd.options.display.progress_bar = None
745757
>>> bpd.options.experiments.semantic_operators = True
758+
>>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
746759
747760
>>> import bigframes.ml.llm as llm
748761
>>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
@@ -803,6 +816,9 @@ def top_k(
803816
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
804817
)
805818

819+
work_estimate = int(len(self._df) * (len(self._df) - 1) / 2)
820+
self._confirm_operation(work_estimate)
821+
806822
df: bigframes.dataframe.DataFrame = self._df[columns].copy()
807823
column = columns[0]
808824
if df[column].dtype != dtypes.STRING_DTYPE:
@@ -940,9 +956,8 @@ def sim_join(
940956
941957
>>> import bigframes.pandas as bpd
942958
>>> bpd.options.display.progress_bar = None
943-
944-
>>> import bigframes
945-
>>> bigframes.options.experiments.semantic_operators = True
959+
>>> bpd.options.experiments.semantic_operators = True
960+
>>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
946961
947962
>>> import bigframes.ml.llm as llm
948963
>>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005")
@@ -1001,6 +1016,9 @@ def sim_join(
10011016
if top_k < 1:
10021017
raise ValueError("top_k must be an integer greater than or equal to 1.")
10031018

1019+
work_estimate = len(self._df) * len(other)
1020+
self._confirm_operation(work_estimate)
1021+
10041022
base_table_embedding_column = guid.generate_guid()
10051023
base_table = self._attach_embedding(
10061024
other, right_on, base_table_embedding_column, model
@@ -1072,3 +1090,29 @@ def _validate_model(model):
10721090

10731091
if not isinstance(model, GeminiTextGenerator):
10741092
raise TypeError("Model is not GeminiText Generator")
1093+
1094+
@staticmethod
def _confirm_operation(row_count: int) -> None:
    """Ask for confirmation when a work estimate exceeds the configured threshold.

    Returns silently when the guard is disabled (the threshold option is
    ``None``) or when ``row_count`` does not exceed the threshold.

    Args:
        row_count (int):
            Estimated number of rows the operation will process.

    Raises:
        bigframes.exceptions.OperationAbortedError:
            If the threshold is exceeded and either the
            ``semantic_ops_threshold_autofail`` option is set, the user
            declines the prompt, or no reply can be read from standard input.
    """
    import bigframes

    threshold = bigframes.options.compute.semantic_ops_confirmation_threshold

    if threshold is None or row_count <= threshold:
        return

    if bigframes.options.compute.semantic_ops_threshold_autofail:
        raise exceptions.OperationAbortedError(
            f"Operation was cancelled because your work estimate is {row_count} rows, which exceeds the threshold {threshold} rows."
        )

    # Print the prompt separately. In IDEs such as VS Code, leaving the prompt
    # in the input function makes it less visible to the end user.
    print(f"This operation will process about {row_count} rows.")
    print(
        "You can raise the confirmation threshold by setting `bigframes.options.compute.semantic_ops_confirmation_threshold` to a higher value. To completely turn off the confirmation check, set the threshold to `None`."
    )
    print("Proceed? [Y/n]")

    try:
        reply = input()
    except EOFError as exc:
        # Standard input is closed or non-interactive, so nobody can confirm.
        # Report it as an aborted operation instead of leaking a bare EOFError.
        raise exceptions.OperationAbortedError("Operation was cancelled.") from exc

    # Ignore surrounding whitespace so replies such as "y " or " yes" are
    # accepted; an empty reply selects the default answer (yes).
    if reply.strip().casefold() not in {"y", "yes", ""}:
        raise exceptions.OperationAbortedError("Operation was cancelled.")
There was a problem loading the remainder of the diff.

0 commit comments

Comments
 (0)
Failed to load comments.