20
20
21
21
import numpy as np
22
22
23
- import bigframes . core . guid as guid
24
- import bigframes .dtypes as dtypes
23
+ from bigframes import dtypes , exceptions
24
+ from bigframes .core import guid
25
25
26
26
27
27
class Semantics :
@@ -53,6 +53,7 @@ def agg(
53
53
>>> import bigframes.pandas as bpd
54
54
>>> bpd.options.display.progress_bar = None
55
55
>>> bpd.options.experiments.semantic_operators = True
56
+ >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
56
57
57
58
>>> import bigframes.ml.llm as llm
58
59
>>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
@@ -115,6 +116,15 @@ def agg(
115
116
self ._validate_model (model )
116
117
columns = self ._parse_columns (instruction )
117
118
119
+ if max_agg_rows <= 1 :
120
+ raise ValueError (
121
+ f"Invalid value for `max_agg_rows`: { max_agg_rows } ."
122
+ "It must be greater than 1."
123
+ )
124
+
125
+ work_estimate = len (self ._df ) * int (max_agg_rows / (max_agg_rows - 1 ))
126
+ self ._confirm_operation (work_estimate )
127
+
118
128
df : bigframes .dataframe .DataFrame = self ._df .copy ()
119
129
for column in columns :
120
130
if column not in self ._df .columns :
@@ -135,12 +145,6 @@ def agg(
135
145
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
136
146
)
137
147
138
- if max_agg_rows <= 1 :
139
- raise ValueError (
140
- f"Invalid value for `max_agg_rows`: { max_agg_rows } ."
141
- "It must be greater than 1."
142
- )
143
-
144
148
user_instruction = self ._format_instruction (instruction , columns )
145
149
146
150
num_cluster = 1
@@ -243,6 +247,7 @@ def cluster_by(
243
247
>>> import bigframes.pandas as bpd
244
248
>>> bpd.options.display.progress_bar = None
245
249
>>> bpd.options.experiments.semantic_operators = True
250
+ >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
246
251
247
252
>>> import bigframes.ml.llm as llm
248
253
>>> model = llm.TextEmbeddingGenerator()
@@ -296,6 +301,8 @@ def cluster_by(
296
301
"It must be greater than 1."
297
302
)
298
303
304
+ self ._confirm_operation (len (self ._df ))
305
+
299
306
df : bigframes .dataframe .DataFrame = self ._df .copy ()
300
307
embeddings_df = model .predict (df [column ])
301
308
@@ -314,6 +321,7 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals
314
321
>>> import bigframes.pandas as bpd
315
322
>>> bpd.options.display.progress_bar = None
316
323
>>> bpd.options.experiments.semantic_operators = True
324
+ >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
317
325
318
326
>>> import bigframes.ml.llm as llm
319
327
>>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
@@ -367,6 +375,8 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals
367
375
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
368
376
)
369
377
378
+ self ._confirm_operation (len (self ._df ))
379
+
370
380
df : bigframes .dataframe .DataFrame = self ._df [columns ].copy ()
371
381
for column in columns :
372
382
if df [column ].dtype != dtypes .STRING_DTYPE :
@@ -403,6 +413,7 @@ def map(
403
413
>>> import bigframes.pandas as bpd
404
414
>>> bpd.options.display.progress_bar = None
405
415
>>> bpd.options.experiments.semantic_operators = True
416
+ >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
406
417
407
418
>>> import bigframes.ml.llm as llm
408
419
>>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
@@ -462,6 +473,8 @@ def map(
462
473
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
463
474
)
464
475
476
+ self ._confirm_operation (len (self ._df ))
477
+
465
478
df : bigframes .dataframe .DataFrame = self ._df [columns ].copy ()
466
479
for column in columns :
467
480
if df [column ].dtype != dtypes .STRING_DTYPE :
@@ -490,7 +503,6 @@ def join(
490
503
other ,
491
504
instruction : str ,
492
505
model ,
493
- max_rows : int = 1000 ,
494
506
ground_with_google_search : bool = False ,
495
507
):
496
508
"""
@@ -502,6 +514,7 @@ def join(
502
514
>>> import bigframes.pandas as bpd
503
515
>>> bpd.options.display.progress_bar = None
504
516
>>> bpd.options.experiments.semantic_operators = True
517
+ >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
505
518
506
519
>>> import bigframes.ml.llm as llm
507
520
>>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
@@ -561,12 +574,8 @@ def join(
561
574
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
562
575
)
563
576
564
- joined_table_rows = len (self ._df ) * len (other )
565
-
566
- if joined_table_rows > max_rows :
567
- raise ValueError (
568
- f"Number of rows that need processing is { joined_table_rows } , which exceeds row limit { max_rows } ."
569
- )
577
+ work_estimate = len (self ._df ) * len (other )
578
+ self ._confirm_operation (work_estimate )
570
579
571
580
left_columns = []
572
581
right_columns = []
@@ -645,6 +654,7 @@ def search(
645
654
646
655
>>> import bigframes
647
656
>>> bigframes.options.experiments.semantic_operators = True
657
+ >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
648
658
649
659
>>> import bigframes.ml.llm as llm
650
660
>>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005")
@@ -680,6 +690,8 @@ def search(
680
690
if search_column not in self ._df .columns :
681
691
raise ValueError (f"Column `{ search_column } ` not found" )
682
692
693
+ self ._confirm_operation (len (self ._df ))
694
+
683
695
import bigframes .ml .llm as llm
684
696
685
697
if not isinstance (model , llm .TextEmbeddingGenerator ):
@@ -743,6 +755,7 @@ def top_k(
743
755
>>> import bigframes.pandas as bpd
744
756
>>> bpd.options.display.progress_bar = None
745
757
>>> bpd.options.experiments.semantic_operators = True
758
+ >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
746
759
747
760
>>> import bigframes.ml.llm as llm
748
761
>>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
@@ -803,6 +816,9 @@ def top_k(
803
816
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
804
817
)
805
818
819
+ work_estimate = int (len (self ._df ) * (len (self ._df ) - 1 ) / 2 )
820
+ self ._confirm_operation (work_estimate )
821
+
806
822
df : bigframes .dataframe .DataFrame = self ._df [columns ].copy ()
807
823
column = columns [0 ]
808
824
if df [column ].dtype != dtypes .STRING_DTYPE :
@@ -940,9 +956,8 @@ def sim_join(
940
956
941
957
>>> import bigframes.pandas as bpd
942
958
>>> bpd.options.display.progress_bar = None
943
-
944
- >>> import bigframes
945
- >>> bigframes.options.experiments.semantic_operators = True
959
+ >>> bpd.options.experiments.semantic_operators = True
960
+ >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25
946
961
947
962
>>> import bigframes.ml.llm as llm
948
963
>>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005")
@@ -1001,6 +1016,9 @@ def sim_join(
1001
1016
if top_k < 1 :
1002
1017
raise ValueError ("top_k must be an integer greater than or equal to 1." )
1003
1018
1019
+ work_estimate = len (self ._df ) * len (other )
1020
+ self ._confirm_operation (work_estimate )
1021
+
1004
1022
base_table_embedding_column = guid .generate_guid ()
1005
1023
base_table = self ._attach_embedding (
1006
1024
other , right_on , base_table_embedding_column , model
@@ -1072,3 +1090,29 @@ def _validate_model(model):
1072
1090
1073
1091
if not isinstance (model , GeminiTextGenerator ):
1074
1092
raise TypeError ("Model is not GeminiText Generator" )
1093
+
1094
@staticmethod
def _confirm_operation(row_count: int) -> None:
    """Ask the user to confirm an operation whose work estimate is large.

    The estimate is compared against
    ``bigframes.options.compute.semantic_ops_confirmation_threshold``. When
    the threshold is ``None`` or the estimate does not exceed it, this
    returns immediately without any output. Otherwise the operation is
    either rejected outright (when
    ``bigframes.options.compute.semantic_ops_threshold_autofail`` is truthy)
    or the user is prompted via ``input()`` to proceed.

    Args:
        row_count: Estimated number of rows the operation will process.

    Raises:
        exceptions.OperationAbortedError: If the estimate exceeds the
            threshold and either autofail is enabled or the user's reply
            is not one of ``y``, ``yes`` or an empty line.
    """
    # Imported locally, as in the original implementation.
    import bigframes

    threshold = bigframes.options.compute.semantic_ops_confirmation_threshold

    # A threshold of None disables the confirmation check entirely.
    if threshold is None or row_count <= threshold:
        return

    if bigframes.options.compute.semantic_ops_threshold_autofail:
        raise exceptions.OperationAbortedError(
            f"Operation was cancelled because your work estimate is {row_count} rows, which exceeds the threshold {threshold} rows."
        )

    # Print the prompt separately instead of passing it to input(). In IDEs
    # such as VS Code, a prompt passed to input() is less visible to the
    # end user.
    print(f"This operation will process about {row_count} rows.")
    print(
        "You can raise the confirmation threshold by setting `bigframes.options.compute.semantic_ops_confirmation_threshold` to a higher value. To completely turn off the confirmation check, set the threshold to `None`."
    )
    print("Proceed? [Y/n]")

    # Strip surrounding whitespace so replies such as "y " or " yes" are not
    # mistaken for a refusal; casefold makes the comparison case-insensitive.
    # An empty reply selects the default answer (yes).
    reply = input().strip().casefold()
    if reply not in {"y", "yes", ""}:
        raise exceptions.OperationAbortedError("Operation was cancelled.")
0 commit comments