googleapis · ashleyxuu · Apr 17, 2024 · Apr 12, 2024 · Apr 17, 2024
@@ -35,7 +35,7 @@ def train_test_split(
     Args:
         *arrays (bigframes.dataframe.DataFrame or bigframes.series.Series):
             A sequence of BigQuery DataFrames or Series that can be joined on
-            their indexes
+            their indexes.
         test_size (default None):
             The proportion of the dataset to include in the test split. If
             None, this will default to the complement of train_size. If both

@@ -37,7 +37,7 @@ def dayofweek(self):
         """The day of the week with Monday=0, Sunday=6.
 
         Return the day of the week. It is assumed the week starts on
-        Monday, which is denoted by 0 and ends on Sunday which is denoted
+        Monday, which is denoted by 0, and ends on Sunday, which is denoted
         by 6.
 
         **Examples:**

@@ -153,7 +153,7 @@ def fit_transform(self, X, y=None):
                 Target values (None for unsupervised transformations).
 
         Returns:
-            bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_features_new)
+            bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_features_new).
                 Transformed DataFrame.
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

@@ -122,7 +122,7 @@ def recall_score(
 ):
     """Compute the recall.
 
-    The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of
+    The recall is the ratio ``tp / (tp + fn)``, where ``tp`` is the number of
     true positives and ``fn`` the number of false negatives. The recall is
     intuitively the ability of the classifier to find all the positive samples.
 
@@ -170,7 +170,7 @@ def precision_score(
 ):
     """Compute the precision.
 
-    The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of
+    The precision is the ratio ``tp / (tp + fp)``, where ``tp`` is the number of
     true positives and ``fp`` the number of false positives. The precision is
     intuitively the ability of the classifier not to label as positive a sample
     that is negative.
@@ -244,9 +244,9 @@ def f1_score(
         dtype: float64
 
     Args:
-        y_true: Series or DataFrame of shape (n_samples,)
+        y_true: Series or DataFrame of shape (n_samples,).
             Ground truth (correct) target values.
-        y_pred: Series or DataFrame of shape (n_samples,)
+        y_pred: Series or DataFrame of shape (n_samples,).
             Estimated targets as returned by a classifier.
         average: {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \
                 default='binary'

@@ -20,13 +20,14 @@ class Pipeline(BaseEstimator, metaclass=ABCMeta):
     """Pipeline of transforms with a final estimator.
 
     Sequentially apply a list of transforms and a final estimator.
-    Intermediate steps of the pipeline must be `transforms`, that is, they
+    Intermediate steps of the pipeline must be `transforms`. That is, they
     must implement `fit` and `transform` methods.
     The final estimator only needs to implement `fit`.
 
     The purpose of the pipeline is to assemble several steps that can be
-    cross-validated together while setting different parameters. This simplifies code, and allows deploying an estimator
-    and peprocessing together, e.g. with `Pipeline.to_gbq(...).`
+    cross-validated together while setting different parameters. This
+    simplifies code and allows for deploying an estimator and preprocessing
+    together, e.g. with `Pipeline.to_gbq(...)`.
     """
 
     def fit(

@@ -23,15 +23,21 @@ class OneHotEncoder(BaseEstimator):
         Given a dataset with two features, we let the encoder find the unique
         values per feature and transform the data to a binary one-hot encoding.
 
-        .. code-block::
-
-            from bigframes.ml.preprocessing import OneHotEncoder
-            import bigframes.pandas as bpd
-
-            enc = OneHotEncoder()
-            X = bpd.DataFrame({"a": ["Male", "Female", "Female"], "b": ["1", "3", "2"]})
-            enc.fit(X)
-            print(enc.transform(bpd.DataFrame({"a": ["Female", "Male"], "b": ["1", "4"]})))
+        >>> from bigframes.ml.preprocessing import OneHotEncoder
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        >>> enc = OneHotEncoder()
+        >>> X = bpd.DataFrame({"a": ["Male", "Female", "Female"], "b": ["1", "3", "2"]})
+        >>> enc.fit(X)
+        OneHotEncoder()
+
+        >>> print(enc.transform(bpd.DataFrame({"a": ["Female", "Male"], "b": ["1", "4"]})))
+                        onehotencoded_a               onehotencoded_b
+        0  [{'index': 1, 'value': 1.0}]  [{'index': 1, 'value': 1.0}]
+        1  [{'index': 2, 'value': 1.0}]  [{'index': 0, 'value': 1.0}]
+        <BLANKLINE>
+        [2 rows x 2 columns]
 
     Args:
         drop (Optional[Literal["most_frequent"]], default None):
@@ -52,7 +58,7 @@ class OneHotEncoder(BaseEstimator):
             Specifies an upper limit to the number of output features for each input feature
             when considering infrequent categories. If there are infrequent categories,
             max_categories includes the category representing the infrequent categories along with the frequent categories.
-            Default None, set limit to 1,000,000.
+            Default None, which sets the limit to 1,000,000.
     """
 
     def fit(self, X, y=None):

@@ -26,7 +26,7 @@ class LabelEncoder(BaseEstimator):
             Specifies an upper limit to the number of output features for each input feature
             when considering infrequent categories. If there are infrequent categories,
             max_categories includes the category representing the infrequent categories along with the frequent categories.
-            Default None, set limit to 1,000,000.
+            Default None, which sets the limit to 1,000,000.
     """
 
     def fit(self, y):