feat: add ml KMeans model params (#477)

GarrettWu · web-flow · commit 23a8d9a32e16 · 2024-03-21T02:02:18.000Z
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes #<issue_number_goes_here> 🦕
diff --git a/bigframes/ml/cluster.py b/bigframes/ml/cluster.py
@@ -17,7 +17,7 @@
 
 from __future__ import annotations
 
-from typing import Dict, List, Optional, Union
+from typing import List, Literal, Optional, Union
 
 import bigframes_vendored.sklearn.cluster._kmeans
 from google.cloud import bigquery
@@ -27,6 +27,16 @@
 from bigframes.ml import base, core, globals, utils
 import bigframes.pandas as bpd
 
+_BQML_PARAMS_MAPPING = {
+    "n_clusters": "numClusters",
+    "init": "kmeansInitializationMethod",
+    "init_col": "kmeansInitializationColumn",
+    "distance_type": "distanceType",
+    "max_iter": "maxIterations",
+    "early_stop": "earlyStop",
+    "tol": "minRelativeProgress",
+}
+
 
 @log_adapter.class_logger
 class KMeans(
@@ -36,30 +46,67 @@ class KMeans(
 
     __doc__ = bigframes_vendored.sklearn.cluster._kmeans.KMeans.__doc__
 
-    def __init__(self, n_clusters: int = 8):
+    def __init__(
+        self,
+        n_clusters: int = 8,
+        *,
+        init: Literal["kmeans++", "random", "custom"] = "kmeans++",
+        init_col: Optional[str] = None,
+        distance_type: Literal["euclidean", "cosine"] = "euclidean",
+        max_iter: int = 20,
+        tol: float = 0.01,
+        warm_start: bool = False,
+    ):
         self.n_clusters = n_clusters
+        self.init = init
+        self.init_col = init_col
+        self.distance_type = distance_type
+        self.max_iter = max_iter
+        self.tol = tol
+        self.warm_start = warm_start
         self._bqml_model: Optional[core.BqmlModel] = None
         self._bqml_model_factory = globals.bqml_model_factory()
 
     @classmethod
     def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> KMeans:
         assert model.model_type == "KMEANS"
 
-        kwargs = {}
+        kwargs: dict = {}
 
         # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun
         last_fitting = model.training_runs[-1]["trainingOptions"]
-        if "numClusters" in last_fitting:
-            kwargs["n_clusters"] = int(last_fitting["numClusters"])
+        dummy_kmeans = cls()
+        for bf_param, bf_value in dummy_kmeans.__dict__.items():
+            bqml_param = _BQML_PARAMS_MAPPING.get(bf_param)
+            if bqml_param in last_fitting:
+                # Convert types
+                kwargs[bf_param] = (
+                    str(last_fitting[bqml_param])
+                    if bf_param in ["init"]
+                    else type(bf_value)(last_fitting[bqml_param])
+                )
 
         new_kmeans = cls(**kwargs)
         new_kmeans._bqml_model = core.BqmlModel(session, model)
         return new_kmeans
 
     @property
-    def _bqml_options(self) -> Dict[str, str | int | float | List[str]]:
+    def _bqml_options(self) -> dict:
         """The model options as they will be set for BQML"""
-        return {"model_type": "KMEANS", "num_clusters": self.n_clusters}
+        options = {
+            "model_type": "KMEANS",
+            "num_clusters": self.n_clusters,
+            "KMEANS_INIT_METHOD": self.init,
+            "DISTANCE_TYPE": self.distance_type,
+            "MAX_ITERATIONS": self.max_iter,
+            "MIN_REL_PROGRESS": self.tol,
+            "WARM_START": self.warm_start,
+        }
+
+        if self.init_col is not None:
+            options["KMEANS_INIT_COL"] = self.init_col
+
+        return options
 
     def _fit(
         self,
diff --git a/tests/system/large/ml/test_cluster.py b/tests/system/large/ml/test_cluster.py
@@ -19,11 +19,11 @@
 from tests.system.utils import assert_pandas_df_equal
 
 
-@pytest.mark.flaky(retries=2, delay=120)
+@pytest.mark.flaky(retries=2)
 def test_cluster_configure_fit_score_predict(
     session, penguins_df_default_index, dataset_id
 ):
-    model = cluster.KMeans(n_clusters=3)
+    model = cluster.KMeans(n_clusters=3, init="random")
 
     df = penguins_df_default_index.dropna()[
         [
@@ -118,3 +118,47 @@ def test_cluster_configure_fit_score_predict(
         in reloaded_model._bqml_model.model_name
     )
     assert reloaded_model.n_clusters == 3
+    assert reloaded_model.init == "RANDOM"
+    assert reloaded_model.distance_type == "EUCLIDEAN"
+    assert reloaded_model.max_iter == 20
+    assert reloaded_model.tol == 0.01
+
+
+def test_cluster_configure_fit_load_params(penguins_df_default_index, dataset_id):
+    model = cluster.KMeans(
+        n_clusters=4,
+        init="random",
+        distance_type="cosine",
+        max_iter=30,
+        tol=0.001,
+    )
+
+    df = penguins_df_default_index.dropna()[
+        [
+            "culmen_length_mm",
+            "culmen_depth_mm",
+            "flipper_length_mm",
+            "sex",
+        ]
+    ]
+
+    # TODO(swast): How should we handle the default index? Currently, we get:
+    # "Column bigframes_index_0_z is not found in the input data to the
+    # EVALUATE function."
+    df = df.reset_index(drop=True)
+
+    model.fit(df)
+
+    # save, load, check all configured params to ensure configuration was kept
+    reloaded_model = model.to_gbq(
+        f"{dataset_id}.temp_configured_cluster_model", replace=True
+    )
+    assert (
+        f"{dataset_id}.temp_configured_cluster_model"
+        in reloaded_model._bqml_model.model_name
+    )
+    assert reloaded_model.n_clusters == 4
+    assert reloaded_model.init == "RANDOM"
+    assert reloaded_model.distance_type == "COSINE"
+    assert reloaded_model.max_iter == 30
+    assert reloaded_model.tol == 0.001
diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py
@@ -31,6 +31,34 @@ class KMeans(_BaseKMeans):
         n_clusters (int, default 8):
             The number of clusters to form as well as the number of centroids to generate.
             Default to 8.
+
+        init ("kmeans++", "random" or "custom", default "kmeans++"):
+            The method of initializing the clusters. Default to "kmeans++".
+
+            kmeans++: Initializes a number of centroids equal to the n_clusters value by using the k-means++ algorithm. Using this approach usually trains a better model than using random cluster initialization.
+            random: Initializes the centroids by randomly selecting a number of data points equal to the n_clusters value from the input data.
+            custom: Initializes the centroids using a provided column of type bool. Uses the rows with a value of True as the initial centroids. You specify the column to use by using the init_col option.
+
+        init_col (str or None, default None):
+            The name of the column to use to initialize the centroids. This column must have a type of bool. If this column contains a value of True for a given row, then that row is used as an initial centroid. The number of True rows in this column must be equal to the value you have specified for the n_clusters option.
+            Only works with init method "custom". Default to None.
+
+        distance_type ("euclidean" or "cosine", default "euclidean"):
+            The type of metric to use to compute the distance between two points.
+            Default to "euclidean".
+
+        max_iter (int, default 20):
+            The maximum number of training iterations, where one iteration represents a single pass of the entire training data. Default to 20.
+
+        tol (float, default 0.01):
+            The minimum relative loss improvement that is necessary to continue training. For example, a value of 0.01 specifies that each iteration must reduce the loss by 1% for training to continue.
+            Default to 0.01.
+
+        warm_start (bool, default False):
+            Determines whether to train a model with new training data, new model options, or both. Unless you explicitly override them, the initial options used to train the model are used for the warm start run.
+            Default to False.
+
+
     """
 
     def fit(