Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 23a8d9a

Browse files
authoredMar 21, 2024
feat: add ml KMeans model params (#477)
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes #<issue_number_goes_here> 🦕
1 parent 0b3f8e5 commit 23a8d9a

File tree

3 files changed

+128
-9
lines changed

3 files changed

+128
-9
lines changed
 

‎bigframes/ml/cluster.py

+54-7
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
from __future__ import annotations
1919

20-
from typing import Dict, List, Optional, Union
20+
from typing import List, Literal, Optional, Union
2121

2222
import bigframes_vendored.sklearn.cluster._kmeans
2323
from google.cloud import bigquery
@@ -27,6 +27,16 @@
2727
from bigframes.ml import base, core, globals, utils
2828
import bigframes.pandas as bpd
2929

30+
_BQML_PARAMS_MAPPING = {
31+
"n_clusters": "numClusters",
32+
"init": "kmeansInitializationMethod",
33+
"init_col": "kmeansInitializationColumn",
34+
"distance_type": "distanceType",
35+
"max_iter": "maxIterations",
36+
"early_stop": "earlyStop",
37+
"tol": "minRelativeProgress",
38+
}
39+
3040

3141
@log_adapter.class_logger
3242
class KMeans(
@@ -36,30 +46,67 @@ class KMeans(
3646

3747
__doc__ = bigframes_vendored.sklearn.cluster._kmeans.KMeans.__doc__
3848

39-
def __init__(self, n_clusters: int = 8):
49+
def __init__(
50+
self,
51+
n_clusters: int = 8,
52+
*,
53+
init: Literal["kmeans++", "random", "custom"] = "kmeans++",
54+
init_col: Optional[str] = None,
55+
distance_type: Literal["euclidean", "cosine"] = "euclidean",
56+
max_iter: int = 20,
57+
tol: float = 0.01,
58+
warm_start: bool = False,
59+
):
4060
self.n_clusters = n_clusters
61+
self.init = init
62+
self.init_col = init_col
63+
self.distance_type = distance_type
64+
self.max_iter = max_iter
65+
self.tol = tol
66+
self.warm_start = warm_start
4167
self._bqml_model: Optional[core.BqmlModel] = None
4268
self._bqml_model_factory = globals.bqml_model_factory()
4369

4470
@classmethod
4571
def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> KMeans:
4672
assert model.model_type == "KMEANS"
4773

48-
kwargs = {}
74+
kwargs: dict = {}
4975

5076
# See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun
5177
last_fitting = model.training_runs[-1]["trainingOptions"]
52-
if "numClusters" in last_fitting:
53-
kwargs["n_clusters"] = int(last_fitting["numClusters"])
78+
dummy_kmeans = cls()
79+
for bf_param, bf_value in dummy_kmeans.__dict__.items():
80+
bqml_param = _BQML_PARAMS_MAPPING.get(bf_param)
81+
if bqml_param in last_fitting:
82+
# Convert types
83+
kwargs[bf_param] = (
84+
str(last_fitting[bqml_param])
85+
if bf_param in ["init"]
86+
else type(bf_value)(last_fitting[bqml_param])
87+
)
5488

5589
new_kmeans = cls(**kwargs)
5690
new_kmeans._bqml_model = core.BqmlModel(session, model)
5791
return new_kmeans
5892

5993
@property
60-
def _bqml_options(self) -> Dict[str, str | int | float | List[str]]:
94+
def _bqml_options(self) -> dict:
6195
"""The model options as they will be set for BQML"""
62-
return {"model_type": "KMEANS", "num_clusters": self.n_clusters}
96+
options = {
97+
"model_type": "KMEANS",
98+
"num_clusters": self.n_clusters,
99+
"KMEANS_INIT_METHOD": self.init,
100+
"DISTANCE_TYPE": self.distance_type,
101+
"MAX_ITERATIONS": self.max_iter,
102+
"MIN_REL_PROGRESS": self.tol,
103+
"WARM_START": self.warm_start,
104+
}
105+
106+
if self.init_col is not None:
107+
options["KMEANS_INIT_COL"] = self.init_col
108+
109+
return options
63110

64111
def _fit(
65112
self,

‎tests/system/large/ml/test_cluster.py

+46-2
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,11 @@
1919
from tests.system.utils import assert_pandas_df_equal
2020

2121

22-
@pytest.mark.flaky(retries=2, delay=120)
22+
@pytest.mark.flaky(retries=2)
2323
def test_cluster_configure_fit_score_predict(
2424
session, penguins_df_default_index, dataset_id
2525
):
26-
model = cluster.KMeans(n_clusters=3)
26+
model = cluster.KMeans(n_clusters=3, init="random")
2727

2828
df = penguins_df_default_index.dropna()[
2929
[
@@ -118,3 +118,47 @@ def test_cluster_configure_fit_score_predict(
118118
in reloaded_model._bqml_model.model_name
119119
)
120120
assert reloaded_model.n_clusters == 3
121+
assert reloaded_model.init == "RANDOM"
122+
assert reloaded_model.distance_type == "EUCLIDEAN"
123+
assert reloaded_model.max_iter == 20
124+
assert reloaded_model.tol == 0.01
125+
126+
127+
def test_cluster_configure_fit_load_params(penguins_df_default_index, dataset_id):
128+
model = cluster.KMeans(
129+
n_clusters=4,
130+
init="random",
131+
distance_type="cosine",
132+
max_iter=30,
133+
tol=0.001,
134+
)
135+
136+
df = penguins_df_default_index.dropna()[
137+
[
138+
"culmen_length_mm",
139+
"culmen_depth_mm",
140+
"flipper_length_mm",
141+
"sex",
142+
]
143+
]
144+
145+
# TODO(swast): How should we handle the default index? Currently, we get:
146+
# "Column bigframes_index_0_z is not found in the input data to the
147+
# EVALUATE function."
148+
df = df.reset_index(drop=True)
149+
150+
model.fit(df)
151+
152+
# save, load, check all configured parameters to ensure configuration was kept
153+
reloaded_model = model.to_gbq(
154+
f"{dataset_id}.temp_configured_cluster_model", replace=True
155+
)
156+
assert (
157+
f"{dataset_id}.temp_configured_cluster_model"
158+
in reloaded_model._bqml_model.model_name
159+
)
160+
assert reloaded_model.n_clusters == 4
161+
assert reloaded_model.init == "RANDOM"
162+
assert reloaded_model.distance_type == "COSINE"
163+
assert reloaded_model.max_iter == 30
164+
assert reloaded_model.tol == 0.001

‎third_party/bigframes_vendored/sklearn/cluster/_kmeans.py

+28
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,34 @@ class KMeans(_BaseKMeans):
3131
n_clusters (int, default 8):
3232
The number of clusters to form as well as the number of centroids to generate.
3333
Default to 8.
34+
35+
init ("kmeans++", "random" or "custom", default "kmeans++"):
36+
The method of initializing the clusters. Default to "kmeans++".
37+
38+
kmeans++: Initializes a number of centroids equal to the n_clusters value by using the k-means++ algorithm. Using this approach usually trains a better model than using random cluster initialization.
39+
random: Initializes the centroids by randomly selecting a number of data points equal to the n_clusters value from the input data.
40+
custom: Initializes the centroids using a provided column of type bool. Uses the rows with a value of True as the initial centroids. You specify the column to use by using the init_col option.
41+
42+
init_col (str or None, default None):
43+
The name of the column to use to initialize the centroids. This column must have a type of bool. If this column contains a value of True for a given row, then that row is used as an initial centroid. The number of True rows in this column must be equal to the value you have specified for the n_clusters option.
44+
Only works with init method "custom". Default to None.
45+
46+
distance_type ("euclidean" or "cosine", default "euclidean"):
47+
The type of metric to use to compute the distance between two points.
48+
Default to "euclidean".
49+
50+
max_iter (int, default 20):
51+
The maximum number of training iterations, where one iteration represents a single pass of the entire training data. Default to 20.
52+
53+
tol (float, default 0.01):
54+
The minimum relative loss improvement that is necessary to continue training. For example, a value of 0.01 specifies that each iteration must reduce the loss by 1% for training to continue.
55+
Default to 0.01.
56+
57+
warm_start (bool, default False):
58+
Determines whether to train a model with new training data, new model options, or both. Unless you explicitly override them, the initial options used to train the model are used for the warm start run.
59+
Default to False.
60+
61+
3462
"""
3563

3664
def fit(

0 commit comments

Comments
 (0)
Failed to load comments.