Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 443db22

Browse files
authoredFeb 7, 2024
feat: add Series.cov method (#368)
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes #<issue_number_goes_here> 🦕
1 parent 91596b8 commit 443db22

File tree

9 files changed

+73
-56
lines changed

9 files changed

+73
-56
lines changed
 

‎bigframes/core/__init__.py

-22
Original file line numberDiff line numberDiff line change
@@ -261,28 +261,6 @@ def aggregate(
261261
)
262262
)
263263

264-
def corr_aggregate(
265-
self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]]
266-
) -> ArrayValue:
267-
"""
268-
Get correlations between each lef_column_id and right_column_id, stored in the respective output_column_id.
269-
This uses BigQuery's CORR under the hood, and thus only Pearson's method is used.
270-
Arguments:
271-
corr_aggregations: left_column_id, right_column_id, output_column_id tuples
272-
"""
273-
aggregations = tuple(
274-
(
275-
ex.BinaryAggregation(
276-
agg_ops.CorrOp(), ex.free_var(agg[0]), ex.free_var(agg[1])
277-
),
278-
agg[2],
279-
)
280-
for agg in corr_aggregations
281-
)
282-
return ArrayValue(
283-
nodes.AggregateNode(child=self.node, aggregations=aggregations)
284-
)
285-
286264
def project_window_op(
287265
self,
288266
column_name: str,

‎bigframes/core/blocks.py

+11-8
Original file line numberDiff line numberDiff line change
@@ -1040,26 +1040,29 @@ def get_stat(self, column_id: str, stat: agg_ops.UnaryAggregateOp):
10401040
self._stats_cache[column_id].update(stats_map)
10411041
return stats_map[stat.name]
10421042

1043-
def get_corr_stat(self, column_id_left: str, column_id_right: str):
1043+
def get_binary_stat(
1044+
self, column_id_left: str, column_id_right: str, stat: agg_ops.BinaryAggregateOp
1045+
):
10441046
# TODO(kemppeterson): Clean up the column names for DataFrames.corr support
10451047
# TODO(kemppeterson): Add a cache here.
1046-
corr_aggregations = [
1048+
aggregations = [
10471049
(
1048-
column_id_left,
1049-
column_id_right,
1050-
"corr_" + column_id_left + column_id_right,
1050+
ex.BinaryAggregation(
1051+
stat, ex.free_var(column_id_left), ex.free_var(column_id_right)
1052+
),
1053+
f"{stat.name}_{column_id_left}{column_id_right}",
10511054
)
10521055
]
1053-
expr = self.expr.corr_aggregate(corr_aggregations)
1056+
expr = self.expr.aggregate(aggregations)
10541057
offset_index_id = guid.generate_guid()
10551058
expr = expr.promote_offsets(offset_index_id)
10561059
block = Block(
10571060
expr,
10581061
index_columns=[offset_index_id],
1059-
column_labels=[a[2] for a in corr_aggregations],
1062+
column_labels=[a[1] for a in aggregations],
10601063
)
10611064
df, _ = block.to_pandas()
1062-
return df.loc[0, "corr_" + column_id_left + column_id_right]
1065+
return df.loc[0, f"{stat.name}_{column_id_left}{column_id_right}"]
10631066

10641067
def summarize(
10651068
self,

‎bigframes/core/compile/aggregate_compiler.py

+13
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,19 @@ def _(
431431
return cast(ibis_types.NumericColumn, bq_corr)
432432

433433

434+
@compile_binary_agg.register
435+
def _(
436+
op: agg_ops.CovOp, left: ibis_types.Column, right: ibis_types.Column, window=None
437+
) -> ibis_types.NumericValue:
438+
# Will be null if all inputs are null. Pandas defaults to zero sum though.
439+
left_numeric = cast(ibis_types.NumericColumn, left)
440+
right_numeric = cast(ibis_types.NumericColumn, right)
441+
bq_cov = _apply_window_if_present(
442+
left_numeric.cov(right_numeric, how="sample"), window
443+
)
444+
return cast(ibis_types.NumericColumn, bq_cov)
445+
446+
434447
def _apply_window_if_present(value: ibis_types.Value, window):
435448
return value.over(window) if (window is not None) else value
436449

‎bigframes/operations/aggregations.py

+5
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,11 @@ class CorrOp(BinaryAggregateOp):
265265
name: ClassVar[str] = "corr"
266266

267267

268+
@dataclasses.dataclass(frozen=True)
269+
class CovOp(BinaryAggregateOp):
270+
name: ClassVar[str] = "cov"
271+
272+
268273
sum_op = SumOp()
269274
mean_op = MeanOp()
270275
median_op = MedianOp()

‎bigframes/operations/base.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import bigframes.core.scalar as scalars
2626
import bigframes.dtypes
2727
import bigframes.operations as ops
28+
import bigframes.operations.aggregations as agg_ops
2829
import bigframes.series as series
2930
import bigframes.session
3031
import third_party.bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing
@@ -188,10 +189,12 @@ def _apply_binary_op(
188189
block, result_id = self._block.project_expr(expr, name)
189190
return series.Series(block.select_column(result_id))
190191

191-
def _apply_corr_aggregation(self, other: series.Series) -> float:
192+
def _apply_binary_aggregation(
193+
self, other: series.Series, stat: agg_ops.BinaryAggregateOp
194+
) -> float:
192195
(left, right, block) = self._align(other, how="outer")
193196

194-
return block.get_corr_stat(left, right)
197+
return block.get_binary_stat(left, right, stat)
195198

196199
def _align(self, other: series.Series, how="outer") -> tuple[str, str, blocks.Block]: # type: ignore
197200
"""Aligns the series value with another scalar or series object. Returns new left column id, right column id and joined tabled expression."""

‎bigframes/series.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -734,8 +734,8 @@ def round(self, decimals=0) -> "Series":
734734
return self._apply_binary_op(decimals, ops.round_op)
735735

736736
def corr(self, other: Series, method="pearson", min_periods=None) -> float:
737-
# TODO(kemppeterson): Validate early that both are numeric
738-
# TODO(kemppeterson): Handle partially-numeric columns
737+
# TODO(tbergeron): Validate early that both are numeric
738+
# TODO(tbergeron): Handle partially-numeric columns
739739
if method != "pearson":
740740
raise NotImplementedError(
741741
f"Only Pearson correlation is currently supported. {constants.FEEDBACK_LINK}"
@@ -744,7 +744,10 @@ def corr(self, other: Series, method="pearson", min_periods=None) -> float:
744744
raise NotImplementedError(
745745
f"min_periods not yet supported. {constants.FEEDBACK_LINK}"
746746
)
747-
return self._apply_corr_aggregation(other)
747+
return self._apply_binary_aggregation(other, agg_ops.CorrOp())
748+
749+
def cov(self, other: Series) -> float:
750+
return self._apply_binary_aggregation(other, agg_ops.CovOp())
748751

749752
def all(self) -> bool:
750753
return typing.cast(bool, self._apply_aggregation(agg_ops.all_op))

‎tests/system/small/test_series.py

+12-1
Original file line numberDiff line numberDiff line change
@@ -656,7 +656,7 @@ def test_mods(scalars_dfs, col_x, col_y, method):
656656

657657
# We work around a pandas bug that doesn't handle correlating nullable dtypes by doing this
658658
# manually with dumb self-correlation instead of parameterized as test_mods is above.
659-
def test_corr(scalars_dfs):
659+
def test_series_corr(scalars_dfs):
660660
scalars_df, scalars_pandas_df = scalars_dfs
661661
bf_result = scalars_df["int64_too"].corr(scalars_df["int64_too"])
662662
pd_result = (
@@ -667,6 +667,17 @@ def test_corr(scalars_dfs):
667667
assert math.isclose(pd_result, bf_result)
668668

669669

670+
def test_series_cov(scalars_dfs):
671+
scalars_df, scalars_pandas_df = scalars_dfs
672+
bf_result = scalars_df["int64_too"].cov(scalars_df["int64_too"])
673+
pd_result = (
674+
scalars_pandas_df["int64_too"]
675+
.astype("int64")
676+
.cov(scalars_pandas_df["int64_too"].astype("int64"))
677+
)
678+
assert math.isclose(pd_result, bf_result)
679+
680+
670681
@pytest.mark.parametrize(
671682
("col_x",),
672683
[

‎tests/unit/test_core.py

-20
Original file line numberDiff line numberDiff line change
@@ -208,23 +208,3 @@ def test_arrayvalue_to_ibis_expr_with_aggregate():
208208
assert actual.columns[0] == "col1"
209209
assert actual.columns[1] == "col4"
210210
assert expr.columns[1].type().is_int64()
211-
212-
213-
def test_arrayvalue_to_ibis_expr_with_corr_aggregate():
214-
value = resources.create_arrayvalue(
215-
pandas.DataFrame(
216-
{
217-
"col1": [1, 2, 3],
218-
"col2": ["a", "b", "c"],
219-
"col3": [0.1, 0.2, 0.3],
220-
}
221-
),
222-
total_ordering_columns=["col1"],
223-
)
224-
expr = value.corr_aggregate(
225-
corr_aggregations=[("col1", "col3", "col4")]
226-
)._compile_ordered()
227-
actual = expr._to_ibis_expr(ordering_mode="unordered")
228-
assert len(expr.columns) == 1
229-
assert actual.columns[0] == "col4"
230-
assert expr.columns[0].type().is_float64()

‎third_party/bigframes_vendored/pandas/core/series.py

+21
Original file line numberDiff line numberDiff line change
@@ -842,6 +842,27 @@ def corr(self, other, method="pearson", min_periods=None) -> float:
842842
"""
843843
raise NotImplementedError("abstract method")
844844

845+
def cov(
846+
self,
847+
other,
848+
) -> float:
849+
"""
850+
Compute covariance with Series, excluding missing values.
851+
852+
The two `Series` objects are not required to be the same length and
853+
will be aligned internally before the covariance is calculated.
854+
855+
Args:
856+
other (Series):
857+
Series with which to compute the covariance.
858+
859+
Returns:
860+
float:
861+
Covariance between Series and other normalized by N-1
862+
(unbiased estimator).
863+
"""
864+
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
865+
845866
def diff(self) -> Series:
846867
"""
847868
First discrete difference of element.

0 commit comments

Comments
 (0)
Failed to load comments.