Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit deac6d2

Browse files
authored Sep 16, 2024
feat: add "include" param to describe for string types (#973)
1 parent 596b03b commit deac6d2

File tree

5 files changed

+267
-18
lines changed

5 files changed

+267
-18
lines changed
 

‎bigframes/dataframe.py

+70-18
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
import datetime
2020
import inspect
21+
import itertools
2122
import re
2223
import sys
2324
import textwrap
@@ -70,6 +71,7 @@
7071
import bigframes.exceptions
7172
import bigframes.formatting_helpers as formatter
7273
import bigframes.operations as ops
74+
import bigframes.operations.aggregations
7375
import bigframes.operations.aggregations as agg_ops
7476
import bigframes.operations.plotting as plotting
7577
import bigframes.operations.structs
@@ -2207,14 +2209,17 @@ def agg(
22072209
self, func: str | typing.Sequence[str]
22082210
) -> DataFrame | bigframes.series.Series:
22092211
if utils.is_list_like(func):
2210-
if any(
2211-
dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE
2212-
for dtype in self.dtypes
2213-
):
2214-
raise NotImplementedError(
2215-
f"Multiple aggregations only supported on numeric columns. {constants.FEEDBACK_LINK}"
2216-
)
22172212
aggregations = [agg_ops.lookup_agg_func(f) for f in func]
2213+
2214+
for dtype, agg in itertools.product(self.dtypes, aggregations):
2215+
if not bigframes.operations.aggregations.is_agg_op_supported(
2216+
dtype, agg
2217+
):
2218+
raise NotImplementedError(
2219+
f"Type {dtype} does not support aggregation {agg}. "
2220+
f"Share your usecase with the BigQuery DataFrames team at the {constants.FEEDBACK_LINK}"
2221+
)
2222+
22182223
return DataFrame(
22192224
self._block.summarize(
22202225
self._block.value_columns,
@@ -2280,16 +2285,55 @@ def melt(
22802285
self._block.melt(id_col_ids, val_col_ids, var_name, value_name)
22812286
)
22822287

2283-
# Aggregations reported by describe() for numerical columns. Quartile rows
# ("25%"/"50%"/"75%") are approximate quantiles in BigQuery.
# NOTE: renamed from the misspelled `_NUMERICAL_DISCRIBE_AGGS`; all uses are
# local to this class.
_NUMERICAL_DESCRIBE_AGGS = (
    "count",
    "mean",
    "std",
    "min",
    "25%",
    "50%",
    "75%",
    "max",
)
# Aggregations reported by describe() for non-numerical (string) columns.
_NON_NUMERICAL_DESCRIBE_AGGS = ("count", "nunique")

def describe(self, include: None | Literal["all"] = None) -> DataFrame:
    """Generate descriptive statistics for the DataFrame's columns.

    Args:
        include:
            ``None`` (default) describes numerical columns only, falling back
            to eligible string columns when no numerical columns exist.
            ``"all"`` describes both numerical and string columns together.

    Returns:
        DataFrame: one row per summary statistic, one column per described
        input column.

    Raises:
        ValueError: if ``include`` is neither ``None`` nor ``"all"``.
    """
    if include is None:
        numeric_df = self._drop_non_numeric(permissive=False)
        if len(numeric_df.columns) == 0:
            # No numerical columns present: describe eligible non-numerical
            # (string) columns instead.
            result = self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS)
        else:
            # Otherwise, only describe numerical columns
            result = numeric_df.agg(self._NUMERICAL_DESCRIBE_AGGS)
        return typing.cast(DataFrame, result)

    elif include == "all":
        numeric_result = typing.cast(
            DataFrame,
            self._drop_non_numeric(permissive=False).agg(
                self._NUMERICAL_DESCRIBE_AGGS
            ),
        )
        string_result = typing.cast(
            DataFrame,
            self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS),
        )

        if len(numeric_result.columns) == 0:
            return string_result
        elif len(string_result.columns) == 0:
            return numeric_result
        else:
            import bigframes.core.reshape as rs

            # Use reindex after join to preserve the original column order.
            return rs.concat(
                [numeric_result, string_result], axis=1
            )._reindex_columns(self.columns)

    else:
        raise ValueError(f"Unsupported include type: {include}")
22932337

22942338
def skew(self, *, numeric_only: bool = False):
22952339
if not numeric_only:
@@ -2487,18 +2531,26 @@ def unstack(self, level: LevelsType = -1):
24872531
return DataFrame(pivot_block)
24882532

24892533
def _drop_non_numeric(self, permissive=True) -> DataFrame:
    """Return a new DataFrame keeping only numeric-typed value columns.

    Args:
        permissive:
            When True, keep the wider (permissive) set of BigFrames numeric
            dtypes; otherwise keep only the restrictive set.
    """
    if permissive:
        numerical_types = set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE)
    else:
        numerical_types = set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE)

    # Collect the ids of columns whose dtype is outside the kept set.
    cols_to_drop = []
    for col_id, dtype in zip(self._block.value_columns, self._block.dtypes):
        if dtype not in numerical_types:
            cols_to_drop.append(col_id)
    return DataFrame(self._block.drop_columns(cols_to_drop))
25012545

2546+
def _drop_non_string(self) -> DataFrame:
    """Return a new DataFrame keeping only string-typed value columns."""
    kept_cols = []
    for col_id, dtype in zip(self._block.value_columns, self._block.dtypes):
        if dtype == bigframes.dtypes.STRING_DTYPE:
            kept_cols.append(col_id)
    return DataFrame(self._block.select_columns(kept_cols))
2553+
25022554
def _drop_non_bool(self) -> DataFrame:
25032555
non_bool_cols = [
25042556
col_id

‎bigframes/operations/aggregations.py

+11
Original file line numberDiff line numberDiff line change
@@ -562,3 +562,14 @@ def lookup_agg_func(key: str) -> typing.Union[UnaryAggregateOp, NullaryAggregate
562562
return _AGGREGATIONS_LOOKUP[key]
563563
else:
564564
raise ValueError(f"Unrecognize aggregate function: {key}")
565+
566+
567+
def is_agg_op_supported(dtype: dtypes.Dtype, op: AggregateOp) -> bool:
    """Return True if aggregation ``op`` can run on columns of ``dtype``.

    Permissive numeric dtypes support every aggregation; strings support only
    counting-style aggregations; all other dtypes support none.
    """
    if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE:
        return True
    # Strings: only count/nunique. Everything else: unsupported.
    return dtype == dtypes.STRING_DTYPE and isinstance(op, (CountOp, NuniqueOp))

‎tests/system/small/test_dataframe.py

+81
Original file line numberDiff line numberDiff line change
@@ -2612,6 +2612,87 @@ def test_df_describe(scalars_dfs):
26122612
).all()
26132613

26142614

2615+
@skip_legacy_pandas
@pytest.mark.parametrize("include", [None, "all"])
def test_df_describe_non_numerical(scalars_dfs, include):
    """describe() on a string-only frame matches pandas for both include modes."""
    scalars_df, scalars_pandas_df = scalars_dfs
    string_cols = ["string_col"]

    bf_result = scalars_df[string_cols].describe(include=include).to_pandas()
    pd_result = scalars_pandas_df[string_cols].describe(include=include)

    # Align the statistic indexes: their relative order is not important, and
    # pandas names the distinct-count row "unique" where BigFrames uses
    # "nunique".
    bf_result = bf_result.reindex(["count", "nunique"])
    pd_result = pd_result.reindex(["count", "unique"]).rename(
        index={"unique": "nunique"}
    )

    pd.testing.assert_frame_equal(
        pd_result[string_cols].astype("Int64"),
        bf_result[string_cols],
        check_index_type=False,
    )
2641+
2642+
2643+
@skip_legacy_pandas
def test_df_describe_mixed_types_include_all(scalars_dfs):
    """describe(include="all") over mixed numeric/string columns matches pandas."""
    scalars_df, scalars_pandas_df = scalars_dfs

    numerical_columns = [
        "int64_col",
        "float64_col",
    ]
    non_numerical_columns = ["string_col"]
    supported_columns = numerical_columns + non_numerical_columns

    modified_bf = scalars_df[supported_columns]
    bf_result = modified_bf.describe(include="all").to_pandas()

    modified_pd_df = scalars_pandas_df[supported_columns]
    pd_result = modified_pd_df.describe(include="all")

    # Capture quantile rows before dropping them from the exact comparison;
    # BigQuery quantiles are approximate, so they are only sanity-checked below.
    bf_min = bf_result.loc["min", :]
    bf_p25 = bf_result.loc["25%", :]
    bf_p50 = bf_result.loc["50%", :]
    bf_p75 = bf_result.loc["75%", :]
    bf_max = bf_result.loc["max", :]

    # Reindex results with the specified keys and their order, because
    # the relative order is not important.
    bf_result = bf_result.reindex(["count", "nunique", "mean", "std", "min", "max"])
    pd_result = pd_result.reindex(
        ["count", "unique", "mean", "std", "min", "max"]
        # BF counterpart of "unique" is called "nunique"
    ).rename(index={"unique": "nunique"})

    pd.testing.assert_frame_equal(
        pd_result[numerical_columns].astype("Float64"),
        bf_result[numerical_columns],
        check_index_type=False,
    )

    pd.testing.assert_frame_equal(
        pd_result[non_numerical_columns].astype("Int64"),
        bf_result[non_numerical_columns],
        check_index_type=False,
    )

    # Double-check that quantiles are at least plausible (monotone).
    # BUG FIX: the original compared bf_p50 <= bf_p50 (always true), leaving
    # the 50% -> 75% ordering entirely unchecked.
    assert (
        (bf_min <= bf_p25)
        & (bf_p25 <= bf_p50)
        & (bf_p50 <= bf_p75)
        & (bf_p75 <= bf_max)
    ).all()
2694+
2695+
26152696
def test_df_transpose():
26162697
# Include some floats to ensure type coercion
26172698
values = [[0, 3.5, True], [1, 4.5, False], [2, 6.5, None]]

‎tests/unit/operations/__init__.py

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
+92
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pytest
16+
17+
import bigframes.dtypes as dtypes
18+
from bigframes.operations.aggregations import (
19+
all_op,
20+
any_op,
21+
count_op,
22+
dense_rank_op,
23+
first_op,
24+
is_agg_op_supported,
25+
max_op,
26+
mean_op,
27+
median_op,
28+
min_op,
29+
nunique_op,
30+
product_op,
31+
rank_op,
32+
size_op,
33+
std_op,
34+
sum_op,
35+
var_op,
36+
)
37+
38+
# Every aggregation op exercised by the support-matrix tests below.
_ALL_OPS = {
    size_op,
    sum_op,
    mean_op,
    median_op,
    product_op,
    max_op,
    min_op,
    std_op,
    var_op,
    count_op,
    nunique_op,
    rank_op,
    dense_rank_op,
    all_op,
    any_op,
    first_op,
}
# The subset of ops that string columns are expected to support.
_STRING_SUPPORTED_OPS = {count_op, nunique_op}
59+
60+
61+
@pytest.mark.parametrize("dtype", dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE)
@pytest.mark.parametrize("op", _ALL_OPS)
def test_is_agg_op_supported_numerical_support_all(dtype, op):
    """Every aggregation op is supported on permissive numeric dtypes."""
    supported = is_agg_op_supported(dtype, op)
    assert supported is True
65+
66+
67+
@pytest.mark.parametrize("dtype", [dtypes.STRING_DTYPE])
@pytest.mark.parametrize("op", _STRING_SUPPORTED_OPS)
def test_is_agg_op_supported_string_support_ops(dtype, op):
    """Counting-style aggregations are supported on string columns."""
    supported = is_agg_op_supported(dtype, op)
    assert supported is True
71+
72+
73+
@pytest.mark.parametrize("dtype", [dtypes.STRING_DTYPE])
@pytest.mark.parametrize("op", _ALL_OPS - _STRING_SUPPORTED_OPS)
def test_is_agg_op_supported_string_not_support_ops(dtype, op):
    """Non-counting aggregations are rejected for string columns."""
    supported = is_agg_op_supported(dtype, op)
    assert supported is False
77+
78+
79+
@pytest.mark.parametrize(
    "dtype",
    [
        dtypes.BYTES_DTYPE,
        dtypes.DATE_DTYPE,
        dtypes.TIME_DTYPE,
        dtypes.DATETIME_DTYPE,
        dtypes.TIMESTAMP_DTYPE,
        dtypes.GEO_DTYPE,
    ],
)
@pytest.mark.parametrize("op", _ALL_OPS)
def test_is_agg_op_supported_non_numerical_no_support(dtype, op):
    """No aggregation is supported on non-numeric, non-string dtypes."""
    supported = is_agg_op_supported(dtype, op)
    assert supported is False

0 commit comments

Comments
 (0)
Failed to load comments.