Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit deac6d2

Browse files
authored Sep 16, 2024
feat: add "include" param to describe for string types (#973)
1 parent 596b03b commit deac6d2

File tree

5 files changed

+267
-18
lines changed

5 files changed

+267
-18
lines changed
 

‎bigframes/dataframe.py

+70-18
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
import datetime
2020
import inspect
21+
import itertools
2122
import re
2223
import sys
2324
import textwrap
@@ -70,6 +71,7 @@
7071
import bigframes.exceptions
7172
import bigframes.formatting_helpers as formatter
7273
import bigframes.operations as ops
74+
import bigframes.operations.aggregations
7375
import bigframes.operations.aggregations as agg_ops
7476
import bigframes.operations.plotting as plotting
7577
import bigframes.operations.structs
@@ -2207,14 +2209,17 @@ def agg(
22072209
self, func: str | typing.Sequence[str]
22082210
) -> DataFrame | bigframes.series.Series:
22092211
if utils.is_list_like(func):
2210-
if any(
2211-
dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE
2212-
for dtype in self.dtypes
2213-
):
2214-
raise NotImplementedError(
2215-
f"Multiple aggregations only supported on numeric columns. {constants.FEEDBACK_LINK}"
2216-
)
22172212
aggregations = [agg_ops.lookup_agg_func(f) for f in func]
2213+
2214+
for dtype, agg in itertools.product(self.dtypes, aggregations):
2215+
if not bigframes.operations.aggregations.is_agg_op_supported(
2216+
dtype, agg
2217+
):
2218+
raise NotImplementedError(
2219+
f"Type {dtype} does not support aggregation {agg}. "
2220+
f"Share your usecase with the BigQuery DataFrames team at the {constants.FEEDBACK_LINK}"
2221+
)
2222+
22182223
return DataFrame(
22192224
self._block.summarize(
22202225
self._block.value_columns,
@@ -2280,16 +2285,55 @@ def melt(
22802285
self._block.melt(id_col_ids, val_col_ids, var_name, value_name)
22812286
)
22822287

2283-
# Aggregations reported by describe() for numerical columns. Quartile rows
# ("25%"/"50%"/"75%") are approximate quantiles in BigQuery.
# NOTE: renamed from the misspelled `_NUMERICAL_DISCRIBE_AGGS`; all uses are
# local to this class.
_NUMERICAL_DESCRIBE_AGGS = (
    "count",
    "mean",
    "std",
    "min",
    "25%",
    "50%",
    "75%",
    "max",
)
# Aggregations reported by describe() for non-numerical (string) columns.
_NON_NUMERICAL_DESCRIBE_AGGS = ("count", "nunique")

def describe(self, include: None | Literal["all"] = None) -> DataFrame:
    """Generate descriptive statistics for the DataFrame's columns.

    Args:
        include:
            ``None`` (default) describes numerical columns only, falling back
            to eligible string columns when no numerical columns exist.
            ``"all"`` describes both numerical and string columns together.

    Returns:
        DataFrame: one row per summary statistic, one column per described
        input column.

    Raises:
        ValueError: if ``include`` is neither ``None`` nor ``"all"``.
    """
    if include is None:
        numeric_df = self._drop_non_numeric(permissive=False)
        if len(numeric_df.columns) == 0:
            # No numerical columns present: describe eligible non-numerical
            # (string) columns instead.
            result = self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS)
        else:
            # Otherwise, only describe numerical columns
            result = numeric_df.agg(self._NUMERICAL_DESCRIBE_AGGS)
        return typing.cast(DataFrame, result)

    elif include == "all":
        numeric_result = typing.cast(
            DataFrame,
            self._drop_non_numeric(permissive=False).agg(
                self._NUMERICAL_DESCRIBE_AGGS
            ),
        )
        string_result = typing.cast(
            DataFrame,
            self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS),
        )

        if len(numeric_result.columns) == 0:
            return string_result
        elif len(string_result.columns) == 0:
            return numeric_result
        else:
            import bigframes.core.reshape as rs

            # Use reindex after join to preserve the original column order.
            return rs.concat(
                [numeric_result, string_result], axis=1
            )._reindex_columns(self.columns)

    else:
        raise ValueError(f"Unsupported include type: {include}")
22932337

22942338
def skew(self, *, numeric_only: bool = False):
22952339
if not numeric_only:
@@ -2487,18 +2531,26 @@ def unstack(self, level: LevelsType = -1):
24872531
return DataFrame(pivot_block)
24882532

24892533
def _drop_non_numeric(self, permissive=True) -> DataFrame:
    """Return a new DataFrame keeping only numeric-typed value columns.

    Args:
        permissive:
            When True, keep the wider (permissive) set of BigFrames numeric
            dtypes; otherwise keep only the restrictive set.
    """
    if permissive:
        numerical_types = set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE)
    else:
        numerical_types = set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE)

    # Collect the ids of columns whose dtype is outside the kept set.
    cols_to_drop = []
    for col_id, dtype in zip(self._block.value_columns, self._block.dtypes):
        if dtype not in numerical_types:
            cols_to_drop.append(col_id)
    return DataFrame(self._block.drop_columns(cols_to_drop))
25012545

2546+
def _drop_non_string(self) -> DataFrame:
    """Return a new DataFrame keeping only string-typed value columns."""
    kept_cols = []
    for col_id, dtype in zip(self._block.value_columns, self._block.dtypes):
        if dtype == bigframes.dtypes.STRING_DTYPE:
            kept_cols.append(col_id)
    return DataFrame(self._block.select_columns(kept_cols))
2553+
25022554
def _drop_non_bool(self) -> DataFrame:
25032555
non_bool_cols = [
25042556
col_id

‎bigframes/operations/aggregations.py

+11
Original file line numberDiff line numberDiff line change
@@ -562,3 +562,14 @@ def lookup_agg_func(key: str) -> typing.Union[UnaryAggregateOp, NullaryAggregate
562562
return _AGGREGATIONS_LOOKUP[key]
563563
else:
564564
raise ValueError(f"Unrecognize aggregate function: {key}")
565+
566+
567+
def is_agg_op_supported(dtype: dtypes.Dtype, op: AggregateOp) -> bool:
    """Return True if aggregation ``op`` can run on columns of ``dtype``.

    Permissive numeric dtypes support every aggregation; strings support only
    counting-style aggregations; all other dtypes support none.
    """
    if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE:
        return True
    # Strings: only count/nunique. Everything else: unsupported.
    return dtype == dtypes.STRING_DTYPE and isinstance(op, (CountOp, NuniqueOp))

‎tests/system/small/test_dataframe.py

+81
Original file line numberDiff line numberDiff line change
@@ -2612,6 +2612,87 @@ def test_df_describe(scalars_dfs):
26122612
).all()
26132613

26142614

2615+
@skip_legacy_pandas
@pytest.mark.parametrize("include", [None, "all"])
def test_df_describe_non_numerical(scalars_dfs, include):
    """describe() on a string-only frame matches pandas for both include modes."""
    scalars_df, scalars_pandas_df = scalars_dfs
    string_cols = ["string_col"]

    bf_result = scalars_df[string_cols].describe(include=include).to_pandas()
    pd_result = scalars_pandas_df[string_cols].describe(include=include)

    # Align the statistic indexes: their relative order is not important, and
    # pandas names the distinct-count row "unique" where BigFrames uses
    # "nunique".
    bf_result = bf_result.reindex(["count", "nunique"])
    pd_result = pd_result.reindex(["count", "unique"]).rename(
        index={"unique": "nunique"}
    )

    pd.testing.assert_frame_equal(
        pd_result[string_cols].astype("Int64"),
        bf_result[string_cols],
        check_index_type=False,
    )
2641+
2642+
2643+
@skip_legacy_pandas
def test_df_describe_mixed_types_include_all(scalars_dfs):
    """describe(include="all") over mixed numeric/string columns matches pandas."""
    scalars_df, scalars_pandas_df = scalars_dfs

    numerical_columns = [
        "int64_col",
        "float64_col",
    ]
    non_numerical_columns = ["string_col"]
    supported_columns = numerical_columns + non_numerical_columns

    modified_bf = scalars_df[supported_columns]
    bf_result = modified_bf.describe(include="all").to_pandas()

    modified_pd_df = scalars_pandas_df[supported_columns]
    pd_result = modified_pd_df.describe(include="all")

    # Capture quantile rows before dropping them from the exact comparison;
    # BigQuery quantiles are approximate, so they are only sanity-checked below.
    bf_min = bf_result.loc["min", :]
    bf_p25 = bf_result.loc["25%", :]
    bf_p50 = bf_result.loc["50%", :]
    bf_p75 = bf_result.loc["75%", :]
    bf_max = bf_result.loc["max", :]

    # Reindex results with the specified keys and their order, because
    # the relative order is not important.
    bf_result = bf_result.reindex(["count", "nunique", "mean", "std", "min", "max"])
    pd_result = pd_result.reindex(
        ["count", "unique", "mean", "std", "min", "max"]
        # BF counterpart of "unique" is called "nunique"
    ).rename(index={"unique": "nunique"})

    pd.testing.assert_frame_equal(
        pd_result[numerical_columns].astype("Float64"),
        bf_result[numerical_columns],
        check_index_type=False,
    )

    pd.testing.assert_frame_equal(
        pd_result[non_numerical_columns].astype("Int64"),
        bf_result[non_numerical_columns],
        check_index_type=False,
    )

    # Double-check that quantiles are at least plausible (monotone).
    # BUG FIX: the original compared bf_p50 <= bf_p50 (always true), leaving
    # the 50% -> 75% ordering entirely unchecked.
    assert (
        (bf_min <= bf_p25)
        & (bf_p25 <= bf_p50)
        & (bf_p50 <= bf_p75)
        & (bf_p75 <= bf_max)
    ).all()
2694+
2695+
26152696
def test_df_transpose():
26162697
# Include some floats to ensure type coercion
26172698
values = [[0, 3.5, True], [1, 4.5, False], [2, 6.5, None]]

‎tests/unit/operations/__init__.py

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
+92
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pytest
16+
17+
import bigframes.dtypes as dtypes
18+
from bigframes.operations.aggregations import (
19+
all_op,
20+
any_op,
21+
count_op,
22+
dense_rank_op,
23+
first_op,
24+
is_agg_op_supported,
25+
max_op,
26+
mean_op,
27+
median_op,
28+
min_op,
29+
nunique_op,
30+
product_op,
31+
rank_op,
32+
size_op,
33+
std_op,
34+
sum_op,
35+
var_op,
36+
)
37+
38+
# Every aggregation op exercised by the support-matrix tests below.
_ALL_OPS = {
    size_op,
    sum_op,
    mean_op,
    median_op,
    product_op,
    max_op,
    min_op,
    std_op,
    var_op,
    count_op,
    nunique_op,
    rank_op,
    dense_rank_op,
    all_op,
    any_op,
    first_op,
}
# The subset of ops that string columns are expected to support.
_STRING_SUPPORTED_OPS = {count_op, nunique_op}
59+
60+
61+
@pytest.mark.parametrize("dtype", dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE)
@pytest.mark.parametrize("op", _ALL_OPS)
def test_is_agg_op_supported_numerical_support_all(dtype, op):
    """Every aggregation op is supported on permissive numeric dtypes."""
    supported = is_agg_op_supported(dtype, op)
    assert supported is True
65+
66+
67+
@pytest.mark.parametrize("dtype", [dtypes.STRING_DTYPE])
@pytest.mark.parametrize("op", _STRING_SUPPORTED_OPS)
def test_is_agg_op_supported_string_support_ops(dtype, op):
    """Counting-style aggregations are supported on string columns."""
    supported = is_agg_op_supported(dtype, op)
    assert supported is True
71+
72+
73+
@pytest.mark.parametrize("dtype", [dtypes.STRING_DTYPE])
@pytest.mark.parametrize("op", _ALL_OPS - _STRING_SUPPORTED_OPS)
def test_is_agg_op_supported_string_not_support_ops(dtype, op):
    """Non-counting aggregations are rejected for string columns."""
    supported = is_agg_op_supported(dtype, op)
    assert supported is False
77+
78+
79+
@pytest.mark.parametrize(
    "dtype",
    [
        dtypes.BYTES_DTYPE,
        dtypes.DATE_DTYPE,
        dtypes.TIME_DTYPE,
        dtypes.DATETIME_DTYPE,
        dtypes.TIMESTAMP_DTYPE,
        dtypes.GEO_DTYPE,
    ],
)
@pytest.mark.parametrize("op", _ALL_OPS)
def test_is_agg_op_supported_non_numerical_no_support(dtype, op):
    """No aggregation is supported on non-numeric, non-string dtypes."""
    supported = is_agg_op_supported(dtype, op)
    assert supported is False

0 commit comments

Comments
 (0)
Failed to load comments.