feat: Support bool and bytes types in describe(include='all') (#994)

sycai · web-flow · commit cc48f58cbd94 · 2024-09-19T18:42:15.000-05:00
* feat: Support bool and bytes types in describe(include='all')

* update aggregation unit tests

* fix typo and remove unnecessary helper

* remove unnecessary dependency

* fix wording
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -2303,7 +2303,7 @@ def melt(
             self._block.melt(id_col_ids, val_col_ids, var_name, value_name)
         )
 
-    _NUMERICAL_DISCRIBE_AGGS = (
+    _NUMERIC_DESCRIBE_AGGS = (
         "count",
         "mean",
         "std",
@@ -2313,41 +2313,53 @@ def melt(
         "75%",
         "max",
     )
-    _NON_NUMERICAL_DESCRIBE_AGGS = ("count", "nunique")
+    _NON_NUMERIC_DESCRIBE_AGGS = ("count", "nunique")
 
     def describe(self, include: None | Literal["all"] = None) -> DataFrame:
+
+        allowed_non_numeric_types = {
+            bigframes.dtypes.STRING_DTYPE,
+            bigframes.dtypes.BOOL_DTYPE,
+            bigframes.dtypes.BYTES_DTYPE,
+        }
+
         if include is None:
             numeric_df = self._drop_non_numeric(permissive=False)
             if len(numeric_df.columns) == 0:
-                # Describe eligible non-numerical columns
-                result = self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS)
+                # Describe eligible non-numeric columns
+                result = self.select_dtypes(include=allowed_non_numeric_types).agg(
+                    self._NON_NUMERIC_DESCRIBE_AGGS
+                )
             else:
-                # Otherwise, only describe numerical columns
-                result = numeric_df.agg(self._NUMERICAL_DISCRIBE_AGGS)
+                # Otherwise, only describe numeric columns
+                result = numeric_df.agg(self._NUMERIC_DESCRIBE_AGGS)
             return typing.cast(DataFrame, result)
 
         elif include == "all":
             numeric_result = typing.cast(
                 DataFrame,
                 self._drop_non_numeric(permissive=False).agg(
-                    self._NUMERICAL_DISCRIBE_AGGS
+                    self._NUMERIC_DESCRIBE_AGGS
                 ),
             )
-            string_result = typing.cast(
+
+            non_numeric_result = typing.cast(
                 DataFrame,
-                self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS),
+                self.select_dtypes(include=allowed_non_numeric_types).agg(
+                    self._NON_NUMERIC_DESCRIBE_AGGS
+                ),
             )
 
             if len(numeric_result.columns) == 0:
-                return string_result
-            elif len(string_result.columns) == 0:
+                return non_numeric_result
+            elif len(non_numeric_result.columns) == 0:
                 return numeric_result
             else:
                 import bigframes.core.reshape as rs
 
                 # Use reindex after join to preserve the original column order.
                 return rs.concat(
-                    [numeric_result, string_result], axis=1
+                    [non_numeric_result, numeric_result], axis=1
                 )._reindex_columns(self.columns)
 
         else:
@@ -2549,26 +2561,18 @@ def unstack(self, level: LevelsType = -1):
         return DataFrame(pivot_block)
 
     def _drop_non_numeric(self, permissive=True) -> DataFrame:
-        numerical_types = (
+        numeric_types = (
             set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE)
             if permissive
             else set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE)
         )
         non_numeric_cols = [
             col_id
             for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
-            if dtype not in numerical_types
+            if dtype not in numeric_types
         ]
         return DataFrame(self._block.drop_columns(non_numeric_cols))
 
-    def _drop_non_string(self) -> DataFrame:
-        string_cols = [
-            col_id
-            for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
-            if dtype == bigframes.dtypes.STRING_DTYPE
-        ]
-        return DataFrame(self._block.select_columns(string_cols))
-
     def _drop_non_bool(self) -> DataFrame:
         non_bool_cols = [
             col_id
diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py
@@ -568,7 +568,7 @@ def is_agg_op_supported(dtype: dtypes.Dtype, op: AggregateOp) -> bool:
     if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE:
         return True
 
-    if dtype == dtypes.STRING_DTYPE:
+    if dtype in (dtypes.STRING_DTYPE, dtypes.BOOL_DTYPE, dtypes.BYTES_DTYPE):
         return isinstance(op, (CountOp, NuniqueOp))
 
     # For all other types, support no aggregation
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
@@ -2619,15 +2619,15 @@ def test_df_describe(scalars_dfs):
 
 @skip_legacy_pandas
 @pytest.mark.parametrize("include", [None, "all"])
-def test_df_describe_non_numerical(scalars_dfs, include):
+def test_df_describe_non_numeric(scalars_dfs, include):
     scalars_df, scalars_pandas_df = scalars_dfs
 
-    non_numerical_columns = ["string_col"]
+    non_numeric_columns = ["string_col", "bytes_col", "bool_col"]
 
-    modified_bf = scalars_df[non_numerical_columns]
+    modified_bf = scalars_df[non_numeric_columns]
     bf_result = modified_bf.describe(include=include).to_pandas()
 
-    modified_pd_df = scalars_pandas_df[non_numerical_columns]
+    modified_pd_df = scalars_pandas_df[non_numeric_columns]
     pd_result = modified_pd_df.describe(include=include)
 
     # Reindex results with the specified keys and their order, because
@@ -2639,8 +2639,8 @@ def test_df_describe_non_numerical(scalars_dfs, include):
     ).rename(index={"unique": "nunique"})
 
     pd.testing.assert_frame_equal(
-        pd_result[non_numerical_columns].astype("Int64"),
-        bf_result[non_numerical_columns],
+        pd_result[non_numeric_columns].astype("Int64"),
+        bf_result[non_numeric_columns],
         check_index_type=False,
     )
 
@@ -2649,12 +2649,12 @@ def test_df_describe_non_numerical(scalars_dfs, include):
 def test_df_describe_mixed_types_include_all(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
 
-    numerical_columns = [
+    numeric_columns = [
         "int64_col",
         "float64_col",
     ]
-    non_numerical_columns = ["string_col"]
-    supported_columns = numerical_columns + non_numerical_columns
+    non_numeric_columns = ["string_col"]
+    supported_columns = numeric_columns + non_numeric_columns
 
     modified_bf = scalars_df[supported_columns]
     bf_result = modified_bf.describe(include="all").to_pandas()
@@ -2678,14 +2678,14 @@ def test_df_describe_mixed_types_include_all(scalars_dfs):
     ).rename(index={"unique": "nunique"})
 
     pd.testing.assert_frame_equal(
-        pd_result[numerical_columns].astype("Float64"),
-        bf_result[numerical_columns],
+        pd_result[numeric_columns].astype("Float64"),
+        bf_result[numeric_columns],
         check_index_type=False,
     )
 
     pd.testing.assert_frame_equal(
-        pd_result[non_numerical_columns].astype("Int64"),
-        bf_result[non_numerical_columns],
+        pd_result[non_numeric_columns].astype("Int64"),
+        bf_result[non_numeric_columns],
         check_index_type=False,
     )
 
diff --git a/tests/unit/operations/test_aggregations.py b/tests/unit/operations/test_aggregations.py
@@ -55,38 +55,29 @@
         first_op,
     ]
 )
-_STRING_SUPPORTED_OPS = set([count_op, nunique_op])
 
 
 @pytest.mark.parametrize("dtype", dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE)
 @pytest.mark.parametrize("op", _ALL_OPS)
-def test_is_agg_op_supported_numerical_support_all(dtype, op):
+def test_is_agg_op_supported_numeric_support_all(dtype, op):
     assert is_agg_op_supported(dtype, op) is True
 
 
-@pytest.mark.parametrize("dtype", [dtypes.STRING_DTYPE])
-@pytest.mark.parametrize("op", _STRING_SUPPORTED_OPS)
-def test_is_agg_op_supported_string_support_ops(dtype, op):
-    assert is_agg_op_supported(dtype, op) is True
-
-
-@pytest.mark.parametrize("dtype", [dtypes.STRING_DTYPE])
-@pytest.mark.parametrize("op", _ALL_OPS - _STRING_SUPPORTED_OPS)
-def test_is_agg_op_supported_string_not_support_ops(dtype, op):
-    assert is_agg_op_supported(dtype, op) is False
-
-
 @pytest.mark.parametrize(
-    "dtype",
+    ("dtype", "supported_ops"),
     [
-        dtypes.BYTES_DTYPE,
-        dtypes.DATE_DTYPE,
-        dtypes.TIME_DTYPE,
-        dtypes.DATETIME_DTYPE,
-        dtypes.TIMESTAMP_DTYPE,
-        dtypes.GEO_DTYPE,
+        (dtypes.STRING_DTYPE, {count_op, nunique_op}),
+        (dtypes.BYTES_DTYPE, {count_op, nunique_op}),
+        (dtypes.DATE_DTYPE, set()),
+        (dtypes.TIME_DTYPE, set()),
+        (dtypes.DATETIME_DTYPE, set()),
+        (dtypes.TIMESTAMP_DTYPE, set()),
+        (dtypes.GEO_DTYPE, set()),
     ],
 )
-@pytest.mark.parametrize("op", _ALL_OPS)
-def test_is_agg_op_supported_non_numerical_no_support(dtype, op):
-    assert is_agg_op_supported(dtype, op) is False
+def test_is_agg_op_supported_non_numeric(dtype, supported_ops):
+    for op in supported_ops:
+        assert is_agg_op_supported(dtype, op) is True
+
+    for op in _ALL_OPS - supported_ops:
+        assert is_agg_op_supported(dtype, op) is False