Skip to content

Commit cc48f58

Browse files
authored
feat: Support bool and bytes types in describe(include='all') (#994)
* feat: Support bool and bytes types in describe(include='all')
* update aggregation unit tests
* fix typo and remove unnecessary helper
* remove unnecessary dep
* fix wording
1 parent 4221632 commit cc48f58

File tree

4 files changed

+55
-60
lines changed

4 files changed

+55
-60
lines changed

bigframes/dataframe.py

+26-22
Original file line numberDiff line numberDiff line change
@@ -2303,7 +2303,7 @@ def melt(
23032303
self._block.melt(id_col_ids, val_col_ids, var_name, value_name)
23042304
)
23052305

2306-
_NUMERICAL_DISCRIBE_AGGS = (
2306+
_NUMERIC_DESCRIBE_AGGS = (
23072307
"count",
23082308
"mean",
23092309
"std",
@@ -2313,41 +2313,53 @@ def melt(
23132313
"75%",
23142314
"max",
23152315
)
2316-
_NON_NUMERICAL_DESCRIBE_AGGS = ("count", "nunique")
2316+
_NON_NUMERIC_DESCRIBE_AGGS = ("count", "nunique")
23172317

23182318
def describe(self, include: None | Literal["all"] = None) -> DataFrame:
2319+
2320+
allowed_non_numeric_types = {
2321+
bigframes.dtypes.STRING_DTYPE,
2322+
bigframes.dtypes.BOOL_DTYPE,
2323+
bigframes.dtypes.BYTES_DTYPE,
2324+
}
2325+
23192326
if include is None:
23202327
numeric_df = self._drop_non_numeric(permissive=False)
23212328
if len(numeric_df.columns) == 0:
2322-
# Describe eligible non-numerical columns
2323-
result = self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS)
2329+
# Describe eligible non-numeric columns
2330+
result = self.select_dtypes(include=allowed_non_numeric_types).agg(
2331+
self._NON_NUMERIC_DESCRIBE_AGGS
2332+
)
23242333
else:
2325-
# Otherwise, only describe numerical columns
2326-
result = numeric_df.agg(self._NUMERICAL_DISCRIBE_AGGS)
2334+
# Otherwise, only describe numeric columns
2335+
result = numeric_df.agg(self._NUMERIC_DESCRIBE_AGGS)
23272336
return typing.cast(DataFrame, result)
23282337

23292338
elif include == "all":
23302339
numeric_result = typing.cast(
23312340
DataFrame,
23322341
self._drop_non_numeric(permissive=False).agg(
2333-
self._NUMERICAL_DISCRIBE_AGGS
2342+
self._NUMERIC_DESCRIBE_AGGS
23342343
),
23352344
)
2336-
string_result = typing.cast(
2345+
2346+
non_numeric_result = typing.cast(
23372347
DataFrame,
2338-
self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS),
2348+
self.select_dtypes(include=allowed_non_numeric_types).agg(
2349+
self._NON_NUMERIC_DESCRIBE_AGGS
2350+
),
23392351
)
23402352

23412353
if len(numeric_result.columns) == 0:
2342-
return string_result
2343-
elif len(string_result.columns) == 0:
2354+
return non_numeric_result
2355+
elif len(non_numeric_result.columns) == 0:
23442356
return numeric_result
23452357
else:
23462358
import bigframes.core.reshape as rs
23472359

23482360
# Use reindex after join to preserve the original column order.
23492361
return rs.concat(
2350-
[numeric_result, string_result], axis=1
2362+
[non_numeric_result, numeric_result], axis=1
23512363
)._reindex_columns(self.columns)
23522364

23532365
else:
@@ -2549,26 +2561,18 @@ def unstack(self, level: LevelsType = -1):
25492561
return DataFrame(pivot_block)
25502562

25512563
def _drop_non_numeric(self, permissive=True) -> DataFrame:
2552-
numerical_types = (
2564+
numeric_types = (
25532565
set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE)
25542566
if permissive
25552567
else set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE)
25562568
)
25572569
non_numeric_cols = [
25582570
col_id
25592571
for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
2560-
if dtype not in numerical_types
2572+
if dtype not in numeric_types
25612573
]
25622574
return DataFrame(self._block.drop_columns(non_numeric_cols))
25632575

2564-
def _drop_non_string(self) -> DataFrame:
2565-
string_cols = [
2566-
col_id
2567-
for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
2568-
if dtype == bigframes.dtypes.STRING_DTYPE
2569-
]
2570-
return DataFrame(self._block.select_columns(string_cols))
2571-
25722576
def _drop_non_bool(self) -> DataFrame:
25732577
non_bool_cols = [
25742578
col_id

bigframes/operations/aggregations.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -568,7 +568,7 @@ def is_agg_op_supported(dtype: dtypes.Dtype, op: AggregateOp) -> bool:
568568
if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE:
569569
return True
570570

571-
if dtype == dtypes.STRING_DTYPE:
571+
if dtype in (dtypes.STRING_DTYPE, dtypes.BOOL_DTYPE, dtypes.BYTES_DTYPE):
572572
return isinstance(op, (CountOp, NuniqueOp))
573573

574574
# For all other types, support no aggregation

tests/system/small/test_dataframe.py

+13-13
Original file line numberDiff line numberDiff line change
@@ -2619,15 +2619,15 @@ def test_df_describe(scalars_dfs):
26192619

26202620
@skip_legacy_pandas
26212621
@pytest.mark.parametrize("include", [None, "all"])
2622-
def test_df_describe_non_numerical(scalars_dfs, include):
2622+
def test_df_describe_non_numeric(scalars_dfs, include):
26232623
scalars_df, scalars_pandas_df = scalars_dfs
26242624

2625-
non_numerical_columns = ["string_col"]
2625+
non_numeric_columns = ["string_col", "bytes_col", "bool_col"]
26262626

2627-
modified_bf = scalars_df[non_numerical_columns]
2627+
modified_bf = scalars_df[non_numeric_columns]
26282628
bf_result = modified_bf.describe(include=include).to_pandas()
26292629

2630-
modified_pd_df = scalars_pandas_df[non_numerical_columns]
2630+
modified_pd_df = scalars_pandas_df[non_numeric_columns]
26312631
pd_result = modified_pd_df.describe(include=include)
26322632

26332633
# Reindex results with the specified keys and their order, because
@@ -2639,8 +2639,8 @@ def test_df_describe_non_numerical(scalars_dfs, include):
26392639
).rename(index={"unique": "nunique"})
26402640

26412641
pd.testing.assert_frame_equal(
2642-
pd_result[non_numerical_columns].astype("Int64"),
2643-
bf_result[non_numerical_columns],
2642+
pd_result[non_numeric_columns].astype("Int64"),
2643+
bf_result[non_numeric_columns],
26442644
check_index_type=False,
26452645
)
26462646

@@ -2649,12 +2649,12 @@ def test_df_describe_non_numerical(scalars_dfs, include):
26492649
def test_df_describe_mixed_types_include_all(scalars_dfs):
26502650
scalars_df, scalars_pandas_df = scalars_dfs
26512651

2652-
numerical_columns = [
2652+
numeric_columns = [
26532653
"int64_col",
26542654
"float64_col",
26552655
]
2656-
non_numerical_columns = ["string_col"]
2657-
supported_columns = numerical_columns + non_numerical_columns
2656+
non_numeric_columns = ["string_col"]
2657+
supported_columns = numeric_columns + non_numeric_columns
26582658

26592659
modified_bf = scalars_df[supported_columns]
26602660
bf_result = modified_bf.describe(include="all").to_pandas()
@@ -2678,14 +2678,14 @@ def test_df_describe_mixed_types_include_all(scalars_dfs):
26782678
).rename(index={"unique": "nunique"})
26792679

26802680
pd.testing.assert_frame_equal(
2681-
pd_result[numerical_columns].astype("Float64"),
2682-
bf_result[numerical_columns],
2681+
pd_result[numeric_columns].astype("Float64"),
2682+
bf_result[numeric_columns],
26832683
check_index_type=False,
26842684
)
26852685

26862686
pd.testing.assert_frame_equal(
2687-
pd_result[non_numerical_columns].astype("Int64"),
2688-
bf_result[non_numerical_columns],
2687+
pd_result[non_numeric_columns].astype("Int64"),
2688+
bf_result[non_numeric_columns],
26892689
check_index_type=False,
26902690
)
26912691

tests/unit/operations/test_aggregations.py

+15-24
Original file line numberDiff line numberDiff line change
@@ -55,38 +55,29 @@
5555
first_op,
5656
]
5757
)
58-
_STRING_SUPPORTED_OPS = set([count_op, nunique_op])
5958

6059

6160
@pytest.mark.parametrize("dtype", dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE)
6261
@pytest.mark.parametrize("op", _ALL_OPS)
63-
def test_is_agg_op_supported_numerical_support_all(dtype, op):
62+
def test_is_agg_op_supported_numeric_support_all(dtype, op):
6463
assert is_agg_op_supported(dtype, op) is True
6564

6665

67-
@pytest.mark.parametrize("dtype", [dtypes.STRING_DTYPE])
68-
@pytest.mark.parametrize("op", _STRING_SUPPORTED_OPS)
69-
def test_is_agg_op_supported_string_support_ops(dtype, op):
70-
assert is_agg_op_supported(dtype, op) is True
71-
72-
73-
@pytest.mark.parametrize("dtype", [dtypes.STRING_DTYPE])
74-
@pytest.mark.parametrize("op", _ALL_OPS - _STRING_SUPPORTED_OPS)
75-
def test_is_agg_op_supported_string_not_support_ops(dtype, op):
76-
assert is_agg_op_supported(dtype, op) is False
77-
78-
7966
@pytest.mark.parametrize(
80-
"dtype",
67+
("dtype", "supported_ops"),
8168
[
82-
dtypes.BYTES_DTYPE,
83-
dtypes.DATE_DTYPE,
84-
dtypes.TIME_DTYPE,
85-
dtypes.DATETIME_DTYPE,
86-
dtypes.TIMESTAMP_DTYPE,
87-
dtypes.GEO_DTYPE,
69+
(dtypes.STRING_DTYPE, {count_op, nunique_op}),
70+
(dtypes.BYTES_DTYPE, {count_op, nunique_op}),
71+
(dtypes.DATE_DTYPE, set()),
72+
(dtypes.TIME_DTYPE, set()),
73+
(dtypes.DATETIME_DTYPE, set()),
74+
(dtypes.TIMESTAMP_DTYPE, set()),
75+
(dtypes.GEO_DTYPE, set()),
8876
],
8977
)
90-
@pytest.mark.parametrize("op", _ALL_OPS)
91-
def test_is_agg_op_supported_non_numerical_no_support(dtype, op):
92-
assert is_agg_op_supported(dtype, op) is False
78+
def test_is_agg_op_supported_non_numeric(dtype, supported_ops):
79+
for op in supported_ops:
80+
assert is_agg_op_supported(dtype, op) is True
81+
82+
for op in _ALL_OPS - supported_ops:
83+
assert is_agg_op_supported(dtype, op) is False

0 commit comments

Comments
 (0)