Skip to content

Commit 1fca588

Browse files
authored
feat: add GroupBy.size() to get number of rows in each group (#479)
1 parent 1832778 commit 1fca588

File tree

7 files changed

+194
-24
lines changed

7 files changed

+194
-24
lines changed

bigframes/core/blocks.py

+28
Original file line numberDiff line numberDiff line change
@@ -1014,6 +1014,34 @@ def aggregate_all_and_stack(
10141014
index_labels=self.index.names,
10151015
)
10161016

1017+
def aggregate_size(
1018+
self,
1019+
by_column_ids: typing.Sequence[str] = (),
1020+
*,
1021+
dropna: bool = True,
1022+
):
1023+
"""Returns a block object to compute the size(s) of groups."""
1024+
agg_specs = [
1025+
(ex.NullaryAggregation(agg_ops.SizeOp()), guid.generate_guid()),
1026+
]
1027+
output_col_ids = [agg_spec[1] for agg_spec in agg_specs]
1028+
result_expr = self.expr.aggregate(agg_specs, by_column_ids, dropna=dropna)
1029+
names: typing.List[Label] = []
1030+
for by_col_id in by_column_ids:
1031+
if by_col_id in self.value_columns:
1032+
names.append(self.col_id_to_label[by_col_id])
1033+
else:
1034+
names.append(self.col_id_to_index_name[by_col_id])
1035+
return (
1036+
Block(
1037+
result_expr,
1038+
index_columns=by_column_ids,
1039+
column_labels=["size"],
1040+
index_labels=names,
1041+
),
1042+
output_col_ids,
1043+
)
1044+
10171045
def select_column(self, id: str) -> Block:
10181046
return self.select_columns([id])
10191047

bigframes/core/compile/aggregate_compiler.py

+18-1
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ def compile_aggregate(
3636
bindings: typing.Dict[str, ibis_types.Value],
3737
order_by: typing.Sequence[ibis_types.Value] = [],
3838
) -> ibis_types.Value:
39+
if isinstance(aggregate, ex.NullaryAggregation):
40+
return compile_nullary_agg(aggregate.op)
3941
if isinstance(aggregate, ex.UnaryAggregation):
4042
input = scalar_compiler.compile_expression(aggregate.arg, bindings=bindings)
4143
if aggregate.op.can_order_by:
@@ -55,7 +57,9 @@ def compile_analytic(
5557
window: window_spec.WindowSpec,
5658
bindings: typing.Dict[str, ibis_types.Value],
5759
) -> ibis_types.Value:
58-
if isinstance(aggregate, ex.UnaryAggregation):
60+
if isinstance(aggregate, ex.NullaryAggregation):
61+
return compile_nullary_agg(aggregate.op, window)
62+
elif isinstance(aggregate, ex.UnaryAggregation):
5963
input = scalar_compiler.compile_expression(aggregate.arg, bindings=bindings)
6064
return compile_unary_agg(aggregate.op, input, window)
6165
elif isinstance(aggregate, ex.BinaryAggregation):
@@ -93,6 +97,14 @@ def compile_ordered_unary_agg(
9397
raise ValueError(f"Can't compile unrecognized operation: {op}")
9498

9599

100+
@functools.singledispatch
101+
def compile_nullary_agg(
102+
op: agg_ops.WindowOp,
103+
window: Optional[window_spec.WindowSpec] = None,
104+
) -> ibis_types.Value:
105+
raise ValueError(f"Can't compile unrecognized operation: {op}")
106+
107+
96108
def numeric_op(operation):
97109
@functools.wraps(operation)
98110
def constrained_op(
@@ -118,6 +130,11 @@ def constrained_op(
118130
### Specific Op implementations Below
119131

120132

133+
@compile_nullary_agg.register
134+
def _(op: agg_ops.SizeOp, window=None) -> ibis_types.NumericValue:
135+
return _apply_window_if_present(vendored_ibis_ops.count(1), window)
136+
137+
121138
@compile_unary_agg.register
122139
@numeric_op
123140
def _(

bigframes/core/expression.py

+10
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,16 @@ def output_type(
4646
...
4747

4848

49+
@dataclasses.dataclass(frozen=True)
50+
class NullaryAggregation(Aggregation):
51+
op: agg_ops.NullaryWindowOp = dataclasses.field()
52+
53+
def output_type(
54+
self, input_types: dict[str, bigframes.dtypes.Dtype]
55+
) -> dtypes.ExpressionType:
56+
return self.op.output_type()
57+
58+
4959
@dataclasses.dataclass(frozen=True)
5060
class UnaryAggregation(Aggregation):
5161
op: agg_ops.UnaryWindowOp = dataclasses.field()

bigframes/core/groupby/__init__.py

+21
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,20 @@ def __getitem__(
104104
dropna=self._dropna,
105105
)
106106

107+
def size(self) -> typing.Union[df.DataFrame, series.Series]:
108+
agg_block, _ = self._block.aggregate_size(
109+
by_column_ids=self._by_col_ids,
110+
dropna=self._dropna,
111+
)
112+
agg_block = agg_block.with_column_labels(pd.Index(["size"]))
113+
dataframe = df.DataFrame(agg_block)
114+
115+
if self._as_index:
116+
series = dataframe["size"]
117+
return series.rename(None)
118+
else:
119+
return self._convert_index(dataframe)
120+
107121
def sum(self, numeric_only: bool = False, *args) -> df.DataFrame:
108122
if not numeric_only:
109123
self._raise_on_non_numeric("sum")
@@ -520,6 +534,13 @@ def std(self, *args, **kwargs) -> series.Series:
520534
def var(self, *args, **kwargs) -> series.Series:
521535
return self._aggregate(agg_ops.var_op)
522536

537+
def size(self) -> series.Series:
538+
agg_block, _ = self._block.aggregate_size(
539+
by_column_ids=self._by_col_ids,
540+
dropna=self._dropna,
541+
)
542+
return series.Series(agg_block, name=self._value_name)
543+
523544
def skew(self, *args, **kwargs) -> series.Series:
524545
block = block_ops.skew(self._block, [self._value_column], self._by_col_ids)
525546
return series.Series(block)

bigframes/operations/aggregations.py

+23
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,13 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
4747
...
4848

4949

50+
@dataclasses.dataclass(frozen=True)
51+
class NullaryWindowOp(WindowOp):
52+
@property
53+
def arguments(self) -> int:
54+
return 0
55+
56+
5057
@dataclasses.dataclass(frozen=True)
5158
class UnaryWindowOp(WindowOp):
5259
@property
@@ -72,6 +79,13 @@ def arguments(self) -> int:
7279
...
7380

7481

82+
@dataclasses.dataclass(frozen=True)
83+
class NullaryAggregateOp(AggregateOp, NullaryWindowOp):
84+
@property
85+
def arguments(self) -> int:
86+
return 0
87+
88+
7589
@dataclasses.dataclass(frozen=True)
7690
class UnaryAggregateOp(AggregateOp, UnaryWindowOp):
7791
@property
@@ -86,6 +100,14 @@ def arguments(self) -> int:
86100
return 2
87101

88102

103+
@dataclasses.dataclass(frozen=True)
104+
class SizeOp(NullaryAggregateOp):
105+
name: ClassVar[str] = "size"
106+
107+
def output_type(self, *input_types: dtypes.ExpressionType):
108+
return dtypes.INT_DTYPE
109+
110+
89111
@dataclasses.dataclass(frozen=True)
90112
class SumOp(UnaryAggregateOp):
91113
name: ClassVar[str] = "sum"
@@ -446,6 +468,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
446468
)
447469

448470

471+
size_op = SizeOp()
449472
sum_op = SumOp()
450473
mean_op = MeanOp()
451474
median_op = MedianOp()

tests/system/small/test_groupby.py

+84-23
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@
1818
import bigframes.pandas as bpd
1919
from tests.system.utils import assert_pandas_df_equal
2020

21+
# =================
22+
# DataFrame.groupby
23+
# =================
24+
2125

2226
@pytest.mark.parametrize(
2327
("operator"),
@@ -269,21 +273,26 @@ def test_dataframe_groupby_analytic(
269273
pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False)
270274

271275

272-
def test_series_groupby_skew(scalars_df_index, scalars_pandas_df_index):
273-
bf_result = scalars_df_index.groupby("bool_col")["int64_too"].skew().to_pandas()
274-
pd_result = scalars_pandas_df_index.groupby("bool_col")["int64_too"].skew()
276+
def test_dataframe_groupby_size_as_index_false(
277+
scalars_df_index, scalars_pandas_df_index
278+
):
279+
bf_result = scalars_df_index.groupby("string_col", as_index=False).size()
280+
bf_result_computed = bf_result.to_pandas()
281+
pd_result = scalars_pandas_df_index.groupby("string_col", as_index=False).size()
275282

276-
pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
283+
pd.testing.assert_frame_equal(
284+
pd_result, bf_result_computed, check_dtype=False, check_index_type=False
285+
)
277286

278287

279-
def test_series_groupby_kurt(scalars_df_index, scalars_pandas_df_index):
280-
bf_result = scalars_df_index.groupby("bool_col")["int64_too"].kurt().to_pandas()
281-
# Pandas doesn't have groupby.kurt yet: https://github.com/pandas-dev/pandas/issues/40139
282-
pd_result = scalars_pandas_df_index.groupby("bool_col")["int64_too"].apply(
283-
pd.Series.kurt
284-
)
288+
def test_dataframe_groupby_size_as_index_true(
289+
scalars_df_index, scalars_pandas_df_index
290+
):
291+
bf_result = scalars_df_index.groupby("string_col", as_index=True).size()
292+
pd_result = scalars_pandas_df_index.groupby("string_col", as_index=True).size()
293+
bf_result_computed = bf_result.to_pandas()
285294

286-
pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
295+
pd.testing.assert_series_equal(pd_result, bf_result_computed, check_dtype=False)
287296

288297

289298
def test_dataframe_groupby_skew(scalars_df_index, scalars_pandas_df_index):
@@ -356,6 +365,30 @@ def test_dataframe_groupby_getitem_list(
356365
pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False)
357366

358367

368+
def test_dataframe_groupby_nonnumeric_with_mean():
369+
df = pd.DataFrame(
370+
{
371+
"key1": ["a", "a", "a", "b"],
372+
"key2": ["a", "a", "c", "c"],
373+
"key3": [1, 2, 3, 4],
374+
"key4": [1.6, 2, 3, 4],
375+
}
376+
)
377+
pd_result = df.groupby(["key1", "key2"]).mean()
378+
379+
with bpd.option_context("bigquery.location", "US"):
380+
bf_result = bpd.DataFrame(df).groupby(["key1", "key2"]).mean().to_pandas()
381+
382+
pd.testing.assert_frame_equal(
383+
pd_result, bf_result, check_index_type=False, check_dtype=False
384+
)
385+
386+
387+
# ==============
388+
# Series.groupby
389+
# ==============
390+
391+
359392
def test_series_groupby_agg_string(scalars_df_index, scalars_pandas_df_index):
360393
bf_result = (
361394
scalars_df_index["int64_col"]
@@ -392,21 +425,49 @@ def test_series_groupby_agg_list(scalars_df_index, scalars_pandas_df_index):
392425
)
393426

394427

395-
def test_dataframe_groupby_nonnumeric_with_mean():
396-
df = pd.DataFrame(
397-
{
398-
"key1": ["a", "a", "a", "b"],
399-
"key2": ["a", "a", "c", "c"],
400-
"key3": [1, 2, 3, 4],
401-
"key4": [1.6, 2, 3, 4],
402-
}
428+
def test_series_groupby_kurt(scalars_df_index, scalars_pandas_df_index):
429+
bf_result = (
430+
scalars_df_index["int64_too"]
431+
.groupby(scalars_df_index["bool_col"])
432+
.kurt()
433+
.to_pandas()
434+
)
435+
# Pandas doesn't have groupby.kurt yet: https://github.com/pandas-dev/pandas/issues/40139
436+
pd_result = scalars_pandas_df_index.groupby("bool_col")["int64_too"].apply(
437+
pd.Series.kurt
403438
)
404-
pd_result = df.groupby(["key1", "key2"]).mean()
405-
bf_result = bpd.DataFrame(df).groupby(["key1", "key2"]).mean().to_pandas()
406439

407-
pd.testing.assert_frame_equal(
408-
pd_result, bf_result, check_index_type=False, check_dtype=False
440+
pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
441+
442+
443+
def test_series_groupby_size(scalars_df_index, scalars_pandas_df_index):
444+
bf_result = (
445+
scalars_df_index["int64_too"].groupby(scalars_df_index["bool_col"]).size()
409446
)
447+
pd_result = (
448+
scalars_pandas_df_index["int64_too"]
449+
.groupby(scalars_pandas_df_index["bool_col"])
450+
.size()
451+
)
452+
bf_result_computed = bf_result.to_pandas()
453+
454+
pd.testing.assert_series_equal(pd_result, bf_result_computed, check_dtype=False)
455+
456+
457+
def test_series_groupby_skew(scalars_df_index, scalars_pandas_df_index):
458+
bf_result = (
459+
scalars_df_index["int64_too"]
460+
.groupby(scalars_df_index["bool_col"])
461+
.skew()
462+
.to_pandas()
463+
)
464+
pd_result = (
465+
scalars_pandas_df_index["int64_too"]
466+
.groupby(scalars_pandas_df_index["bool_col"])
467+
.skew()
468+
)
469+
470+
pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
410471

411472

412473
@pytest.mark.parametrize(

third_party/bigframes_vendored/ibis/expr/operations/analytic.py

+10
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,19 @@
22

33
from __future__ import annotations
44

5+
import ibis
56
import ibis.expr.operations as ops
67
import ibis.expr.rules as rlz
78

89

10+
# TODO(swast): We can remove this if ibis adds aggregates over scalar values.
11+
# See: https://github.com/ibis-project/ibis/issues/8698
12+
@ibis.udf.agg.builtin
13+
def count(value: int) -> int:
14+
"""Count of a scalar."""
15+
return 0 # pragma: NO COVER
16+
17+
918
class FirstNonNullValue(ops.Analytic):
1019
"""Retrieve the first element."""
1120

@@ -21,6 +30,7 @@ class LastNonNullValue(ops.Analytic):
2130

2231

2332
__all__ = [
33+
"count",
2434
"FirstNonNullValue",
2535
"LastNonNullValue",
2636
]

0 commit comments

Comments
 (0)