Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion bigframes/core/block_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ def quantile(
columns: Sequence[str],
qs: Sequence[float],
grouping_column_ids: Sequence[str] = (),
dropna: bool = False,
) -> blocks.Block:
# TODO: handle windowing and more interpolation methods
window = core.WindowSpec(
Expand All @@ -134,7 +135,7 @@ def quantile(
block, results = block.aggregate(
grouping_column_ids,
tuple((col, agg_ops.AnyValueOp()) for col in quantile_cols),
dropna=True,
dropna=dropna,
)
return block.select_columns(results).with_column_labels(labels)

Expand Down
8 changes: 4 additions & 4 deletions bigframes/core/groupby/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,7 @@ def mean(self, numeric_only: bool = False, *args) -> df.DataFrame:
self._raise_on_non_numeric("mean")
return self._aggregate_all(agg_ops.mean_op, numeric_only=True)

def median(
self, numeric_only: bool = False, *, exact: bool = False
) -> df.DataFrame:
def median(self, numeric_only: bool = False, *, exact: bool = True) -> df.DataFrame:
if not numeric_only:
self._raise_on_non_numeric("median")
if exact:
Expand All @@ -138,6 +136,7 @@ def quantile(
q_cols,
qs=tuple(q) if multi_q else (q,), # type: ignore
grouping_column_ids=self._by_col_ids,
dropna=self._dropna,
)
result_df = df.DataFrame(result)
if multi_q:
Expand Down Expand Up @@ -491,7 +490,7 @@ def mean(self, *args) -> series.Series:
def median(
self,
*args,
exact: bool = False,
exact: bool = True,
**kwargs,
) -> series.Series:
if exact:
Expand All @@ -508,6 +507,7 @@ def quantile(
(self._value_column,),
qs=tuple(q) if multi_q else (q,), # type: ignore
grouping_column_ids=self._by_col_ids,
dropna=self._dropna,
)
if multi_q:
return series.Series(result.stack())
Expand Down
8 changes: 2 additions & 6 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1999,18 +1999,14 @@ def mean(
return bigframes.series.Series(block.select_column("values"))

def median(
self, *, numeric_only: bool = False, exact: bool = False
self, *, numeric_only: bool = False, exact: bool = True
) -> bigframes.series.Series:
if exact:
raise NotImplementedError(
f"Only approximate median is supported. {constants.FEEDBACK_LINK}"
)
if not numeric_only:
frame = self._raise_on_non_numeric("median")
else:
frame = self._drop_non_numeric()
if exact:
return self.quantile()
return frame.quantile()
else:
block = frame._block.aggregate_all_and_stack(agg_ops.median_op)
return bigframes.series.Series(block.select_column("values"))
Expand Down
2 changes: 1 addition & 1 deletion bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -966,7 +966,7 @@ def mode(self) -> Series:
def mean(self) -> float:
return typing.cast(float, self._apply_aggregation(agg_ops.mean_op))

def median(self, *, exact: bool = False) -> float:
def median(self, *, exact: bool = True) -> float:
if exact:
return typing.cast(float, self.quantile(0.5))
else:
Expand Down
24 changes: 22 additions & 2 deletions tests/system/small/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1523,12 +1523,32 @@ def test_groupby_mean(scalars_dfs):
)


def test_groupby_median(scalars_dfs):
def test_groupby_median_exact(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
col_name = "int64_too"
bf_series = (
bf_result = (
scalars_df[col_name].groupby(scalars_df["string_col"], dropna=False).median()
)
pd_result = (
scalars_pandas_df[col_name]
.groupby(scalars_pandas_df["string_col"], dropna=False)
.median()
)

assert_series_equal(
pd_result,
bf_result.to_pandas(),
)


def test_groupby_median_inexact(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
col_name = "int64_too"
bf_series = (
scalars_df[col_name]
.groupby(scalars_df["string_col"], dropna=False)
.median(exact=False)
)
pd_max = (
scalars_pandas_df[col_name]
.groupby(scalars_pandas_df["string_col"], dropna=False)
Expand Down
6 changes: 3 additions & 3 deletions third_party/bigframes_vendored/pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4481,7 +4481,7 @@ def mean(self, axis=0, *, numeric_only: bool = False):
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def median(self, *, numeric_only: bool = False, exact: bool = False):
def median(self, *, numeric_only: bool = False, exact: bool = True):
"""Return the median of the values over colunms.

**Examples:**
Expand All @@ -4507,8 +4507,8 @@ def median(self, *, numeric_only: bool = False, exact: bool = False):
Args:
numeric_only (bool. default False):
Default False. Include only float, int, boolean columns.
exact (bool. default False):
Default False. Get the exact median instead of an approximate
exact (bool. default True):
Default True. Get the exact median instead of an approximate
one.

Returns:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,16 @@ def median(
self,
numeric_only: bool = False,
*,
exact: bool = False,
exact: bool = True,
):
"""
Compute median of groups, excluding missing values.

Args:
numeric_only (bool, default False):
Include only float, int, boolean columns.
exact (bool, default False):
Calculate the exact median instead of an approximation. Note:
``exact=True`` is not supported.
exact (bool, default True):
Calculate the exact median instead of an approximation.

Returns:
pandas.Series or pandas.DataFrame: Median of groups.
Expand Down
8 changes: 4 additions & 4 deletions third_party/bigframes_vendored/pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3147,13 +3147,13 @@ def mean(self):
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def median(self, *, exact: bool = False):
def median(self, *, exact: bool = True):
"""Return the median of the values over the requested axis.

Args:
exact (bool. default False):
Default False. Get the exact median instead of an approximate
one. Note: ``exact=True`` not yet supported.
exact (bool. default True):
Default True. Get the exact median instead of an approximate
one.

Returns:
scalar: Scalar.
Expand Down