Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
fix: Use exact median implementation by default
  • Loading branch information
TrevorBergeron committed Apr 17, 2024
commit 3838811a896f529249051f86a4c87940cb6aa4d7
3 changes: 2 additions & 1 deletion bigframes/core/block_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ def quantile(
columns: Sequence[str],
qs: Sequence[float],
grouping_column_ids: Sequence[str] = (),
dropna: bool = False,
) -> blocks.Block:
# TODO: handle windowing and more interpolation methods
window = core.WindowSpec(
Expand All @@ -134,7 +135,7 @@ def quantile(
block, results = block.aggregate(
grouping_column_ids,
tuple((col, agg_ops.AnyValueOp()) for col in quantile_cols),
dropna=True,
dropna=dropna,
)
return block.select_columns(results).with_column_labels(labels)

Expand Down
8 changes: 4 additions & 4 deletions bigframes/core/groupby/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,7 @@ def mean(self, numeric_only: bool = False, *args) -> df.DataFrame:
self._raise_on_non_numeric("mean")
return self._aggregate_all(agg_ops.mean_op, numeric_only=True)

def median(
self, numeric_only: bool = False, *, exact: bool = False
) -> df.DataFrame:
def median(self, numeric_only: bool = False, *, exact: bool = True) -> df.DataFrame:
if not numeric_only:
self._raise_on_non_numeric("median")
if exact:
Expand All @@ -138,6 +136,7 @@ def quantile(
q_cols,
qs=tuple(q) if multi_q else (q,), # type: ignore
grouping_column_ids=self._by_col_ids,
dropna=self._dropna,
)
result_df = df.DataFrame(result)
if multi_q:
Expand Down Expand Up @@ -491,7 +490,7 @@ def mean(self, *args) -> series.Series:
def median(
self,
*args,
exact: bool = False,
exact: bool = True,
**kwargs,
) -> series.Series:
if exact:
Expand All @@ -508,6 +507,7 @@ def quantile(
(self._value_column,),
qs=tuple(q) if multi_q else (q,), # type: ignore
grouping_column_ids=self._by_col_ids,
dropna=self._dropna,
)
if multi_q:
return series.Series(result.stack())
Expand Down
8 changes: 2 additions & 6 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1999,18 +1999,14 @@ def mean(
return bigframes.series.Series(block.select_column("values"))

def median(
self, *, numeric_only: bool = False, exact: bool = False
self, *, numeric_only: bool = False, exact: bool = True
) -> bigframes.series.Series:
if exact:
raise NotImplementedError(
f"Only approximate median is supported. {constants.FEEDBACK_LINK}"
)
if not numeric_only:
frame = self._raise_on_non_numeric("median")
else:
frame = self._drop_non_numeric()
if exact:
return self.quantile()
return frame.quantile()
else:
block = frame._block.aggregate_all_and_stack(agg_ops.median_op)
return bigframes.series.Series(block.select_column("values"))
Expand Down
2 changes: 1 addition & 1 deletion bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -966,7 +966,7 @@ def mode(self) -> Series:
def mean(self) -> float:
return typing.cast(float, self._apply_aggregation(agg_ops.mean_op))

def median(self, *, exact: bool = False) -> float:
def median(self, *, exact: bool = True) -> float:
if exact:
return typing.cast(float, self.quantile(0.5))
else:
Expand Down
24 changes: 22 additions & 2 deletions tests/system/small/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1523,12 +1523,32 @@ def test_groupby_mean(scalars_dfs):
)


def test_groupby_median(scalars_dfs):
def test_groupby_median_exact(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
col_name = "int64_too"
bf_series = (
bf_result = (
scalars_df[col_name].groupby(scalars_df["string_col"], dropna=False).median()
)
pd_result = (
scalars_pandas_df[col_name]
.groupby(scalars_pandas_df["string_col"], dropna=False)
.median()
)

assert_series_equal(
pd_result,
bf_result.to_pandas(),
)


def test_groupby_median_inexact(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
col_name = "int64_too"
bf_series = (
scalars_df[col_name]
.groupby(scalars_df["string_col"], dropna=False)
.median(exact=False)
)
pd_max = (
scalars_pandas_df[col_name]
.groupby(scalars_pandas_df["string_col"], dropna=False)
Expand Down
6 changes: 3 additions & 3 deletions third_party/bigframes_vendored/pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4481,7 +4481,7 @@ def mean(self, axis=0, *, numeric_only: bool = False):
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def median(self, *, numeric_only: bool = False, exact: bool = False):
def median(self, *, numeric_only: bool = False, exact: bool = True):
"""Return the median of the values over colunms.

**Examples:**
Expand All @@ -4507,8 +4507,8 @@ def median(self, *, numeric_only: bool = False, exact: bool = False):
Args:
numeric_only (bool. default False):
Default False. Include only float, int, boolean columns.
exact (bool. default False):
Default False. Get the exact median instead of an approximate
exact (bool. default True):
Default True. Get the exact median instead of an approximate
one.

Returns:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,16 @@ def median(
self,
numeric_only: bool = False,
*,
exact: bool = False,
exact: bool = True,
):
"""
Compute median of groups, excluding missing values.

Args:
numeric_only (bool, default False):
Include only float, int, boolean columns.
exact (bool, default False):
Calculate the exact median instead of an approximation. Note:
``exact=True`` is not supported.
exact (bool, default True):
Calculate the exact median instead of an approximation.

Returns:
pandas.Series or pandas.DataFrame: Median of groups.
Expand Down
8 changes: 4 additions & 4 deletions third_party/bigframes_vendored/pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3147,13 +3147,13 @@ def mean(self):
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def median(self, *, exact: bool = False):
def median(self, *, exact: bool = True):
"""Return the median of the values over the requested axis.

Args:
exact (bool. default False):
Default False. Get the exact median instead of an approximate
one. Note: ``exact=True`` not yet supported.
exact (bool. default True):
Default True. Get the exact median instead of an approximate
one.

Returns:
scalar: Scalar.
Expand Down