Skip to content

Commit 9d205ae

Browse files
fix: Use exact median implementation by default (#619)
1 parent 3706b4f commit 9d205ae

File tree

8 files changed

+48
-31
lines changed

8 files changed

+48
-31
lines changed

bigframes/core/block_transforms.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ def quantile(
111111
columns: Sequence[str],
112112
qs: Sequence[float],
113113
grouping_column_ids: Sequence[str] = (),
114+
dropna: bool = False,
114115
) -> blocks.Block:
115116
# TODO: handle windowing and more interpolation methods
116117
window = core.WindowSpec(
@@ -134,7 +135,7 @@ def quantile(
134135
block, results = block.aggregate(
135136
grouping_column_ids,
136137
tuple((col, agg_ops.AnyValueOp()) for col in quantile_cols),
137-
dropna=True,
138+
dropna=dropna,
138139
)
139140
return block.select_columns(results).with_column_labels(labels)
140141

bigframes/core/groupby/__init__.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -113,9 +113,7 @@ def mean(self, numeric_only: bool = False, *args) -> df.DataFrame:
113113
self._raise_on_non_numeric("mean")
114114
return self._aggregate_all(agg_ops.mean_op, numeric_only=True)
115115

116-
def median(
117-
self, numeric_only: bool = False, *, exact: bool = False
118-
) -> df.DataFrame:
116+
def median(self, numeric_only: bool = False, *, exact: bool = True) -> df.DataFrame:
119117
if not numeric_only:
120118
self._raise_on_non_numeric("median")
121119
if exact:
@@ -138,6 +136,7 @@ def quantile(
138136
q_cols,
139137
qs=tuple(q) if multi_q else (q,), # type: ignore
140138
grouping_column_ids=self._by_col_ids,
139+
dropna=self._dropna,
141140
)
142141
result_df = df.DataFrame(result)
143142
if multi_q:
@@ -491,7 +490,7 @@ def mean(self, *args) -> series.Series:
491490
def median(
492491
self,
493492
*args,
494-
exact: bool = False,
493+
exact: bool = True,
495494
**kwargs,
496495
) -> series.Series:
497496
if exact:
@@ -508,6 +507,7 @@ def quantile(
508507
(self._value_column,),
509508
qs=tuple(q) if multi_q else (q,), # type: ignore
510509
grouping_column_ids=self._by_col_ids,
510+
dropna=self._dropna,
511511
)
512512
if multi_q:
513513
return series.Series(result.stack())

bigframes/dataframe.py

+4-6
Original file line numberDiff line numberDiff line change
@@ -1995,18 +1995,16 @@ def mean(
19951995
return bigframes.series.Series(block.select_column("values"))
19961996

19971997
def median(
1998-
self, *, numeric_only: bool = False, exact: bool = False
1998+
self, *, numeric_only: bool = False, exact: bool = True
19991999
) -> bigframes.series.Series:
2000-
if exact:
2001-
raise NotImplementedError(
2002-
f"Only approximate median is supported. {constants.FEEDBACK_LINK}"
2003-
)
20042000
if not numeric_only:
20052001
frame = self._raise_on_non_numeric("median")
20062002
else:
20072003
frame = self._drop_non_numeric()
20082004
if exact:
2009-
return self.quantile()
2005+
result = frame.quantile()
2006+
result.name = None
2007+
return result
20102008
else:
20112009
block = frame._block.aggregate_all_and_stack(agg_ops.median_op)
20122010
return bigframes.series.Series(block.select_column("values"))

bigframes/series.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -966,7 +966,7 @@ def mode(self) -> Series:
966966
def mean(self) -> float:
967967
return typing.cast(float, self._apply_aggregation(agg_ops.mean_op))
968968

969-
def median(self, *, exact: bool = False) -> float:
969+
def median(self, *, exact: bool = True) -> float:
970970
if exact:
971971
return typing.cast(float, self.quantile(0.5))
972972
else:

tests/system/small/test_series.py

+24-5
Original file line numberDiff line numberDiff line change
@@ -1345,10 +1345,9 @@ def test_numeric_literal(scalars_dfs):
13451345
scalars_df, _ = scalars_dfs
13461346
col_name = "numeric_col"
13471347
assert scalars_df[col_name].dtype == pd.ArrowDtype(pa.decimal128(38, 9))
1348-
bf_result = scalars_df[col_name] - scalars_df[col_name].median()
1348+
bf_result = scalars_df[col_name] + 42
13491349
assert bf_result.size == scalars_df[col_name].size
1350-
# TODO(b/323387826): The precision increased by 1 unexpectedly.
1351-
# assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9))
1350+
assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9))
13521351

13531352

13541353
def test_repr(scalars_dfs):
@@ -1523,12 +1522,32 @@ def test_groupby_mean(scalars_dfs):
15231522
)
15241523

15251524

1526-
def test_groupby_median(scalars_dfs):
1525+
def test_groupby_median_exact(scalars_dfs):
15271526
scalars_df, scalars_pandas_df = scalars_dfs
15281527
col_name = "int64_too"
1529-
bf_series = (
1528+
bf_result = (
15301529
scalars_df[col_name].groupby(scalars_df["string_col"], dropna=False).median()
15311530
)
1531+
pd_result = (
1532+
scalars_pandas_df[col_name]
1533+
.groupby(scalars_pandas_df["string_col"], dropna=False)
1534+
.median()
1535+
)
1536+
1537+
assert_series_equal(
1538+
pd_result,
1539+
bf_result.to_pandas(),
1540+
)
1541+
1542+
1543+
def test_groupby_median_inexact(scalars_dfs):
1544+
scalars_df, scalars_pandas_df = scalars_dfs
1545+
col_name = "int64_too"
1546+
bf_series = (
1547+
scalars_df[col_name]
1548+
.groupby(scalars_df["string_col"], dropna=False)
1549+
.median(exact=False)
1550+
)
15321551
pd_max = (
15331552
scalars_pandas_df[col_name]
15341553
.groupby(scalars_pandas_df["string_col"], dropna=False)

third_party/bigframes_vendored/pandas/core/frame.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -4481,7 +4481,7 @@ def mean(self, axis=0, *, numeric_only: bool = False):
44814481
"""
44824482
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
44834483

4484-
def median(self, *, numeric_only: bool = False, exact: bool = False):
4484+
def median(self, *, numeric_only: bool = False, exact: bool = True):
44854485
"""Return the median of the values over columns.
44864486
44874487
**Examples:**
@@ -4500,15 +4500,15 @@ def median(self, *, numeric_only: bool = False, exact: bool = False):
45004500
Finding the median value of each column.
45014501
45024502
>>> df.median()
4503-
A 1
4504-
B 2
4505-
dtype: Int64
4503+
A 2.0
4504+
B 3.0
4505+
dtype: Float64
45064506
45074507
Args:
45084508
numeric_only (bool, default False):
45094509
Default False. Include only float, int, boolean columns.
4510-
exact (bool. default False):
4511-
Default False. Get the exact median instead of an approximate
4510+
exact (bool, default True):
4511+
Default True. Get the exact median instead of an approximate
45124512
one.
45134513
45144514
Returns:

third_party/bigframes_vendored/pandas/core/groupby/__init__.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -68,17 +68,16 @@ def median(
6868
self,
6969
numeric_only: bool = False,
7070
*,
71-
exact: bool = False,
71+
exact: bool = True,
7272
):
7373
"""
7474
Compute median of groups, excluding missing values.
7575
7676
Args:
7777
numeric_only (bool, default False):
7878
Include only float, int, boolean columns.
79-
exact (bool, default False):
80-
Calculate the exact median instead of an approximation. Note:
81-
``exact=True`` is not supported.
79+
exact (bool, default True):
80+
Calculate the exact median instead of an approximation.
8281
8382
Returns:
8483
pandas.Series or pandas.DataFrame: Median of groups.

third_party/bigframes_vendored/pandas/core/series.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -3150,13 +3150,13 @@ def mean(self):
31503150
"""
31513151
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
31523152

3153-
def median(self, *, exact: bool = False):
3153+
def median(self, *, exact: bool = True):
31543154
"""Return the median of the values over the requested axis.
31553155
31563156
Args:
3157-
exact (bool. default False):
3158-
Default False. Get the exact median instead of an approximate
3159-
one. Note: ``exact=True`` not yet supported.
3157+
exact (bool, default True):
3158+
Default True. Get the exact median instead of an approximate
3159+
one.
31603160
31613161
Returns:
31623162
scalar: Scalar.

0 commit comments

Comments
 (0)