diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 562689a736..a221b343a5 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -111,6 +111,7 @@ def quantile( columns: Sequence[str], qs: Sequence[float], grouping_column_ids: Sequence[str] = (), + dropna: bool = False, ) -> blocks.Block: # TODO: handle windowing and more interpolation methods window = core.WindowSpec( @@ -134,7 +135,7 @@ def quantile( block, results = block.aggregate( grouping_column_ids, tuple((col, agg_ops.AnyValueOp()) for col in quantile_cols), - dropna=True, + dropna=dropna, ) return block.select_columns(results).with_column_labels(labels) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 0f53342352..05b1cc7f41 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -113,9 +113,7 @@ def mean(self, numeric_only: bool = False, *args) -> df.DataFrame: self._raise_on_non_numeric("mean") return self._aggregate_all(agg_ops.mean_op, numeric_only=True) - def median( - self, numeric_only: bool = False, *, exact: bool = False - ) -> df.DataFrame: + def median(self, numeric_only: bool = False, *, exact: bool = True) -> df.DataFrame: if not numeric_only: self._raise_on_non_numeric("median") if exact: @@ -138,6 +136,7 @@ def quantile( q_cols, qs=tuple(q) if multi_q else (q,), # type: ignore grouping_column_ids=self._by_col_ids, + dropna=self._dropna, ) result_df = df.DataFrame(result) if multi_q: @@ -491,7 +490,7 @@ def mean(self, *args) -> series.Series: def median( self, *args, - exact: bool = False, + exact: bool = True, **kwargs, ) -> series.Series: if exact: @@ -508,6 +507,7 @@ def quantile( (self._value_column,), qs=tuple(q) if multi_q else (q,), # type: ignore grouping_column_ids=self._by_col_ids, + dropna=self._dropna, ) if multi_q: return series.Series(result.stack()) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 11e592542c..ff8404761c 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1995,18 +1995,16 @@ def mean( return bigframes.series.Series(block.select_column("values")) def median( - self, *, numeric_only: bool = False, exact: bool = False + self, *, numeric_only: bool = False, exact: bool = True ) -> bigframes.series.Series: - if exact: - raise NotImplementedError( - f"Only approximate median is supported. {constants.FEEDBACK_LINK}" - ) if not numeric_only: frame = self._raise_on_non_numeric("median") else: frame = self._drop_non_numeric() if exact: - return self.quantile() + result = frame.quantile() + result.name = None + return result else: block = frame._block.aggregate_all_and_stack(agg_ops.median_op) return bigframes.series.Series(block.select_column("values")) diff --git a/bigframes/series.py b/bigframes/series.py index b834411bce..47acfd0afb 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -966,7 +966,7 @@ def mode(self) -> Series: def mean(self) -> float: return typing.cast(float, self._apply_aggregation(agg_ops.mean_op)) - def median(self, *, exact: bool = False) -> float: + def median(self, *, exact: bool = True) -> float: if exact: return typing.cast(float, self.quantile(0.5)) else: diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 87267696ba..9cb615fdcb 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1345,10 +1345,9 @@ def test_numeric_literal(scalars_dfs): scalars_df, _ = scalars_dfs col_name = "numeric_col" assert scalars_df[col_name].dtype == pd.ArrowDtype(pa.decimal128(38, 9)) - bf_result = scalars_df[col_name] - scalars_df[col_name].median() + bf_result = scalars_df[col_name] + 42 assert bf_result.size == scalars_df[col_name].size - # TODO(b/323387826): The precision increased by 1 unexpectedly. - # assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9)) + assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9)) def test_repr(scalars_dfs): @@ -1523,12 +1522,32 @@ def test_groupby_mean(scalars_dfs): ) -def test_groupby_median(scalars_dfs): +def test_groupby_median_exact(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" - bf_series = ( + bf_result = ( scalars_df[col_name].groupby(scalars_df["string_col"], dropna=False).median() ) + pd_result = ( + scalars_pandas_df[col_name] + .groupby(scalars_pandas_df["string_col"], dropna=False) + .median() + ) + + assert_series_equal( + pd_result, + bf_result.to_pandas(), + ) + + +def test_groupby_median_inexact(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_series = ( + scalars_df[col_name] + .groupby(scalars_df["string_col"], dropna=False) + .median(exact=False) + ) pd_max = ( scalars_pandas_df[col_name] .groupby(scalars_pandas_df["string_col"], dropna=False) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index f06128f150..0515f690e3 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4481,7 +4481,7 @@ def mean(self, axis=0, *, numeric_only: bool = False): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def median(self, *, numeric_only: bool = False, exact: bool = False): + def median(self, *, numeric_only: bool = False, exact: bool = True): """Return the median of the values over colunms. **Examples:** @@ -4500,15 +4500,15 @@ def median(self, *, numeric_only: bool = False, exact: bool = False): Finding the median value of each column. >>> df.median() - A 1 - B 2 - dtype: Int64 + A 2.0 + B 3.0 + dtype: Float64 Args: numeric_only (bool. default False): Default False. Include only float, int, boolean columns. - exact (bool. default False): - Default False. Get the exact median instead of an approximate + exact (bool. default True): + Default True. Get the exact median instead of an approximate one. Returns: diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 7347963d17..f3f7748e34 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -68,7 +68,7 @@ def median( self, numeric_only: bool = False, *, - exact: bool = False, + exact: bool = True, ): """ Compute median of groups, excluding missing values. @@ -76,9 +76,8 @@ def median( Args: numeric_only (bool, default False): Include only float, int, boolean columns. - exact (bool, default False): - Calculate the exact median instead of an approximation. Note: - ``exact=True`` is not supported. + exact (bool, default True): + Calculate the exact median instead of an approximation. Returns: pandas.Series or pandas.DataFrame: Median of groups. diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index a5e14c5b1c..0c5b8d4521 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -3150,13 +3150,13 @@ def mean(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def median(self, *, exact: bool = False): + def median(self, *, exact: bool = True): """Return the median of the values over the requested axis. Args: - exact (bool. default False): - Default False. Get the exact median instead of an approximate - one. Note: ``exact=True`` not yet supported. + exact (bool. default True): + Default True. Get the exact median instead of an approximate + one. Returns: scalar: Scalar.