fix: Use exact median implementation by default (#619)

TrevorBergeron · web-flow · commit 9d205aecb77f · 2024-04-18T15:31:03.000-07:00
diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py
@@ -111,6 +111,7 @@ def quantile(
     columns: Sequence[str],
     qs: Sequence[float],
     grouping_column_ids: Sequence[str] = (),
+    dropna: bool = False,
 ) -> blocks.Block:
     # TODO: handle windowing and more interpolation methods
     window = core.WindowSpec(
@@ -134,7 +135,7 @@ def quantile(
     block, results = block.aggregate(
         grouping_column_ids,
         tuple((col, agg_ops.AnyValueOp()) for col in quantile_cols),
-        dropna=True,
+        dropna=dropna,
     )
     return block.select_columns(results).with_column_labels(labels)
 
diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py
@@ -113,9 +113,7 @@ def mean(self, numeric_only: bool = False, *args) -> df.DataFrame:
             self._raise_on_non_numeric("mean")
         return self._aggregate_all(agg_ops.mean_op, numeric_only=True)
 
-    def median(
-        self, numeric_only: bool = False, *, exact: bool = False
-    ) -> df.DataFrame:
+    def median(self, numeric_only: bool = False, *, exact: bool = True) -> df.DataFrame:
         if not numeric_only:
             self._raise_on_non_numeric("median")
         if exact:
@@ -138,6 +136,7 @@ def quantile(
             q_cols,
             qs=tuple(q) if multi_q else (q,),  # type: ignore
             grouping_column_ids=self._by_col_ids,
+            dropna=self._dropna,
         )
         result_df = df.DataFrame(result)
         if multi_q:
@@ -491,7 +490,7 @@ def mean(self, *args) -> series.Series:
     def median(
         self,
         *args,
-        exact: bool = False,
+        exact: bool = True,
         **kwargs,
     ) -> series.Series:
         if exact:
@@ -508,6 +507,7 @@ def quantile(
             (self._value_column,),
             qs=tuple(q) if multi_q else (q,),  # type: ignore
             grouping_column_ids=self._by_col_ids,
+            dropna=self._dropna,
         )
         if multi_q:
             return series.Series(result.stack())
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -1995,18 +1995,16 @@ def mean(
         return bigframes.series.Series(block.select_column("values"))
 
     def median(
-        self, *, numeric_only: bool = False, exact: bool = False
+        self, *, numeric_only: bool = False, exact: bool = True
     ) -> bigframes.series.Series:
-        if exact:
-            raise NotImplementedError(
-                f"Only approximate median is supported. {constants.FEEDBACK_LINK}"
-            )
         if not numeric_only:
             frame = self._raise_on_non_numeric("median")
         else:
             frame = self._drop_non_numeric()
         if exact:
-            return self.quantile()
+            result = frame.quantile()
+            result.name = None
+            return result
         else:
             block = frame._block.aggregate_all_and_stack(agg_ops.median_op)
             return bigframes.series.Series(block.select_column("values"))
diff --git a/bigframes/series.py b/bigframes/series.py
@@ -966,7 +966,7 @@ def mode(self) -> Series:
     def mean(self) -> float:
         return typing.cast(float, self._apply_aggregation(agg_ops.mean_op))
 
-    def median(self, *, exact: bool = False) -> float:
+    def median(self, *, exact: bool = True) -> float:
         if exact:
             return typing.cast(float, self.quantile(0.5))
         else:
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
@@ -1345,10 +1345,9 @@ def test_numeric_literal(scalars_dfs):
     scalars_df, _ = scalars_dfs
     col_name = "numeric_col"
     assert scalars_df[col_name].dtype == pd.ArrowDtype(pa.decimal128(38, 9))
-    bf_result = scalars_df[col_name] - scalars_df[col_name].median()
+    bf_result = scalars_df[col_name] + 42
     assert bf_result.size == scalars_df[col_name].size
-    # TODO(b/323387826): The precision increased by 1 unexpectedly.
-    # assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9))
+    assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9))
 
 
 def test_repr(scalars_dfs):
@@ -1523,12 +1522,32 @@ def test_groupby_mean(scalars_dfs):
     )
 
 
-def test_groupby_median(scalars_dfs):
+def test_groupby_median_exact(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     col_name = "int64_too"
-    bf_series = (
+    bf_result = (
         scalars_df[col_name].groupby(scalars_df["string_col"], dropna=False).median()
     )
+    pd_result = (
+        scalars_pandas_df[col_name]
+        .groupby(scalars_pandas_df["string_col"], dropna=False)
+        .median()
+    )
+
+    assert_series_equal(
+        pd_result,
+        bf_result.to_pandas(),
+    )
+
+
+def test_groupby_median_inexact(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    col_name = "int64_too"
+    bf_series = (
+        scalars_df[col_name]
+        .groupby(scalars_df["string_col"], dropna=False)
+        .median(exact=False)
+    )
     pd_max = (
         scalars_pandas_df[col_name]
         .groupby(scalars_pandas_df["string_col"], dropna=False)
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -4481,7 +4481,7 @@ def mean(self, axis=0, *, numeric_only: bool = False):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
-    def median(self, *, numeric_only: bool = False, exact: bool = False):
+    def median(self, *, numeric_only: bool = False, exact: bool = True):
         """Return the median of the values over colunms.
 
         **Examples:**
@@ -4500,15 +4500,15 @@ def median(self, *, numeric_only: bool = False, exact: bool = False):
         Finding the median value of each column.
 
             >>> df.median()
-            A    1
-            B    2
-            dtype: Int64
+            A    2.0
+            B    3.0
+            dtype: Float64
 
         Args:
             numeric_only (bool. default False):
                 Default False. Include only float, int, boolean columns.
-            exact (bool. default False):
-                Default False. Get the exact median instead of an approximate
+            exact (bool. default True):
+                Default True. Get the exact median instead of an approximate
                 one.
 
         Returns:
diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py
@@ -68,17 +68,16 @@ def median(
         self,
         numeric_only: bool = False,
         *,
-        exact: bool = False,
+        exact: bool = True,
     ):
         """
         Compute median of groups, excluding missing values.
 
         Args:
             numeric_only (bool, default False):
                 Include only float, int, boolean columns.
-            exact (bool, default False):
-                Calculate the exact median instead of an approximation. Note:
-                    ``exact=True`` is not supported.
+            exact (bool, default True):
+                Calculate the exact median instead of an approximation.
 
         Returns:
             pandas.Series or pandas.DataFrame: Median of groups.
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
@@ -3150,13 +3150,13 @@ def mean(self):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
-    def median(self, *, exact: bool = False):
+    def median(self, *, exact: bool = True):
         """Return the median of the values over the requested axis.
 
         Args:
-            exact (bool. default False):
-                Default False. Get the exact median instead of an approximate
-                one. Note: ``exact=True`` not yet supported.
+            exact (bool, default True):
+                Default True. Get the exact median instead of an approximate
+                one.
 
         Returns:
             scalar: Scalar.