feat: support list of numerics in pandas.cut (#580)

milkshakeiii · web-flow · commit 290f95dc5198 · 2024-04-12T23:58:17.000Z
An internal user ran into this missing overload: `cut` did not accept an iterable of numeric break points as `bins`.
diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py
@@ -14,7 +14,7 @@
 from __future__ import annotations
 
 import typing
-from typing import Iterable, Literal, Optional, Tuple, Union
+from typing import Iterable, Literal, Optional, Union
 
 import pandas as pd
 
@@ -113,7 +113,7 @@ def cut(
     bins: Union[
         int,
         pd.IntervalIndex,
-        Iterable[Tuple[Union[int, float], Union[int, float]]],
+        Iterable,
     ],
     *,
     labels: Optional[bool] = None,
@@ -125,9 +125,29 @@ def cut(
         if isinstance(bins, pd.IntervalIndex):
             as_index: pd.IntervalIndex = bins
             bins = tuple((bin.left.item(), bin.right.item()) for bin in bins)
-        else:
+        elif len(list(bins)) == 0:
+            raise ValueError("`bins` iterable should have at least one item")
+        elif isinstance(list(bins)[0], tuple):
             as_index = pd.IntervalIndex.from_tuples(list(bins))
             bins = tuple(bins)
+        elif pd.api.types.is_number(list(bins)[0]):
+            bins_list = list(bins)
+            if len(bins_list) < 2:
+                raise ValueError(
+                    "`bins` iterable of numeric breaks should have"
+                    " at least two items"
+                )
+            as_index = pd.IntervalIndex.from_breaks(bins_list)
+            single_type = all([isinstance(n, type(bins_list[0])) for n in bins_list])
+            numeric_type = type(bins_list[0]) if single_type else float
+            bins = tuple(
+                [
+                    (numeric_type(bins_list[i]), numeric_type(bins_list[i + 1]))
+                    for i in range(len(bins_list) - 1)
+                ]
+            )
+        else:
+            raise ValueError("`bins` iterable should contain tuples or numerics")
 
         if as_index.is_overlapping:
             raise ValueError("Overlapping IntervalIndex is not accepted.")
diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py
@@ -17,7 +17,7 @@
 import abc
 import dataclasses
 import typing
-from typing import ClassVar, Hashable, Optional, Tuple
+from typing import ClassVar, Iterable, Optional
 
 import pandas as pd
 import pyarrow as pa
@@ -213,7 +213,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
 @dataclasses.dataclass(frozen=True)
 class CutOp(UnaryWindowOp):
     # TODO: Unintuitive, refactor into multiple ops?
-    bins: typing.Union[int, Tuple[Tuple[Hashable, Hashable], ...]]
+    bins: typing.Union[int, Iterable]
     labels: Optional[bool]
 
     @property
@@ -232,7 +232,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
             interval_dtype = (
                 pa.float64()
                 if isinstance(self.bins, int)
-                else dtypes.infer_literal_arrow_type(self.bins[0][0])
+                else dtypes.infer_literal_arrow_type(list(self.bins)[0][0])
             )
             pa_type = pa.struct(
                 [
diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py
@@ -424,6 +424,58 @@ def test_cut_default_labels(scalars_dfs):
     )
 
 
+@pytest.mark.parametrize(
+    ("breaks",),
+    [
+        ([0, 5, 10, 15, 20, 100, 1000],),  # ints
+        ([0.5, 10.5, 15.5, 20.5, 100.5, 1000.5],),  # floats
+        ([0, 5, 10.5, 15.5, 20, 100, 1000.5],),  # mixed
+    ],
+)
+def test_cut_numeric_breaks(scalars_dfs, breaks):
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    pd_result = pd.cut(scalars_pandas_df["float64_col"], breaks)
+    bf_result = bpd.cut(scalars_df["float64_col"], breaks).to_pandas()
+
+    # Convert the pandas intervals to the dict-of-bounds format bigframes returns
+    pd_result_converted = pd.Series(
+        [
+            {"left_exclusive": interval.left, "right_inclusive": interval.right}
+            if pd.notna(val)
+            else pd.NA
+            for val, interval in zip(
+                pd_result, pd_result.cat.categories[pd_result.cat.codes]
+            )
+        ],
+        name=pd_result.name,
+    )
+
+    pd.testing.assert_series_equal(
+        bf_result, pd_result_converted, check_index=False, check_dtype=False
+    )
+
+
+@pytest.mark.parametrize(
+    ("bins",),
+    [
+        (-1,),  # negative integer bins argument
+        ([],),  # empty iterable of bins
+        (["notabreak"],),  # iterable of wrong type
+        ([1],),  # numeric breaks with only one numeric
+        # A single numeric break (the case above) is supported by pandas
+        # but not by the BigQuery operation, and a bigframes workaround
+        # is not yet available. Once supported, it should return a column
+        # of structs with all NaN values.
+    ],
+)
+def test_cut_errors(scalars_dfs, bins):
+    scalars_df, _ = scalars_dfs
+
+    with pytest.raises(ValueError):
+        bpd.cut(scalars_df["float64_col"], bins)
+
+
 @pytest.mark.parametrize(
     ("bins",),
     [
diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py
@@ -76,10 +76,20 @@ def cut(
         3    {'left_exclusive': 5, 'right_inclusive': 20}
         dtype: struct<left_exclusive: int64, right_inclusive: int64>[pyarrow]
 
+    Cut with an iterable of ints:
+
+        >>> bins_ints = [0, 1, 5, 20]
+        >>> bpd.cut(s, bins=bins_ints)
+        0                                            <NA>
+        1     {'left_exclusive': 0, 'right_inclusive': 1}
+        2     {'left_exclusive': 1, 'right_inclusive': 5}
+        3    {'left_exclusive': 5, 'right_inclusive': 20}
+        dtype: struct<left_exclusive: int64, right_inclusive: int64>[pyarrow]
+
     Args:
         x (Series):
             The input Series to be binned. Must be 1-dimensional.
-        bins (int, pd.IntervalIndex, Iterable[Tuple[Union[int, float], Union[int, float]]]):
+        bins (int, pd.IntervalIndex, Iterable):
             The criteria to bin by.
 
             int: Defines the number of equal-width bins in the range of `x`. The
@@ -88,6 +98,10 @@ def cut(
 
             pd.IntervalIndex or Iterable of tuples: Defines the exact bins to be used.
             It's important to ensure that these bins are non-overlapping.
+
+            Iterable of numerics: Defines the exact bins as the intervals between
+            each item and the item that follows it. Must contain at least two
+            items, and the items must be monotonically increasing.
         labels (None):
             Specifies the labels for the returned bins. Must be the same length as
             the resulting bins. If False, returns only integer indicators of the