Skip to content

feat: add replace method to DataFrame #261

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Dec 19, 2023
15 changes: 15 additions & 0 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1562,6 +1562,21 @@ def interpolate(self, method: str = "linear") -> DataFrame:
def fillna(self, value=None) -> DataFrame:
    """Apply ``ops.fillna_op`` between this frame and ``value``.

    Args:
        value: Operand forwarded unchanged to ``_apply_binop``.

    Returns:
        DataFrame: Whatever ``_apply_binop`` produces for ``how="left"``.
    """
    filled = self._apply_binop(value, ops.fillna_op, how="left")
    return filled

def replace(
    self, to_replace: typing.Any, value: typing.Any = None, *, regex: bool = False
):
    """Replace values column by column by delegating to each column's ``replace``.

    Args:
        to_replace: What to look for; forwarded unchanged to every column.
        value: Replacement. When it is dict-like it is read as a mapping from
            column name to replacement, and columns whose name is not a key
            are returned untouched. Otherwise the same value is used for
            every column.
        regex: Forwarded unchanged to every column's ``replace``.

    Returns:
        The result of ``self.apply`` over the per-column replacement.
    """
    per_column_values = utils.is_dict_like(value)

    def _replace_in_column(column):
        if not per_column_values:
            return column.replace(to_replace=to_replace, value=value, regex=regex)
        if column.name in value:
            return column.replace(
                to_replace=to_replace, value=value[column.name], regex=regex
            )
        # Column has no entry in the mapping: leave it as it is.
        return column

    return self.apply(_replace_in_column)

def ffill(self, *, limit: typing.Optional[int] = None) -> DataFrame:
    """Apply ``LastNonNullOp`` over a window that looks only backwards.

    Args:
        limit: Passed as the ``preceding`` bound of the window. ``None`` is
            forwarded unchanged (presumably meaning unbounded -- confirm
            against ``WindowSpec``).

    Returns:
        DataFrame: Result of ``_apply_window_op`` for that window.
    """
    backward_window = bigframes.core.WindowSpec(preceding=limit, following=0)
    last_non_null = agg_ops.LastNonNullOp()
    return self._apply_window_op(last_non_null, backward_window)
Expand Down
48 changes: 48 additions & 0 deletions bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

"""Mappings for Pandas dtypes supported by BigQuery DataFrames package"""

import datetime
import textwrap
import typing
from typing import Any, Dict, Iterable, Literal, Tuple, Union
Expand Down Expand Up @@ -437,3 +438,50 @@ def to_pandas_dtypes_overrides(schema: Iterable[bigquery.SchemaField]) -> Dict:
gcb3p_pandas_helpers.bq_to_arrow_data_type(field)
)
return dtypes


def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool:
    """Report whether ``scalar`` can be losslessly represented by ``dtype``.

    ``None`` is accepted for every dtype. Bool, float, integer and string
    dtypes accept only scalars of the same kind; Arrow dtypes defer to
    ``is_patype``. Any other dtype yields ``False``.
    """
    if scalar is None:
        return True

    pd_types = pd.api.types
    # The bool test must stay ahead of the integer test, as in the original order.
    if pd_types.is_bool_dtype(dtype):
        matches = pd_types.is_bool(scalar)
    elif pd_types.is_float_dtype(dtype):
        matches = pd_types.is_float(scalar)
    elif pd_types.is_integer_dtype(dtype):
        matches = pd_types.is_integer(scalar)
    elif isinstance(dtype, pd.StringDtype):
        matches = isinstance(scalar, str)
    elif isinstance(dtype, pd.ArrowDtype):
        matches = is_patype(scalar, dtype.pyarrow_dtype)
    else:
        matches = False
    return matches


def is_patype(scalar: typing.Any, pa_type: pa.DataType) -> bool:
    """Determine whether a scalar's type matches a given pyarrow type.

    Args:
        scalar: Python or pandas scalar to test.
        pa_type: pyarrow type to test against. Only ``time64("us")``,
            ``timestamp("us")``, ``timestamp("us", tz="UTC")`` and
            ``date32()`` are recognised.

    Returns:
        True when ``scalar`` is an instance of the Python type paired with
        ``pa_type`` below; False otherwise, including for every other
        pyarrow type.
    """
    datetime_types = (datetime.datetime, pd.Timestamp)
    if pa_type == pa.time64("us"):
        return isinstance(scalar, datetime.time)
    if pa_type == pa.timestamp("us"):
        # Naive timestamps only: any tzinfo disqualifies the scalar.
        return isinstance(scalar, datetime_types) and not scalar.tzinfo
    if pa_type == pa.timestamp("us", tz="UTC"):
        # NOTE(review): this is an equality test against datetime.timezone.utc;
        # tzinfo objects from other libraries that also denote UTC may not
        # compare equal -- confirm whether they should be accepted.
        return (
            isinstance(scalar, datetime_types)
            and scalar.tzinfo == datetime.timezone.utc
        )
    if pa_type == pa.date32():
        # datetime.datetime is a subclass of datetime.date; a value carrying a
        # time component is not a match for a date type.
        return isinstance(scalar, datetime.date) and not isinstance(
            scalar, datetime_types
        )
    return False


def is_comparable(scalar: typing.Any, dtype: Dtype) -> bool:
    """Whether scalar can be compared to items of dtype (though maybe requiring coercion).

    True when ``is_dtype`` accepts the pair, or when ``dtype`` is numeric and
    ``scalar`` is any number.
    """
    if is_dtype(scalar, dtype):
        return True
    dtype_is_numeric = pd.api.types.is_numeric_dtype(dtype)
    return pd.api.types.is_number(scalar) if dtype_is_numeric else False
14 changes: 14 additions & 0 deletions bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,20 @@ def _as_ibis(self, x: ibis_types.Value):
return bigframes.dtypes.cast_ibis_value(x, self.to_type)


class MapOp(UnaryOp):
    """Unary op that substitutes values according to fixed (key, replacement) pairs."""

    def __init__(
        self,
        mappings: typing.Tuple[typing.Tuple[typing.Hashable, typing.Hashable], ...],
    ):
        # Pairs are tried in the order given; see _as_ibis.
        self._mappings = mappings

    def _as_ibis(self, x: ibis_types.Value):
        """Build a CASE expression with one WHEN per pair, defaulting to ``x``.

        Args:
            x: Input expression whose values are compared to each key.

        Returns:
            ``x`` itself when there are no pairs, otherwise the finished
            CASE expression.
        """
        # With no pairs there is nothing to substitute, and a CASE with no
        # WHEN branch is not a well-formed expression, so return the input.
        if not self._mappings:
            return x
        case = ibis.case()
        for source, replacement in self._mappings:
            case = case.when(x == source, replacement)
        return case.else_(x).end()


class FindOp(UnaryOp):
def __init__(self, sub, start, end):
self._sub = sub
Expand Down
87 changes: 56 additions & 31 deletions bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,42 +442,67 @@ def replace(
self, to_replace: typing.Any, value: typing.Any = None, *, regex: bool = False
):
if regex:
if not (isinstance(to_replace, str) and isinstance(value, str)):
raise NotImplementedError(
f"replace regex mode only supports strings for 'to_replace' and 'value'. {constants.FEEDBACK_LINK}"
)
block, result_col = self._block.apply_unary_op(
self._value_column,
ops.ReplaceRegexOp(to_replace, value),
result_label=self.name,
)
return Series(block.select_column(result_col))
# No-op unless to_replace and series dtype are both string type
if not isinstance(to_replace, str) or not isinstance(
self.dtype, pandas.StringDtype
):
return self
return self._regex_replace(to_replace, value)
elif utils.is_dict_like(to_replace):
raise NotImplementedError(
f"Dict 'to_replace' not supported. {constants.FEEDBACK_LINK}"
)
return self._mapping_replace(to_replace) # type: ignore
elif utils.is_list_like(to_replace):
block, cond = self._block.apply_unary_op(
self._value_column, ops.IsInOp(to_replace)
)
block, result_col = block.apply_binary_op(
cond,
self._value_column,
ops.partial_arg1(ops.where_op, value),
result_label=self.name,
)
return Series(block.select_column(result_col))
replace_list = to_replace
else: # Scalar
block, cond = self._block.apply_unary_op(
self._value_column, ops.BinopPartialLeft(ops.eq_op, to_replace)
replace_list = [to_replace]
replace_list = [
i for i in replace_list if bigframes.dtypes.is_comparable(i, self.dtype)
]
return self._simple_replace(replace_list, value) if replace_list else self

def _regex_replace(self, to_replace: str, value: str):
    """Apply ``ReplaceRegexOp(to_replace, value)`` to this series' value column.

    Args:
        to_replace: Pattern handed to ``ReplaceRegexOp``.
        value: Replacement handed to ``ReplaceRegexOp``.

    Returns:
        A new Series built from the op's result column, labelled with this
        series' name.

    Raises:
        NotImplementedError: If ``is_dtype`` rejects ``value`` for this
            series' dtype.
    """
    if not bigframes.dtypes.is_dtype(value, self.dtype):
        raise NotImplementedError(
            f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}"
        )
    block, result_col = self._block.apply_unary_op(
        self._value_column,
        ops.ReplaceRegexOp(to_replace, value),
        result_label=self.name,
    )
    return Series(block.select_column(result_col))

def _simple_replace(self, to_replace_list: typing.Sequence, value):
    """Replace every element found in ``to_replace_list`` with ``value``.

    Args:
        to_replace_list: Values to look for, passed to ``IsInOp``.
        value: Replacement, bound as the fixed argument of ``where_op``.

    Returns:
        A new Series built from the result column, labelled with this
        series' name.

    Raises:
        NotImplementedError: If ``is_dtype`` rejects ``value`` for this
            series' dtype.
    """
    if not bigframes.dtypes.is_dtype(value, self.dtype):
        raise NotImplementedError(
            f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}"
        )

    # First compute the membership condition, then combine it with the
    # original values through where_op.
    block, cond = self._block.apply_unary_op(
        self._value_column, ops.IsInOp(to_replace_list)
    )
    block, result_col = block.apply_binary_op(
        cond,
        self._value_column,
        ops.partial_arg1(ops.where_op, value),
        result_label=self.name,
    )
    return Series(block.select_column(result_col))

def _mapping_replace(self, mapping: dict[typing.Hashable, typing.Hashable]):
    """Replace values according to a ``{old: new}`` mapping.

    Keys that ``is_comparable`` rejects for this series' dtype are skipped.

    Args:
        mapping: Pairs of value to look for and its replacement.

    Returns:
        This series unchanged when no key survives the filter, otherwise a
        new Series built from the ``MapOp`` result column.

    Raises:
        NotImplementedError: If ``is_dtype`` rejects the replacement of any
            retained key for this series' dtype.
    """
    tuples = []
    for key, value in mapping.items():
        if not bigframes.dtypes.is_comparable(key, self.dtype):
            continue
        if not bigframes.dtypes.is_dtype(value, self.dtype):
            raise NotImplementedError(
                f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}"
            )
        tuples.append((key, value))

    # Nothing left to map: same no-op shortcut the list/scalar path takes.
    if not tuples:
        return self

    # Label the result like the other replace helpers do; presumably needed
    # to keep the series name -- confirm against apply_unary_op.
    block, result = self._block.apply_unary_op(
        self._value_column,
        ops.MapOp(tuple(tuples)),
        result_label=self.name,
    )
    return Series(block.select_column(result))

def interpolate(self, method: str = "linear") -> Series:
if method == "pad":
Expand Down
44 changes: 44 additions & 0 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -839,6 +839,50 @@ def test_df_fillna(scalars_dfs):
pandas.testing.assert_frame_equal(bf_result, pd_result)


def test_df_replace_scalar_scalar(scalars_dfs):
    """Scalar-for-scalar replace must give the same frame as pandas."""
    bf_df, pd_df = scalars_dfs
    replace_args = ("Hello, World!", "Howdy, Planet!")

    actual = bf_df.replace(*replace_args).to_pandas()
    expected = pd_df.replace(*replace_args)

    pd.testing.assert_frame_equal(expected, actual)


def test_df_replace_regex_scalar(scalars_dfs):
    """Regex replace with a scalar value must give the same frame as pandas."""
    bf_df, pd_df = scalars_dfs
    pattern = "^H.l"
    replacement = "Howdy, Planet!"

    actual = bf_df.replace(pattern, replacement, regex=True).to_pandas()
    expected = pd_df.replace(pattern, replacement, regex=True)

    pd.testing.assert_frame_equal(expected, actual)


def test_df_replace_list_scalar(scalars_dfs):
    """Replacing a list of values with one scalar must match pandas."""
    bf_df, pd_df = scalars_dfs
    replacement = "Howdy, Planet!"

    # A fresh list per call, as in the original, so neither call can see the
    # other's argument object.
    actual = bf_df.replace(["Hello, World!", "T"], replacement).to_pandas()
    expected = pd_df.replace(["Hello, World!", "T"], replacement)

    pd.testing.assert_frame_equal(expected, actual)


def test_df_replace_value_dict(scalars_dfs):
    """A per-column dict of replacement values must match pandas."""
    bf_df, pd_df = scalars_dfs

    actual = bf_df.replace(1, {"int64_col": 100, "int64_too": 200}).to_pandas()
    expected = pd_df.replace(1, {"int64_col": 100, "int64_too": 200})

    pd.testing.assert_frame_equal(expected, actual)


def test_df_ffill(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
bf_result = scalars_df[["int64_col", "float64_col"]].ffill(limit=1).to_pandas()
Expand Down
88 changes: 88 additions & 0 deletions third_party/bigframes_vendored/pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4356,6 +4356,94 @@ def fillna(self, value):
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def replace(
self,
to_replace,
value=None,
*,
regex=False,
):
"""
Replace values given in `to_replace` with `value`.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you also help add code_samples in the docs as well? Since the person who implements the method knows more about the use cases, and we don't need to do it later in a separate PR.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added a few code samples


Values of the Series/DataFrame are replaced with other values dynamically.
This differs from updating with ``.loc`` or ``.iloc``, which require
you to specify a location to update with some value.

**Examples:**

>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None

>>> df = bpd.DataFrame({
... 'int_col': [1, 1, 2, 3],
... 'string_col': ["a", "b", "c", "b"],
... })

Using scalar `to_replace` and `value`:

>>> df.replace("b", "e")
int_col string_col
0 1 a
1 1 e
2 2 c
3 3 e
<BLANKLINE>
[4 rows x 2 columns]

Using dictionary:

>>> df.replace({"a": "e", 2: 5})
int_col string_col
0 1 e
1 1 b
2 5 c
3 3 b
<BLANKLINE>
[4 rows x 2 columns]

Using regex:

>>> df.replace("[ab]", "e", regex=True)
int_col string_col
0 1 e
1 1 e
2 2 c
3 3 e
<BLANKLINE>
[4 rows x 2 columns]


Args:
to_replace (str, regex, list, int, float or None):
How to find the values that will be replaced.
numeric: numeric values equal to `to_replace` will be replaced with `value`
str: string exactly matching `to_replace` will be replaced with `value`
regex: regexes matching `to_replace` will be replaced with `value`
list of str, regex, or numeric:
First, if `to_replace` and `value` are both lists, they **must** be the same length.
Second, if ``regex=True`` then all of the strings in **both**
lists will be interpreted as regexs otherwise they will match
directly. This doesn't matter much for `value` since there
are only a few possible substitution regexes you can use.
str, regex and numeric rules apply as above.

value (scalar, default None):
Value to replace any values matching `to_replace` with.
For a DataFrame a dict of values can be used to specify which
value to use for each column (columns not in the dict will not be
filled). Regular expressions, strings and lists or dicts of such
objects are also allowed.
regex (bool, default False):
Whether to interpret `to_replace` and/or `value` as regular
expressions. If this is ``True`` then `to_replace` *must* be a
string.

Returns:
Series/DataFrame: Object after replacement.
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

@property
def iloc(self):
"""Purely integer-location based indexing for selection by position."""
Expand Down