Skip to content

Commit 5f80862

Browse files
feat: add replace method to DataFrame
1 parent 89a1c67 commit 5f80862

File tree

5 files changed

+210
-29
lines changed

5 files changed

+210
-29
lines changed

bigframes/dataframe.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1561,6 +1561,21 @@ def interpolate(self, method: str = "linear") -> DataFrame:
15611561
def fillna(self, value=None) -> DataFrame:
    """Combine this frame with ``value`` through ``ops.fillna_op``.

    Args:
        value: Operand handed to the binary op; defaults to ``None``.

    Returns:
        DataFrame: Whatever ``_apply_binop`` produces for a ``how="left"``
        application of ``ops.fillna_op``.
    """
    filled = self._apply_binop(value, ops.fillna_op, how="left")
    return filled
15631563

1564+
def replace(
    self, to_replace: typing.Any, value: typing.Any = None, *, regex: bool = False
):
    """Replace ``to_replace`` with ``value`` one column at a time.

    Every column is routed through ``self.apply`` and handled by that
    column's own ``replace`` method.

    Args:
        to_replace: Forwarded unchanged to each column's ``replace``.
        value: Replacement value. When dict-like it is looked up by column
            name, and columns whose name is not a key are returned as-is.
        regex: Forwarded unchanged to each column's ``replace``.

    Returns:
        The result of ``self.apply`` over the per-column replacement.
    """
    if not utils.is_dict_like(value):

        def replace_everywhere(column):
            return column.replace(to_replace=to_replace, value=value, regex=regex)

        return self.apply(replace_everywhere)

    def replace_if_listed(column):
        # Columns without an entry in the mapping pass through untouched.
        if column.name not in value:
            return column
        return column.replace(
            to_replace=to_replace, value=value[column.name], regex=regex
        )

    return self.apply(replace_if_listed)
1578+
15641579
def ffill(self, *, limit: typing.Optional[int] = None) -> DataFrame:
    """Apply ``LastNonNullOp`` over a window reaching back ``limit`` rows.

    Args:
        limit: Used as the window's ``preceding`` bound; the ``following``
            bound is fixed at 0.

    Returns:
        DataFrame: Result of ``_apply_window_op`` for that op and window.
    """
    lookback = bigframes.core.WindowSpec(preceding=limit, following=0)
    fill_op = agg_ops.LastNonNullOp()
    return self._apply_window_op(fill_op, lookback)
@@ -1850,7 +1865,7 @@ def melt(
18501865
self._block.melt(id_col_ids, val_col_ids, var_name, value_name)
18511866
)
18521867

1853-
def describe(self) -> DataFrame:
1868+
def describe(self, *, include=None, exclude=None) -> DataFrame:
18541869
df_numeric = self._drop_non_numeric(keep_bool=False)
18551870
if len(df_numeric.columns) == 0:
18561871
raise NotImplementedError(

bigframes/dtypes.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
"""Mappings for Pandas dtypes supported by BigQuery DataFrames package"""
1616

17+
import datetime
1718
import textwrap
1819
import typing
1920
from typing import Any, Dict, Iterable, Literal, Tuple, Union
@@ -437,3 +438,48 @@ def to_pandas_dtypes_overrides(schema: Iterable[bigquery.SchemaField]) -> Dict:
437438
gcb3p_pandas_helpers.bq_to_arrow_data_type(field)
438439
)
439440
return dtypes
441+
442+
443+
def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool:
    """Captures whether a scalar can be losslessly represented by a dtype.

    ``None`` is accepted for every dtype. Otherwise the dtype is classified
    (bool, float, int64, string, then Arrow-backed, in that order) and the
    scalar is checked against the matching Python/pandas scalar kind.

    Raises:
        TypeError: If ``dtype`` falls into none of the handled categories.
    """
    if scalar is None:
        return True

    pd_types = pd.api.types
    # The order of these branches is significant and mirrors the dtype
    # classification precedence.
    if pd_types.is_bool_dtype(dtype):
        matches = pd_types.is_bool(scalar)
    elif pd_types.is_float_dtype(dtype):
        matches = pd_types.is_float(scalar)
    elif pd_types.is_int64_dtype(dtype):
        matches = pd_types.is_integer(scalar)
    elif pd_types.is_string_dtype(dtype):
        matches = isinstance(scalar, str)
    elif isinstance(dtype, pd.ArrowDtype):
        matches = is_patype(scalar, dtype.pyarrow_dtype)
    else:
        raise TypeError(f"unrecognized type: {dtype}")
    return matches
459+
460+
461+
def is_patype(scalar: typing.Any, pa_type: pa.DataType) -> bool:
    """Return whether ``scalar`` is a Python value matching ``pa_type``.

    Args:
        scalar: The value to test.
        pa_type: One of ``time64("us")``, ``timestamp("us")``,
            ``timestamp("us", tz="UTC")`` or ``date32()``.

    Returns:
        bool: ``True`` when the scalar's Python type (and, for timestamps,
        its timezone) fits ``pa_type``; ``False`` for any other scalar.

    Raises:
        TypeError: Only when ``pa_type`` itself is not one of the handled
            Arrow types, never because of the scalar's type.
    """
    if pa_type == pa.time64("us"):
        return isinstance(scalar, datetime.time)
    if pa_type == pa.timestamp("us"):
        # pd.Timestamp subclasses datetime.datetime, so one isinstance check
        # covers both. A non-datetime scalar is simply "not this type" rather
        # than falling through to the unrecognized-type error below.
        return isinstance(scalar, datetime.datetime) and not scalar.tzinfo
    if pa_type == pa.timestamp("us", tz="UTC"):
        return (
            isinstance(scalar, datetime.datetime)
            and scalar.tzinfo == datetime.timezone.utc
        )
    if pa_type == pa.date32():
        # datetime.datetime is a subclass of datetime.date but carries a time
        # of day, so it is not a lossless date value.
        return isinstance(scalar, datetime.date) and not isinstance(
            scalar, datetime.datetime
        )
    raise TypeError(f"unrecognized type: {pa_type}")
477+
478+
479+
def is_comparable(type1: Dtype, type2: Dtype) -> bool:
    """Whether items of type1 and type2 can be compared to each other.

    Two dtypes are comparable when they are equal, or when both are members
    of ``NUMERIC_BIGFRAMES_TYPES``.
    """
    if type1 == type2:
        return True
    return all(dtype in NUMERIC_BIGFRAMES_TYPES for dtype in (type1, type2))

bigframes/series.py

Lines changed: 39 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -442,42 +442,53 @@ def replace(
442442
self, to_replace: typing.Any, value: typing.Any = None, *, regex: bool = False
443443
):
444444
if regex:
445-
if not (isinstance(to_replace, str) and isinstance(value, str)):
446-
raise NotImplementedError(
447-
f"replace regex mode only supports strings for 'to_replace' and 'value'. {constants.FEEDBACK_LINK}"
448-
)
449-
block, result_col = self._block.apply_unary_op(
450-
self._value_column,
451-
ops.ReplaceRegexOp(to_replace, value),
452-
result_label=self.name,
453-
)
454-
return Series(block.select_column(result_col))
445+
# No-op unless to_replace and series dtype are both string type
446+
if not isinstance(to_replace, str) or not isinstance(
447+
self.dtype, pandas.StringDtype
448+
):
449+
return self
450+
return self._regex_replace(to_replace, value)
455451
elif utils.is_dict_like(to_replace):
456452
raise NotImplementedError(
457453
f"Dict 'to_replace' not supported. {constants.FEEDBACK_LINK}"
458454
)
459455
elif utils.is_list_like(to_replace):
460-
block, cond = self._block.apply_unary_op(
461-
self._value_column, ops.IsInOp(to_replace)
462-
)
463-
block, result_col = block.apply_binary_op(
464-
cond,
465-
self._value_column,
466-
ops.partial_arg1(ops.where_op, value),
467-
result_label=self.name,
468-
)
469-
return Series(block.select_column(result_col))
456+
replace_list = to_replace
470457
else: # Scalar
471-
block, cond = self._block.apply_unary_op(
472-
self._value_column, ops.BinopPartialLeft(ops.eq_op, to_replace)
458+
replace_list = [to_replace]
459+
replace_list = [
460+
i for i in replace_list if bigframes.dtypes.is_dtype(i, self.dtype)
461+
]
462+
return self._simple_replace(replace_list, value) if replace_list else self
463+
464+
def _regex_replace(self, to_replace: str, value: str):
465+
if not bigframes.dtypes.is_dtype(value, self.dtype):
466+
raise NotImplementedError(
467+
f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}"
473468
)
474-
block, result_col = block.apply_binary_op(
475-
cond,
476-
self._value_column,
477-
ops.partial_arg1(ops.where_op, value),
478-
result_label=self.name,
469+
block, result_col = self._block.apply_unary_op(
470+
self._value_column,
471+
ops.ReplaceRegexOp(to_replace, value),
472+
result_label=self.name,
473+
)
474+
return Series(block.select_column(result_col))
475+
476+
def _simple_replace(self, to_replace_list: typing.Sequence, value):
477+
if not bigframes.dtypes.is_dtype(value, self.dtype):
478+
raise NotImplementedError(
479+
f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}"
479480
)
480-
return Series(block.select_column(result_col))
481+
482+
block, cond = self._block.apply_unary_op(
483+
self._value_column, ops.IsInOp(to_replace_list)
484+
)
485+
block, result_col = block.apply_binary_op(
486+
cond,
487+
self._value_column,
488+
ops.partial_arg1(ops.where_op, value),
489+
result_label=self.name,
490+
)
491+
return Series(block.select_column(result_col))
481492

482493
def interpolate(self, method: str = "linear") -> Series:
483494
if method == "pad":

tests/system/small/test_dataframe.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -839,6 +839,50 @@ def test_df_fillna(scalars_dfs):
839839
pandas.testing.assert_frame_equal(bf_result, pd_result)
840840

841841

842+
def test_df_replace_scalar_scalar(scalars_dfs):
    """Scalar-for-scalar replace must produce the same frame as pandas."""
    bf_df, pd_df = scalars_dfs
    old, new = "Hello, World!", "Howdy, Planet!"

    actual = bf_df.replace(old, new).to_pandas()
    expected = pd_df.replace(old, new)

    pd.testing.assert_frame_equal(expected, actual)
851+
852+
853+
def test_df_replace_regex_scalar(scalars_dfs):
    """Regex-pattern replace must produce the same frame as pandas."""
    bf_df, pd_df = scalars_dfs
    pattern, new = "^H.l", "Howdy, Planet!"

    actual = bf_df.replace(pattern, new, regex=True).to_pandas()
    expected = pd_df.replace(pattern, new, regex=True)

    pd.testing.assert_frame_equal(expected, actual)
862+
863+
864+
def test_df_replace_list_scalar(scalars_dfs):
    """Replacing a list of targets with one scalar must match pandas."""
    bf_df, pd_df = scalars_dfs
    targets = ("Hello, World!", "T")
    new = "Howdy, Planet!"

    # Each call receives its own fresh list, as in the literal-argument form.
    actual = bf_df.replace(list(targets), new).to_pandas()
    expected = pd_df.replace(list(targets), new)

    pd.testing.assert_frame_equal(expected, actual)
873+
874+
875+
def test_df_replace_value_dict(scalars_dfs):
    """A per-column dict of replacement values must match pandas."""
    bf_df, pd_df = scalars_dfs
    per_column = (("int64_col", 100), ("int64_too", 200))

    # Each call receives its own fresh dict, as in the literal-argument form.
    actual = bf_df.replace(1, dict(per_column)).to_pandas()
    expected = pd_df.replace(1, dict(per_column))

    pd.testing.assert_frame_equal(expected, actual)
884+
885+
842886
def test_df_ffill(scalars_dfs):
843887
scalars_df, scalars_pandas_df = scalars_dfs
844888
bf_result = scalars_df[["int64_col", "float64_col"]].ffill(limit=1).to_pandas()

third_party/bigframes_vendored/pandas/core/frame.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4006,6 +4006,71 @@ def fillna(self, value):
40064006
"""
40074007
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
40084008

4009+
def replace(
    self,
    to_replace,
    value=None,
    *,
    regex=False,
):
    """
    Replace values given in `to_replace` with `value`.

    Values of the Series/DataFrame are replaced with other values dynamically.
    This differs from updating with ``.loc`` or ``.iloc``, which require
    you to specify a location to update with some value.

    Args:
        to_replace (str, regex, list, int, float or None):
            How to find the values that will be replaced.

            * numeric, str or regex:

                - numeric: numeric values equal to `to_replace` will be
                  replaced with `value`
                - str: string exactly matching `to_replace` will be replaced
                  with `value`
                - regex: regexes matching `to_replace` will be replaced with
                  `value`

            * list of str, regex, or numeric:

                - First, if `to_replace` and `value` are both lists, they
                  **must** be the same length.
                - Second, if ``regex=True`` then all of the strings in **both**
                  lists will be interpreted as regexes; otherwise they will
                  match directly. This doesn't matter much for `value` since
                  there are only a few possible substitution regexes you can
                  use.
                - str, regex and numeric rules apply as above.

        value (scalar, default None):
            Value to replace any values matching `to_replace` with.
            For a DataFrame a dict of values can be used to specify which
            value to use for each column (columns not in the dict will not be
            filled). Regular expressions, strings and lists or dicts of such
            objects are also allowed.
        regex (bool, default False):
            Whether to interpret `to_replace` and/or `value` as regular
            expressions. If this is ``True`` then `to_replace` *must* be a
            string.

    Returns:
        Series/DataFrame: Object after replacement.

    Raises:
        TypeError:
            * If `to_replace` is not a scalar, array-like, ``dict``, or ``None``
            * If `to_replace` is a ``dict`` and `value` is not a ``list``,
              ``dict``, ``ndarray``, or ``Series``
            * If `to_replace` is ``None`` and `regex` is not compilable
              into a regular expression or is a list, dict, ndarray, or
              Series.
            * When replacing multiple ``bool`` or ``datetime64`` objects and
              the arguments to `to_replace` do not match the type of the
              value being replaced
    """
    # Abstract stub: this vendored definition only carries the documentation.
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
4073+
40094074
@property
40104075
def iloc(self):
40114076
"""Purely integer-location based indexing for selection by position."""

0 commit comments

Comments
 (0)