Skip to content

Commit 636a209

Browse files
feat: add equals methods to series/dataframe (#76)
* feat: add equals methods to series/dataframe
1 parent ff3bb89 commit 636a209

File tree

7 files changed

+185
-0
lines changed

7 files changed

+185
-0
lines changed

bigframes/core/block_transforms.py

+33
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,39 @@
2525
import bigframes.operations.aggregations as agg_ops
2626

2727

28+
def equals(block1: blocks.Block, block2: blocks.Block) -> bool:
    """Return whether two blocks have the same labels, dtypes and values.

    The cheap checks run first: differing column labels or differing dtypes
    yield ``False`` without building any comparison expression. Otherwise
    both indexes are moved into the value columns, the blocks are outer
    joined on the resulting index, and each pair of corresponding columns is
    compared with ``ops.eq_nulls_match_op``.

    Args:
        block1: Left-hand block of the comparison.
        block2: Right-hand block of the comparison.

    Returns:
        The ``agg_ops.all_op`` aggregate over every per-cell comparison flag.
    """
    labels_match = block1.column_labels.equals(block2.column_labels)
    if not labels_match or block1.dtypes != block2.dtypes:
        return False
    # TODO: More advanced expression tree traversals to short circuit actually querying data

    # Turn the existing index levels into ordinary value columns so they are
    # compared together with the data columns.
    left = block1.reset_index(drop=False)
    right = block2.reset_index(drop=False)

    joined_index, (left_lookup, right_lookup) = left.index.join(
        right.index, how="outer"
    )
    combined = joined_index._block

    match_ids = []
    for left_id, right_id in zip(left.value_columns, right.value_columns):
        combined, match_id = combined.apply_binary_op(
            left_lookup(left_id), right_lookup(right_id), ops.eq_nulls_match_op
        )
        # Replace any null comparison result with False.
        combined, match_id = combined.apply_unary_op(
            match_id, ops.partial_right(ops.fillna_op, False)
        )
        match_ids.append(match_id)

    # Relabel the flag columns positionally, then stack them into a single
    # column so one aggregation covers every cell.
    positional_labels = list(range(len(match_ids)))
    flags = combined.select_columns(match_ids).with_column_labels(positional_labels)
    stacked = flags.stack(dropna=False, sort=False)
    all_equal = stacked.get_stat(stacked.value_columns[0], agg_ops.all_op)
    return typing.cast(bool, all_equal)
59+
60+
2861
def indicate_duplicates(
2962
block: blocks.Block, columns: typing.Sequence[str], keep: str = "first"
3063
) -> typing.Tuple[blocks.Block, str]:

bigframes/dataframe.py

+6
Original file line numberDiff line numberDiff line change
@@ -1066,6 +1066,12 @@ def rename_axis(
10661066
labels = [mapper]
10671067
return DataFrame(self._block.with_index_labels(labels))
10681068

1069+
def equals(self, other: typing.Union[bigframes.series.Series, DataFrame]) -> bool:
    """Return whether ``other`` is a DataFrame equal to this one.

    Anything that is not a DataFrame compares unequal; otherwise the
    decision is delegated to ``block_ops.equals`` on the underlying blocks.
    """
    # Must be same object type, same column dtypes, and same label values
    if isinstance(other, DataFrame):
        return block_ops.equals(self._block, other._block)
    return False
1074+
10691075
def assign(self, **kwargs) -> DataFrame:
10701076
# TODO(garrettwu) Support list-like values. Requires ordering.
10711077
# TODO(garrettwu) Support callable values.

bigframes/operations/__init__.py

+10
Original file line numberDiff line numberDiff line change
@@ -705,6 +705,16 @@ def eq_op(
705705
return x == y
706706

707707

708+
def eq_nulls_match_op(
    x: ibis_types.Value,
    y: ibis_types.Value,
):
    """Variant of eq_op where nulls match each other. Only use where dtypes are known to be same.

    Both operands are cast to strings and nulls are replaced by a sentinel
    literal before comparing, so two nulls compare equal.

    Args:
        x: Left operand expression.
        y: Right operand expression.

    Returns:
        A boolean expression that is true where both operands are null, or
        where neither is null and their string forms are equal.
    """
    sentinel = ibis_types.literal("$NULL_SENTINEL$")
    left = x.cast(ibis_dtypes.str).fillna(sentinel)
    right = y.cast(ibis_dtypes.str).fillna(sentinel)
    # The sentinel alone cannot tell a null apart from a real value whose
    # string form happens to equal the sentinel text, so additionally require
    # that null-ness agrees on both sides.
    return (left == right) & (x.isnull() == y.isnull())
716+
717+
708718
def ne_op(
709719
x: ibis_types.Value,
710720
y: ibis_types.Value,

bigframes/series.py

+8
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,14 @@ def rename_axis(
209209
labels = [mapper]
210210
return Series(self._block.with_index_labels(labels))
211211

212+
def equals(
    self, other: typing.Union[Series, bigframes.dataframe.DataFrame]
) -> bool:
    """Return whether ``other`` is a Series equal to this one.

    Anything that is not a Series compares unequal; otherwise the decision
    is delegated to ``block_ops.equals`` on the underlying blocks.
    """
    # Must be same object type, same column dtypes, and same label values
    if isinstance(other, Series):
        return block_ops.equals(self._block, other._block)
    return False
219+
212220
def reset_index(
213221
self,
214222
*,

tests/system/small/test_dataframe.py

+68
Original file line numberDiff line numberDiff line change
@@ -2551,6 +2551,74 @@ def test_df_reindex_columns(scalars_df_index, scalars_pandas_df_index):
25512551
)
25522552

25532553

2554+
def test_df_equals_identical(scalars_df_index, scalars_pandas_df_index):
    """A frame compared with itself gives the same answer as pandas."""
    # geography_col is dropped from both frames before comparing.
    excluded = ["geography_col"]
    bf_df = scalars_df_index.drop(columns=excluded)
    pd_df = scalars_pandas_df_index.drop(columns=excluded)

    actual = bf_df.equals(bf_df)
    expected = pd_df.equals(pd_df)

    assert expected == actual
2565+
2566+
2567+
def test_df_equals_series(scalars_df_index, scalars_pandas_df_index):
    """Comparing a one-column frame with a Series matches pandas."""
    bf_frame = scalars_df_index[["int64_col"]]
    actual = bf_frame.equals(scalars_df_index["int64_col"])

    pd_frame = scalars_pandas_df_index[["int64_col"]]
    expected = pd_frame.equals(scalars_pandas_df_index["int64_col"])

    assert expected == actual
2574+
2575+
2576+
def test_df_equals_different_dtype(scalars_df_index, scalars_pandas_df_index):
    """Comparing a frame with a Float64-cast copy of itself matches pandas."""
    selected = ["int64_col", "int64_too"]
    bf_base = scalars_df_index[selected]
    pd_base = scalars_pandas_df_index[selected]

    # Same values, but every column cast to Float64.
    bf_as_float = bf_base.copy().astype("Float64")
    pd_as_float = pd_base.copy().astype("Float64")

    actual = bf_base.equals(bf_as_float)
    expected = pd_base.equals(pd_as_float)

    assert expected == actual
2591+
2592+
2593+
def test_df_equals_different_values(scalars_df_index, scalars_pandas_df_index):
    """Comparing a frame with a copy whose int64_col is shifted by one matches pandas."""
    selected = ["int64_col", "int64_too"]
    bf_base = scalars_df_index[selected]
    pd_base = scalars_pandas_df_index[selected]

    # Copy the bigframes frame and shift one column by one.
    bf_shifted = bf_base.copy()
    bf_shifted["int64_col"] = bf_shifted.int64_col + 1

    # Apply the same change on the pandas side.
    pd_shifted = pd_base.copy()
    pd_shifted["int64_col"] = pd_shifted.int64_col + 1

    actual = bf_base.equals(bf_shifted)
    expected = pd_base.equals(pd_shifted)

    assert expected == actual
2608+
2609+
2610+
def test_df_equals_extra_column(scalars_df_index, scalars_pandas_df_index):
    """Comparing against a frame with one additional column matches pandas."""
    narrow = ["int64_col", "int64_too"]
    wide = ["int64_col", "int64_too", "float64_col"]

    bf_narrow = scalars_df_index[narrow]
    actual = bf_narrow.equals(scalars_df_index[wide])

    pd_narrow = scalars_pandas_df_index[narrow]
    expected = pd_narrow.equals(scalars_pandas_df_index[wide])

    assert expected == actual
2620+
2621+
25542622
def test_df_reindex_like(scalars_df_index, scalars_pandas_df_index):
25552623
reindex_target_bf = scalars_df_index.reindex(
25562624
columns=["not_a_col", "int64_col", "int64_too"], index=[5, 1, 3, 99, 1]

tests/system/small/test_series.py

+38
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,44 @@ def test_series_get_column_default(scalars_dfs):
112112
assert result == "default_val"
113113

114114

115+
def test_series_equals_identical(scalars_df_index, scalars_pandas_df_index):
    """A Series compared with the same column gives the same answer as pandas."""
    bf_left = scalars_df_index.int64_col
    actual = bf_left.equals(scalars_df_index.int64_col)

    pd_left = scalars_pandas_df_index.int64_col
    expected = pd_left.equals(scalars_pandas_df_index.int64_col)

    assert expected == actual
122+
123+
124+
def test_series_equals_df(scalars_df_index, scalars_pandas_df_index):
    """Comparing a Series with a one-column frame matches pandas."""
    bf_series = scalars_df_index["int64_col"]
    actual = bf_series.equals(scalars_df_index[["int64_col"]])

    pd_series = scalars_pandas_df_index["int64_col"]
    expected = pd_series.equals(scalars_pandas_df_index[["int64_col"]])

    assert expected == actual
131+
132+
133+
def test_series_equals_different_dtype(scalars_df_index, scalars_pandas_df_index):
    """Comparing a Series with its Float64-cast version matches pandas."""
    bf_ints = scalars_df_index["int64_col"]
    pd_ints = scalars_pandas_df_index["int64_col"]

    bf_floats = bf_ints.astype("Float64")
    actual = bf_ints.equals(bf_floats)

    pd_floats = pd_ints.astype("Float64")
    expected = pd_ints.equals(pd_floats)

    assert expected == actual
141+
142+
143+
def test_series_equals_different_values(scalars_df_index, scalars_pandas_df_index):
    """Comparing a Series with itself shifted by one matches pandas."""
    bf_ints = scalars_df_index["int64_col"]
    pd_ints = scalars_pandas_df_index["int64_col"]

    bf_shifted = bf_ints + 1
    actual = bf_ints.equals(bf_shifted)

    pd_shifted = pd_ints + 1
    expected = pd_ints.equals(pd_shifted)

    assert expected == actual
151+
152+
115153
def test_series_get_with_default_index(scalars_dfs):
116154
col_name = "float64_col"
117155
key = 2

third_party/bigframes_vendored/pandas/core/frame.py

+22
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,28 @@ def to_orc(self, path=None, **kwargs) -> bytes | None:
399399
# ----------------------------------------------------------------------
400400
# Unsorted
401401

402+
def equals(self, other) -> bool:
    """
    Test whether two objects contain the same elements.

    Compares two Series or DataFrames to determine whether they have the
    same shape and the same elements. NaNs found in the same location are
    treated as equal.

    The row/column index are not required to be of the same type, provided
    their values are considered equal. Corresponding columns must share
    the same dtype.

    Args:
        other (Series or DataFrame):
            The other Series or DataFrame to compare with this one.

    Returns:
        bool: True if all elements are the same in both objects, False
        otherwise.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
423+
402424
def assign(self, **kwargs) -> DataFrame:
403425
r"""
404426
Assign new columns to a DataFrame.

0 commit comments

Comments
 (0)