feat: add equals methods to series/dataframe (#76)

TrevorBergeron · web-flow · commit 636a209e0853 · 2023-10-05T14:09:14.000-07:00
* feat: add equals methods to series/dataframe
diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py
@@ -25,6 +25,39 @@
 import bigframes.operations.aggregations as agg_ops
 
 
+def equals(block1: blocks.Block, block2: blocks.Block) -> bool:
+    """Return True if both blocks have equal column labels, dtypes and data."""
+    if not block1.column_labels.equals(block2.column_labels):
+        return False
+    if block1.dtypes != block2.dtypes:
+        return False
+    # TODO: More advanced expression tree traversals to short circuit actually querying data
+    block1 = block1.reset_index(drop=False)
+    block2 = block2.reset_index(drop=False)
+    # zip() below stops at the shorter input, so unequal column counts (presumably
+    # from a differing number of index levels kept by drop=False) must fail here.
+    if len(block1.value_columns) != len(block2.value_columns):
+        return False
+    joined, (lmap, rmap) = block1.index.join(block2.index, how="outer")
+    joined_block = joined._block
+    equality_ids = []
+    for lcol, rcol in zip(block1.value_columns, block2.value_columns):
+        joined_block, result_id = joined_block.apply_binary_op(
+            lmap(lcol), rmap(rcol), ops.eq_nulls_match_op
+        )
+        # A null comparison result is treated as "not equal".
+        joined_block, result_id = joined_block.apply_unary_op(
+            result_id, ops.partial_right(ops.fillna_op, False)
+        )
+        equality_ids.append(result_id)
+    flags = joined_block.select_columns(equality_ids).with_column_labels(
+        list(range(len(equality_ids)))
+    )
+    stacked_block = flags.stack(dropna=False, sort=False)
+    result = stacked_block.get_stat(stacked_block.value_columns[0], agg_ops.all_op)
+    return typing.cast(bool, result)
+
+
 def indicate_duplicates(
     block: blocks.Block, columns: typing.Sequence[str], keep: str = "first"
 ) -> typing.Tuple[blocks.Block, str]:
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -1066,6 +1066,12 @@ def rename_axis(
             labels = [mapper]
         return DataFrame(self._block.with_index_labels(labels))
 
+    def equals(self, other: typing.Union[bigframes.series.Series, DataFrame]) -> bool:
+        """True only if ``other`` is a DataFrame with equal dtypes, labels and values."""
+        if isinstance(other, DataFrame):
+            return block_ops.equals(self._block, other._block)
+        return False
+
     def assign(self, **kwargs) -> DataFrame:
         # TODO(garrettwu) Support list-like values. Requires ordering.
         # TODO(garrettwu) Support callable values.
diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py
@@ -705,6 +705,16 @@ def eq_op(
     return x == y
 
 
+def eq_nulls_match_op(
+    x: ibis_types.Value,
+    y: ibis_types.Value,
+):
+    """Equality like eq_op, but null == null. Only valid when dtypes are the same."""
+    marker = ibis_types.literal("$NULL_SENTINEL$")
+    lhs, rhs = (v.cast(ibis_dtypes.str).fillna(marker) for v in (x, y))
+    return lhs == rhs
+
+
 def ne_op(
     x: ibis_types.Value,
     y: ibis_types.Value,
diff --git a/bigframes/series.py b/bigframes/series.py
@@ -209,6 +209,14 @@ def rename_axis(
             labels = [mapper]
         return Series(self._block.with_index_labels(labels))
 
+    def equals(
+        self, other: typing.Union[Series, bigframes.dataframe.DataFrame]
+    ) -> bool:
+        """True only if ``other`` is a Series with equal dtype, labels and values."""
+        if isinstance(other, Series):
+            return block_ops.equals(self._block, other._block)
+        return False
+
     def reset_index(
         self,
         *,
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
@@ -2551,6 +2551,74 @@ def test_df_reindex_columns(scalars_df_index, scalars_pandas_df_index):
     )
 
 
+def test_df_equals_identical(scalars_df_index, scalars_pandas_df_index):
+    """A frame compared with itself gives the same answer as pandas."""
+    # geography_col is listed as unsupported for this comparison, so drop it.
+    unsupported = ["geography_col"]
+    bf_df = scalars_df_index.drop(columns=unsupported)
+    pd_df = scalars_pandas_df_index.drop(columns=unsupported)
+
+    bf_result = bf_df.equals(bf_df)
+    pd_result = pd_df.equals(pd_df)
+
+    assert pd_result == bf_result
+
+
+def test_df_equals_series(scalars_df_index, scalars_pandas_df_index):
+    """A one-column DataFrame versus the Series of that column, as in pandas."""
+    col = "int64_col"
+    bf_result = scalars_df_index[[col]].equals(scalars_df_index[col])
+    pd_result = scalars_pandas_df_index[[col]].equals(scalars_pandas_df_index[col])
+
+    assert pd_result == bf_result
+
+
+def test_df_equals_different_dtype(scalars_df_index, scalars_pandas_df_index):
+    """A frame versus its Float64 cast: result must agree with pandas."""
+    columns = ["int64_col", "int64_too"]
+    bf_df = scalars_df_index[columns]
+    pd_df = scalars_pandas_df_index[columns]
+
+    # Cast copies, leaving the frames selected above untouched.
+    bf_modified = bf_df.copy().astype("Float64")
+    pd_modified = pd_df.copy().astype("Float64")
+
+    # Each original frame is compared against its own cast copy.
+    bf_result = bf_df.equals(bf_modified)
+    pd_result = pd_df.equals(pd_modified)
+
+    assert pd_result == bf_result
+
+
+def test_df_equals_different_values(scalars_df_index, scalars_pandas_df_index):
+    """A frame versus a copy with one column shifted by 1, as in pandas."""
+    columns = ["int64_col", "int64_too"]
+    bf_df = scalars_df_index[columns]
+    pd_df = scalars_pandas_df_index[columns]
+
+    bf_modified = bf_df.copy()
+    bf_modified["int64_col"] = bf_modified.int64_col + 1
+    pd_modified = pd_df.copy()
+    pd_modified["int64_col"] = pd_modified.int64_col + 1
+
+    bf_result = bf_df.equals(bf_modified)
+    pd_result = pd_df.equals(pd_modified)
+
+    assert pd_result == bf_result
+
+
+def test_df_equals_extra_column(scalars_df_index, scalars_pandas_df_index):
+    """A frame versus a superset of its columns: result must agree with pandas."""
+    columns = ["int64_col", "int64_too"]
+    more_columns = [*columns, "float64_col"]
+
+    bf_df, pd_df = scalars_df_index, scalars_pandas_df_index
+    bf_result = bf_df[columns].equals(bf_df[more_columns])
+    pd_result = pd_df[columns].equals(pd_df[more_columns])
+
+    assert pd_result == bf_result
+
+
 def test_df_reindex_like(scalars_df_index, scalars_pandas_df_index):
     reindex_target_bf = scalars_df_index.reindex(
         columns=["not_a_col", "int64_col", "int64_too"], index=[5, 1, 3, 99, 1]
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
@@ -112,6 +112,44 @@ def test_series_get_column_default(scalars_dfs):
     assert result == "default_val"
 
 
+def test_series_equals_identical(scalars_df_index, scalars_pandas_df_index):
+    """A Series compared with itself gives the same answer as pandas."""
+    bf_series = scalars_df_index.int64_col
+    pd_series = scalars_pandas_df_index.int64_col
+    bf_result, pd_result = bf_series.equals(bf_series), pd_series.equals(pd_series)
+
+    assert pd_result == bf_result
+
+
+def test_series_equals_df(scalars_df_index, scalars_pandas_df_index):
+    """A Series versus a one-column DataFrame of the same column, as in pandas."""
+    col = "int64_col"
+    bf_result = scalars_df_index[col].equals(scalars_df_index[[col]])
+    pd_result = scalars_pandas_df_index[col].equals(scalars_pandas_df_index[[col]])
+
+    assert pd_result == bf_result
+
+
+def test_series_equals_different_dtype(scalars_df_index, scalars_pandas_df_index):
+    """A Series versus its Float64 cast: result must agree with pandas."""
+    column, target_dtype = "int64_col", "Float64"
+    bf_series = scalars_df_index[column]
+    pd_series = scalars_pandas_df_index[column]
+    bf_result = bf_series.equals(bf_series.astype(target_dtype))
+    pd_result = pd_series.equals(pd_series.astype(target_dtype))
+    assert pd_result == bf_result
+
+
+def test_series_equals_different_values(scalars_df_index, scalars_pandas_df_index):
+    """A Series versus itself plus 1: result must agree with pandas."""
+    column = "int64_col"
+    bf_series = scalars_df_index[column]
+    pd_series = scalars_pandas_df_index[column]
+    bf_result = bf_series.equals(bf_series + 1)
+    pd_result = pd_series.equals(pd_series + 1)
+    assert pd_result == bf_result
+
+
 def test_series_get_with_default_index(scalars_dfs):
     col_name = "float64_col"
     key = 2
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -399,6 +399,28 @@ def to_orc(self, path=None, **kwargs) -> bytes | None:
     # ----------------------------------------------------------------------
     # Unsorted
 
+    def equals(self, other) -> bool:
+        """
+        Test whether two objects contain the same elements.
+
+        This function allows two Series or DataFrames to be compared against
+        each other to see if they have the same shape and elements. Missing
+        values (NaN/null) in the same location are considered equal.
+
+        The row and column indexes do not need to have the same type, as long
+        as their values are considered equal. Corresponding columns must be of
+        the same dtype. Comparing a DataFrame with a Series yields False.
+
+        Args:
+            other (Series or DataFrame):
+                The other Series or DataFrame to be compared with the first.
+
+        Returns:
+            bool: True if all elements are the same in both objects, False
+            otherwise.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def assign(self, **kwargs) -> DataFrame:
         r"""
         Assign new columns to a DataFrame.