Skip to content

Commit c4beafd

Browse files
authored
feat: support dataframe.cov (#498)
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Document: https://screenshot.googleplex.com/9egi7MsNj2uWHkH

Fixes #<issue_number_goes_here> 🦕
1 parent 97afad9 commit c4beafd

File tree

5 files changed

+101
-11
lines changed

5 files changed

+101
-11
lines changed

bigframes/core/blocks.py

+14-5
Original file line numberDiff line numberDiff line change
@@ -1110,13 +1110,22 @@ def summarize(
11101110
index_columns=[label_col_id],
11111111
)
11121112

1113-
def corr(self):
1114-
"""Returns a block object to compute the self-correlation on this block."""
1113+
def calculate_pairwise_metric(self, op=agg_ops.CorrOp()):
1114+
"""
1115+
Returns a block object to compute pairwise metrics among all value columns in this block.
1116+
1117+
The metric to be computed is specified by the `op` parameter, which can be either a
1118+
correlation operation (default) or a covariance operation.
1119+
"""
1120+
if len(self.value_columns) > 30:
1121+
raise NotImplementedError(
1122+
"This function supports dataframes with 30 columns or fewer. "
1123+
f"Provided dataframe has {len(self.value_columns)} columns. {constants.FEEDBACK_LINK}"
1124+
)
1125+
11151126
aggregations = [
11161127
(
1117-
ex.BinaryAggregation(
1118-
agg_ops.CorrOp(), ex.free_var(left_col), ex.free_var(right_col)
1119-
),
1128+
ex.BinaryAggregation(op, ex.free_var(left_col), ex.free_var(right_col)),
11201129
f"{left_col}-{right_col}",
11211130
)
11221131
for left_col in self.value_columns

bigframes/dataframe.py

+9-5
Original file line numberDiff line numberDiff line change
@@ -1019,17 +1019,21 @@ def corr(self, method="pearson", min_periods=None, numeric_only=False) -> DataFr
10191019
raise NotImplementedError(
10201020
f"min_periods not yet supported. {constants.FEEDBACK_LINK}"
10211021
)
1022-
if len(self.columns) > 30:
1023-
raise NotImplementedError(
1024-
f"Only work with dataframes containing fewer than 30 columns. Current: {len(self.columns)}. {constants.FEEDBACK_LINK}"
1025-
)
10261022

10271023
if not numeric_only:
10281024
frame = self._raise_on_non_numeric("corr")
10291025
else:
10301026
frame = self._drop_non_numeric()
10311027

1032-
return DataFrame(frame._block.corr())
1028+
return DataFrame(frame._block.calculate_pairwise_metric(op=agg_ops.CorrOp()))
1029+
1030+
def cov(self, *, numeric_only: bool = False) -> DataFrame:
    # Pairwise covariance between columns, computed by the block-level
    # pairwise-metric helper with the covariance aggregation op.
    #
    # numeric_only=True  -> non-numeric columns are dropped first.
    # numeric_only=False -> non-numeric columns are rejected instead.
    #
    # No docstring on purpose: documentation for this method is expected to
    # come from the vendored pandas base class.
    # NOTE(review): confirm this class inherits its docstrings that way.
    if numeric_only:
        frame = self._drop_non_numeric()
    else:
        # Pass this method's own name. The previous "corr" was a copy-paste
        # leftover from DataFrame.corr.
        # NOTE(review): assumes the argument is the operation name used in
        # the rejection message -- confirm against _raise_on_non_numeric.
        frame = self._raise_on_non_numeric("cov")

    return DataFrame(frame._block.calculate_pairwise_metric(op=agg_ops.CovOp()))
10331037

10341038
def to_pandas(
10351039
self,

tests/system/small/test_dataframe.py

+28
Original file line numberDiff line numberDiff line change
@@ -1916,6 +1916,34 @@ def test_corr_w_invalid_parameters(scalars_dfs):
19161916
scalars_df[columns].corr(min_periods=1)
19171917

19181918

1919+
@pytest.mark.parametrize(
    ("columns", "numeric_only"),
    [
        pytest.param(["bool_col", "int64_col", "float64_col"], True),
        pytest.param(["bool_col", "int64_col", "float64_col"], False),
        pytest.param(["bool_col", "int64_col", "float64_col", "string_col"], True),
        pytest.param(
            ["bool_col", "int64_col", "float64_col", "string_col"],
            False,
            marks=pytest.mark.xfail(raises=NotImplementedError),
        ),
    ],
)
def test_cov_w_numeric_only(scalars_dfs, columns, numeric_only):
    """Check that DataFrame.cov matches pandas for both numeric_only settings."""
    bf_df, pd_df = scalars_dfs

    # Evaluate BigFrames first: in the xfail case the expected
    # NotImplementedError must surface before pandas is asked to handle the
    # string column.
    actual = bf_df[columns].cov(numeric_only=numeric_only).to_pandas()
    expected = pd_df[columns].cov(numeric_only=numeric_only)

    # BigFrames and pandas differ in their data type handling, so dtype and
    # index-type checks are relaxed:
    # - Column types: BigFrames uses Float64, pandas uses float64.
    # - Index types: BigFrames uses string, pandas uses object.
    pd.testing.assert_frame_equal(
        actual,
        expected,
        check_dtype=False,
        check_index_type=False,
    )
1945+
1946+
19191947
@pytest.mark.parametrize(
19201948
("op"),
19211949
[

tests/system/small/test_multiindex.py

+21
Original file line numberDiff line numberDiff line change
@@ -921,6 +921,27 @@ def test_corr_w_multi_index(scalars_df_index, scalars_pandas_df_index):
921921
)
922922

923923

924+
def test_cov_w_multi_index(scalars_df_index, scalars_pandas_df_index):
    """Check that DataFrame.cov matches pandas when columns are a MultiIndex."""
    selected = ["int64_too", "float64_col", "int64_col"]
    # Same labels the zip-based construction produced; note ("b", 2) repeats.
    column_labels = pandas.MultiIndex.from_tuples([("a", 1), ("b", 2), ("b", 2)])

    bf_df = scalars_df_index[selected].copy()
    pd_df = scalars_pandas_df_index[selected].copy()
    bf_df.columns = column_labels
    pd_df.columns = column_labels

    actual = bf_df.cov(numeric_only=True).to_pandas()
    expected = pd_df.cov(numeric_only=True)

    # BigFrames and pandas differ in their data type handling, so dtype and
    # index-type checks are relaxed:
    # - Column types: BigFrames uses Float64, pandas uses float64.
    # - Index types: BigFrames uses string, pandas uses object.
    pandas.testing.assert_frame_equal(
        actual,
        expected,
        check_dtype=False,
        check_index_type=False,
    )
943+
944+
924945
@pytest.mark.parametrize(
925946
("index_names",),
926947
[

third_party/bigframes_vendored/pandas/core/frame.py

+29-1
Original file line numberDiff line numberDiff line change
@@ -2834,10 +2834,38 @@ def corr(self, method, min_periods, numeric_only) -> DataFrame:
28342834
Include only float, int, boolean, decimal data.
28352835
28362836
Returns:
2837-
DataFrame: Correlation matrix.
2837+
DataFrame: Correlation matrix.
28382838
"""
28392839
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
28402840

2841+
def cov(self, *, numeric_only) -> DataFrame:
    """
    Compute pairwise covariance of columns, excluding NA/null values.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> bpd.options.display.progress_bar = None

        >>> df = bpd.DataFrame({'A': [1, 2, 3],
        ...                    'B': [400, 500, 600],
        ...                    'C': [0.8, 0.4, 0.9]})
        >>> df.cov(numeric_only=True)
               A        B     C
        A    1.0    100.0  0.05
        B  100.0  10000.0   5.0
        C   0.05      5.0  0.07
        <BLANKLINE>
        [3 rows x 3 columns]

    Args:
        numeric_only(bool, default False):
            Include only float, int, boolean, decimal data.

    Returns:
        DataFrame: The covariance matrix of the series of the DataFrame.
    """
    # Abstract stub: raise like the sibling `corr` stub does, instead of
    # silently returning None when no concrete override exists.
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
2868+
28412869
def update(
28422870
self, other, join: str = "left", overwrite: bool = True, filter_func=None
28432871
) -> DataFrame:

0 commit comments

Comments
 (0)