|
18 | 18 | import bigframes.pandas as bpd
|
19 | 19 | from tests.system.utils import assert_pandas_df_equal
|
20 | 20 |
|
| 21 | +# ================= |
| 22 | +# DataFrame.groupby |
| 23 | +# ================= |
| 24 | + |
21 | 25 |
|
22 | 26 | @pytest.mark.parametrize(
|
23 | 27 | ("operator"),
|
@@ -269,21 +273,26 @@ def test_dataframe_groupby_analytic(
|
269 | 273 | pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False)
|
270 | 274 |
|
271 | 275 |
|
272 |
| -def test_series_groupby_skew(scalars_df_index, scalars_pandas_df_index): |
273 |
| - bf_result = scalars_df_index.groupby("bool_col")["int64_too"].skew().to_pandas() |
274 |
| - pd_result = scalars_pandas_df_index.groupby("bool_col")["int64_too"].skew() |
| 276 | +def test_dataframe_groupby_size_as_index_false( |
| 277 | + scalars_df_index, scalars_pandas_df_index |
| 278 | +): |
| 279 | + bf_result = scalars_df_index.groupby("string_col", as_index=False).size() |
| 280 | + bf_result_computed = bf_result.to_pandas() |
| 281 | + pd_result = scalars_pandas_df_index.groupby("string_col", as_index=False).size() |
275 | 282 |
|
276 |
| - pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) |
| 283 | + pd.testing.assert_frame_equal( |
| 284 | + pd_result, bf_result_computed, check_dtype=False, check_index_type=False |
| 285 | + ) |
277 | 286 |
|
278 | 287 |
|
279 |
| -def test_series_groupby_kurt(scalars_df_index, scalars_pandas_df_index): |
280 |
| - bf_result = scalars_df_index.groupby("bool_col")["int64_too"].kurt().to_pandas() |
281 |
| - # Pandas doesn't have groupby.kurt yet: https://github.com/pandas-dev/pandas/issues/40139 |
282 |
| - pd_result = scalars_pandas_df_index.groupby("bool_col")["int64_too"].apply( |
283 |
| - pd.Series.kurt |
284 |
| - ) |
| 288 | +def test_dataframe_groupby_size_as_index_true( |
| 289 | + scalars_df_index, scalars_pandas_df_index |
| 290 | +): |
| 291 | + bf_result = scalars_df_index.groupby("string_col", as_index=True).size() |
| 292 | + pd_result = scalars_pandas_df_index.groupby("string_col", as_index=True).size() |
| 293 | + bf_result_computed = bf_result.to_pandas() |
285 | 294 |
|
286 |
| - pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) |
| 295 | + pd.testing.assert_series_equal(pd_result, bf_result_computed, check_dtype=False) |
287 | 296 |
|
288 | 297 |
|
289 | 298 | def test_dataframe_groupby_skew(scalars_df_index, scalars_pandas_df_index):
|
@@ -356,6 +365,30 @@ def test_dataframe_groupby_getitem_list(
|
356 | 365 | pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False)
|
357 | 366 |
|
358 | 367 |
|
| 368 | +def test_dataframe_groupby_nonnumeric_with_mean(): |
| 369 | + df = pd.DataFrame( |
| 370 | + { |
| 371 | + "key1": ["a", "a", "a", "b"], |
| 372 | + "key2": ["a", "a", "c", "c"], |
| 373 | + "key3": [1, 2, 3, 4], |
| 374 | + "key4": [1.6, 2, 3, 4], |
| 375 | + } |
| 376 | + ) |
| 377 | + pd_result = df.groupby(["key1", "key2"]).mean() |
| 378 | + |
| 379 | + with bpd.option_context("bigquery.location", "US"): |
| 380 | + bf_result = bpd.DataFrame(df).groupby(["key1", "key2"]).mean().to_pandas() |
| 381 | + |
| 382 | + pd.testing.assert_frame_equal( |
| 383 | + pd_result, bf_result, check_index_type=False, check_dtype=False |
| 384 | + ) |
| 385 | + |
| 386 | + |
| 387 | +# ============== |
| 388 | +# Series.groupby |
| 389 | +# ============== |
| 390 | + |
| 391 | + |
359 | 392 | def test_series_groupby_agg_string(scalars_df_index, scalars_pandas_df_index):
|
360 | 393 | bf_result = (
|
361 | 394 | scalars_df_index["int64_col"]
|
@@ -392,21 +425,49 @@ def test_series_groupby_agg_list(scalars_df_index, scalars_pandas_df_index):
|
392 | 425 | )
|
393 | 426 |
|
394 | 427 |
|
395 |
| -def test_dataframe_groupby_nonnumeric_with_mean(): |
396 |
| - df = pd.DataFrame( |
397 |
| - { |
398 |
| - "key1": ["a", "a", "a", "b"], |
399 |
| - "key2": ["a", "a", "c", "c"], |
400 |
| - "key3": [1, 2, 3, 4], |
401 |
| - "key4": [1.6, 2, 3, 4], |
402 |
| - } |
| 428 | +def test_series_groupby_kurt(scalars_df_index, scalars_pandas_df_index): |
| 429 | + bf_result = ( |
| 430 | + scalars_df_index["int64_too"] |
| 431 | + .groupby(scalars_df_index["bool_col"]) |
| 432 | + .kurt() |
| 433 | + .to_pandas() |
| 434 | + ) |
| 435 | + # Pandas doesn't have groupby.kurt yet: https://github.com/pandas-dev/pandas/issues/40139 |
| 436 | + pd_result = scalars_pandas_df_index.groupby("bool_col")["int64_too"].apply( |
| 437 | + pd.Series.kurt |
403 | 438 | )
|
404 |
| - pd_result = df.groupby(["key1", "key2"]).mean() |
405 |
| - bf_result = bpd.DataFrame(df).groupby(["key1", "key2"]).mean().to_pandas() |
406 | 439 |
|
407 |
| - pd.testing.assert_frame_equal( |
408 |
| - pd_result, bf_result, check_index_type=False, check_dtype=False |
| 440 | + pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) |
| 441 | + |
| 442 | + |
| 443 | +def test_series_groupby_size(scalars_df_index, scalars_pandas_df_index): |
| 444 | + bf_result = ( |
| 445 | + scalars_df_index["int64_too"].groupby(scalars_df_index["bool_col"]).size() |
409 | 446 | )
|
| 447 | + pd_result = ( |
| 448 | + scalars_pandas_df_index["int64_too"] |
| 449 | + .groupby(scalars_pandas_df_index["bool_col"]) |
| 450 | + .size() |
| 451 | + ) |
| 452 | + bf_result_computed = bf_result.to_pandas() |
| 453 | + |
| 454 | + pd.testing.assert_series_equal(pd_result, bf_result_computed, check_dtype=False) |
| 455 | + |
| 456 | + |
| 457 | +def test_series_groupby_skew(scalars_df_index, scalars_pandas_df_index): |
| 458 | + bf_result = ( |
| 459 | + scalars_df_index["int64_too"] |
| 460 | + .groupby(scalars_df_index["bool_col"]) |
| 461 | + .skew() |
| 462 | + .to_pandas() |
| 463 | + ) |
| 464 | + pd_result = ( |
| 465 | + scalars_pandas_df_index["int64_too"] |
| 466 | + .groupby(scalars_pandas_df_index["bool_col"]) |
| 467 | + .skew() |
| 468 | + ) |
| 469 | + |
| 470 | + pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) |
410 | 471 |
|
411 | 472 |
|
412 | 473 | @pytest.mark.parametrize(
|
|
0 commit comments