Skip to content

Commit 5edcd19

Browse files
feat: add unstack to series, add level param (#115)
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes #<issue_number_goes_here> 🦕
1 parent 8e44518 commit 5edcd19

File tree

7 files changed

+112
-48
lines changed

7 files changed

+112
-48
lines changed

bigframes/core/blocks.py

+23-3
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,10 @@
6767
_MONOTONIC_DECREASING = "monotonic_decreasing"
6868

6969

70+
LevelType = typing.Union[str, int]
71+
LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]]
72+
73+
7074
class BlockHolder(typing.Protocol):
7175
"""Interface for mutable objects with state represented by a block value object."""
7276

@@ -1423,9 +1427,7 @@ def _get_unique_values(
14231427
raise ValueError(f"Too many unique values: {pd_values}")
14241428

14251429
if len(columns) > 1:
1426-
return pd.MultiIndex.from_frame(
1427-
pd_values.sort_values(by=list(pd_values.columns), na_position="first")
1428-
)
1430+
return pd.MultiIndex.from_frame(pd_values)
14291431
else:
14301432
return pd.Index(pd_values.squeeze(axis=1).sort_values(na_position="first"))
14311433

@@ -1611,6 +1613,24 @@ def cached(self) -> Block:
16111613
index_labels=self.index_labels,
16121614
)
16131615

1616+
def resolve_index_level(self, level: LevelsType) -> typing.Sequence[str]:
    """Translate index level reference(s) into index column ids.

    Args:
        level:
            A single level reference or a list-like of references. An
            ``int`` is used as a position into ``self.index_columns``;
            any other hashable value is looked up as a level name in
            ``self.index_name_to_col_id``.

    Returns:
        The resolved index column ids, in the order the references were
        given.

    Raises:
        IndexError: If an integer position is out of range for the index.
        ValueError: If a level name matches no level or more than one
            level, or if a reference is neither an int nor hashable.
    """
    levels = list(level) if utils.is_list_like(level) else [level]

    resolved_level_ids = []
    for level_ref in levels:
        if isinstance(level_ref, int):
            index_columns = self.index_columns
            try:
                resolved_level_ids.append(index_columns[level_ref])
            except IndexError as err:
                # Keep the exception type, but say which position failed
                # and how many levels exist instead of surfacing the bare
                # sequence-indexing message.
                raise IndexError(
                    f"Level position {level_ref} is out of range for an "
                    f"index with {len(index_columns)} level(s)"
                ) from err
        elif isinstance(level_ref, typing.Hashable):
            matching_ids = self.index_name_to_col_id.get(level_ref, [])
            # Exactly one match is required: zero means an unknown name,
            # more than one means the name is shared by several levels.
            if len(matching_ids) != 1:
                raise ValueError("level name cannot be found or is ambiguous")
            resolved_level_ids.append(matching_ids[0])
        else:
            raise ValueError(f"Unexpected level: {level_ref}")
    return resolved_level_ids
1633+
16141634
def _is_monotonic(
16151635
self, column_ids: typing.Union[str, Sequence[str]], increasing: bool
16161636
) -> bool:

bigframes/dataframe.py

+11-21
Original file line numberDiff line numberDiff line change
@@ -1038,22 +1038,7 @@ def reorder_levels(self, order: LevelsType, axis: int | str = 0):
10381038
raise ValueError("Columns must be a multiindex to reorder levels.")
10391039

10401040
def _resolve_levels(self, level: LevelsType) -> typing.Sequence[str]:
1041-
if utils.is_list_like(level):
1042-
levels = list(level)
1043-
else:
1044-
levels = [level]
1045-
resolved_level_ids = []
1046-
for level_ref in levels:
1047-
if isinstance(level_ref, int):
1048-
resolved_level_ids.append(self._block.index_columns[level_ref])
1049-
elif isinstance(level_ref, typing.Hashable):
1050-
matching_ids = self._block.index_name_to_col_id.get(level_ref, [])
1051-
if len(matching_ids) != 1:
1052-
raise ValueError("level name cannot be found or is ambiguous")
1053-
resolved_level_ids.append(matching_ids[0])
1054-
else:
1055-
raise ValueError(f"Unexpected level: {level_ref}")
1056-
return resolved_level_ids
1041+
return self._block.resolve_index_level(level)
10571042

10581043
def rename(self, *, columns: Mapping[blocks.Label, blocks.Label]) -> DataFrame:
10591044
block = self._block.rename(columns=columns)
@@ -1802,20 +1787,25 @@ def _stack_multi(self, level: LevelsType = -1):
18021787
block = block.stack(levels=len(level))
18031788
return DataFrame(block)
18041789

1805-
def unstack(self):
1790+
def unstack(self, level: LevelsType = -1):
1791+
if isinstance(level, int) or isinstance(level, str):
1792+
level = [level]
1793+
18061794
block = self._block
18071795
# Special case, unstack with mono-index transpose into a series
18081796
if self.index.nlevels == 1:
18091797
block = block.stack(how="right", levels=self.columns.nlevels)
18101798
return bigframes.series.Series(block)
18111799

1812-
# Pivot by last level of index
1813-
index_ids = block.index_columns
1800+
# Pivot by index levels
1801+
unstack_ids = self._resolve_levels(level)
18141802
block = block.reset_index(drop=False)
1815-
block = block.set_index(index_ids[:-1])
1803+
block = block.set_index(
1804+
[col for col in self._block.index_columns if col not in unstack_ids]
1805+
)
18161806

18171807
pivot_block = block.pivot(
1818-
columns=[index_ids[-1]],
1808+
columns=unstack_ids,
18191809
values=self._block.value_columns,
18201810
values_in_index=True,
18211811
)

bigframes/series.py

+24-16
Original file line numberDiff line numberDiff line change
@@ -352,22 +352,7 @@ def reorder_levels(self, order: LevelsType, axis: int | str = 0):
352352
return Series(self._block.reorder_levels(resolved_level_ids))
353353

354354
def _resolve_levels(self, level: LevelsType) -> typing.Sequence[str]:
355-
if _is_list_like(level):
356-
levels = list(level)
357-
else:
358-
levels = [level]
359-
resolved_level_ids = []
360-
for level_ref in levels:
361-
if isinstance(level_ref, int):
362-
resolved_level_ids.append(self._block.index_columns[level_ref])
363-
elif isinstance(level_ref, typing.Hashable):
364-
matching_ids = self._block.index_name_to_col_id.get(level_ref, [])
365-
if len(matching_ids) != 1:
366-
raise ValueError("level name cannot be found or is ambiguous")
367-
resolved_level_ids.append(matching_ids[0])
368-
else:
369-
raise ValueError(f"Unexpected level: {level_ref}")
370-
return resolved_level_ids
355+
return self._block.resolve_index_level(level)
371356

372357
def between(self, left, right, inclusive="both"):
373358
if inclusive not in ["both", "neither", "left", "right"]:
@@ -918,6 +903,29 @@ def argmin(self) -> int:
918903
scalars.Scalar, Series(block.select_column(row_nums)).iloc[0]
919904
)
920905

906+
def unstack(self, level: LevelsType = -1):
    """Pivot the given index level(s) of this Series into columns.

    Args:
        level:
            Position, name, or list of these identifying the index
            level(s) to move into the columns. Defaults to ``-1``.

    Returns:
        A ``bigframes.dataframe.DataFrame`` built from the pivoted block.

    Raises:
        ValueError: If the Series index has only one level.
    """
    if self.index.nlevels == 1:
        raise ValueError("Series must have multi-index to unstack")

    # A lone int/str reference is wrapped so it is handled like a list.
    if isinstance(level, (int, str)):
        level = [level]

    source_block = self._block
    pivot_ids = self._resolve_levels(level)

    flattened = source_block.reset_index(drop=False)
    # Index levels that are not being pivoted remain as the row index.
    remaining_ids = [
        col for col in source_block.index_columns if col not in pivot_ids
    ]
    reindexed = flattened.set_index(remaining_ids)

    return bigframes.dataframe.DataFrame(
        reindexed.pivot(
            columns=pivot_ids,
            values=source_block.value_columns,
            values_in_index=False,
        )
    )
928+
921929
def idxmax(self) -> blocks.Label:
922930
block = self._block.order_by(
923931
[

tests/system/conftest.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -400,7 +400,11 @@ def hockey_df(
400400
hockey_table_id: str, session: bigframes.Session
401401
) -> bigframes.dataframe.DataFrame:
402402
"""DataFrame pointing at test data."""
403-
return session.read_gbq(hockey_table_id)
403+
return (
404+
session.read_gbq(hockey_table_id)
405+
.set_index(["player_name", "season"])
406+
.sort_index()
407+
)
404408

405409

406410
@pytest.fixture(scope="session")
@@ -419,7 +423,7 @@ def hockey_pandas_df() -> pd.DataFrame:
419423
"season": pd.Int64Dtype(),
420424
},
421425
)
422-
df.index = df.index.astype("Int64")
426+
df = df.set_index(["player_name", "season"]).sort_index()
423427
return df
424428

425429

tests/system/small/test_dataframe.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -1949,8 +1949,14 @@ def test_df_pivot(scalars_dfs, values, index, columns):
19491949
],
19501950
)
19511951
def test_df_pivot_hockey(hockey_df, hockey_pandas_df, values, index, columns):
1952-
bf_result = hockey_df.pivot(values=values, index=index, columns=columns).to_pandas()
1953-
pd_result = hockey_pandas_df.pivot(values=values, index=index, columns=columns)
1952+
bf_result = (
1953+
hockey_df.reset_index()
1954+
.pivot(values=values, index=index, columns=columns)
1955+
.to_pandas()
1956+
)
1957+
pd_result = hockey_pandas_df.reset_index().pivot(
1958+
values=values, index=index, columns=columns
1959+
)
19541960

19551961
# Pandas produces NaN, where bq dataframes produces pd.NA
19561962
pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)

tests/system/small/test_multiindex.py

+27-4
Original file line numberDiff line numberDiff line change
@@ -909,13 +909,36 @@ def test_column_multi_index_reorder_levels(scalars_df_index, scalars_pandas_df_i
909909
pandas.testing.assert_frame_equal(bf_result, pd_result)
910910

911911

912-
def test_multi_index_unstack(hockey_df, hockey_pandas_df):
912+
@pytest.mark.parametrize(
913+
("level",),
914+
[(["position", "team_name"],), ([-2, -1],), (["position"],), ("season",), (-3,)],
915+
)
916+
def test_df_multi_index_unstack(hockey_df, hockey_pandas_df, level):
913917
bf_result = (
914-
hockey_df.set_index(["team_name", "season", "position"]).unstack().to_pandas()
918+
hockey_df.set_index(["team_name", "position"], append=True)
919+
.unstack(level=level)
920+
.to_pandas()
915921
)
916922
pd_result = hockey_pandas_df.set_index(
917-
["team_name", "season", "position"]
918-
).unstack()
923+
["team_name", "position"], append=True
924+
).unstack(level=level)
925+
926+
pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
927+
928+
929+
@pytest.mark.parametrize(
    ("level",),
    [(["position", "team_name"],), ([-2, -1],), (["position"],), ("season",), (-3,)],
)
def test_series_multi_index_unstack(hockey_df, hockey_pandas_df, level):
    """Check Series.unstack against pandas for several level selectors."""
    # Both fixtures get the same two extra index levels appended before the
    # "number" column is selected as the Series under test.
    bf_series = hockey_df.set_index(["team_name", "position"], append=True)["number"]
    pd_series = hockey_pandas_df.set_index(["team_name", "position"], append=True)[
        "number"
    ]

    bf_result = bf_series.unstack(level=level).to_pandas()
    pd_result = pd_series.unstack(level=level)

    pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
921944

third_party/bigframes_vendored/pandas/core/series.py

+13
Original file line numberDiff line numberDiff line change
@@ -1654,6 +1654,19 @@ def clip(self):
16541654
"""
16551655
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
16561656

1657+
def unstack(self, level=-1):
    """
    Unstack, also known as pivot, Series with MultiIndex to produce DataFrame.

    Args:
        level (int, str, or list of these, default -1):
            Level(s) of the index to unstack; a level can be given by
            position or by name. The default, -1, selects the last level.

    Returns:
        DataFrame: Unstacked Series.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
1669+
16571670
def argmax(self):
16581671
"""
16591672
Return int position of the largest value in the Series.

0 commit comments

Comments
 (0)