diff --git a/bigframes/core/indexes/__init__.py b/bigframes/core/indexes/__init__.py index ae6011ffa5..0a95adcd83 100644 --- a/bigframes/core/indexes/__init__.py +++ b/bigframes/core/indexes/__init__.py @@ -13,7 +13,9 @@ # limitations under the License. from bigframes.core.indexes.base import Index +from bigframes.core.indexes.multi import MultiIndex __all__ = [ "Index", + "MultiIndex", ] diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index daa52a02b9..46a9e30637 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -42,9 +42,15 @@ class Index(vendored_pandas_index.Index): __doc__ = vendored_pandas_index.Index.__doc__ - - def __init__( - self, + _query_job = None + _block: blocks.Block + _linked_frame: Union[ + bigframes.dataframe.DataFrame, bigframes.series.Series, None + ] = None + + # Override __new__ to create subclasses, like pandas does + def __new__( + cls, data=None, dtype=None, *, @@ -73,18 +79,30 @@ def __init__( if dtype is not None: index = index.astype(dtype) block = index._block + elif isinstance(data, pandas.Index): + pd_df = pandas.DataFrame(index=data) + block = df.DataFrame(pd_df, session=session)._block else: pd_index = pandas.Index(data=data, dtype=dtype, name=name) pd_df = pandas.DataFrame(index=pd_index) block = df.DataFrame(pd_df, session=session)._block - self._query_job = None - self._block: blocks.Block = block + + # TODO: Support more index subtypes + from bigframes.core.indexes.multi import MultiIndex + + klass = MultiIndex if len(block._index_columns) > 1 else cls + result = typing.cast(Index, object.__new__(klass)) + result._query_job = None + result._block = block + return result @classmethod def from_frame( cls, frame: Union[bigframes.series.Series, bigframes.dataframe.DataFrame] ) -> Index: - return FrameIndex(frame) + index = Index(frame._block) + index._linked_frame = frame + return index @property def name(self) -> blocks.Label: @@ -107,6 +125,10 @@ def names(self) -> 
typing.Sequence[blocks.Label]: @names.setter def names(self, values: typing.Sequence[blocks.Label]): new_block = self._block.with_index_labels(values) + if self._linked_frame is not None: + self._linked_frame._set_block( + self._linked_frame._block.with_index_labels(values) + ) self._block = new_block @property @@ -452,26 +474,3 @@ def to_numpy(self, dtype=None, **kwargs) -> np.ndarray: def __len__(self): return self.shape[0] - - -# Index that mutates the originating dataframe/series -class FrameIndex(Index): - def __init__( - self, - series_or_dataframe: typing.Union[ - bigframes.series.Series, bigframes.dataframe.DataFrame - ], - ): - super().__init__(series_or_dataframe._block) - self._whole_frame = series_or_dataframe - - @property - def names(self) -> typing.Sequence[blocks.Label]: - """Returns the names of the Index.""" - return self._block._index_labels - - @names.setter - def names(self, values: typing.Sequence[blocks.Label]): - new_block = self._whole_frame._get_block().with_index_labels(values) - self._whole_frame._set_block(new_block) - self._block = new_block diff --git a/bigframes/core/indexes/multi.py b/bigframes/core/indexes/multi.py new file mode 100644 index 0000000000..182d1f101c --- /dev/null +++ b/bigframes/core/indexes/multi.py @@ -0,0 +1,48 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +from typing import cast, Hashable, Iterable, Sequence + +import bigframes_vendored.pandas.core.indexes.multi as vendored_pandas_multindex +import pandas + +from bigframes.core.indexes.base import Index + + +class MultiIndex(Index, vendored_pandas_multindex.MultiIndex): + __doc__ = vendored_pandas_multindex.MultiIndex.__doc__ + + @classmethod + def from_tuples( + cls, + tuples: Iterable[tuple[Hashable, ...]], + sortorder: int | None = None, + names: Sequence[Hashable] | Hashable | None = None, + ) -> MultiIndex: + pd_index = pandas.MultiIndex.from_tuples(tuples, sortorder, names) + # Index.__new__ should detect multiple levels and properly create a multiindex + return cast(MultiIndex, Index(pd_index)) + + @classmethod + def from_arrays( + cls, + arrays, + sortorder: int | None = None, + names=None, + ) -> MultiIndex: + pd_index = pandas.MultiIndex.from_arrays(arrays, sortorder, names) + # Index.__new__ should detect multiple levels and properly create a multiindex + return cast(MultiIndex, Index(pd_index)) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 4b0ac4310c..f5be4421e4 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -707,6 +707,7 @@ def to_datetime( # checking and docstrings. 
DataFrame = bigframes.dataframe.DataFrame Index = bigframes.core.indexes.Index +MultiIndex = bigframes.core.indexes.MultiIndex Series = bigframes.series.Series # Other public pandas attributes @@ -760,6 +761,7 @@ def to_datetime( # Class aliases "DataFrame", "Index", + "MultiIndex", "Series", # Other public pandas attributes "NamedAgg", diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 330fe44eb8..bb0af52976 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -20,6 +20,31 @@ from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas +def test_multi_index_from_arrays(): + bf_idx = bpd.MultiIndex.from_arrays( + [ + pandas.Index([4, 99], dtype=pandas.Int64Dtype()), + pandas.Index( + [" Hello, World!", "_some_new_string"], + dtype=pandas.StringDtype(storage="pyarrow"), + ), + ], + names=[" 1index 1", "_1index 2"], + ) + pd_idx = pandas.MultiIndex.from_arrays( + [ + pandas.Index([4, 99], dtype=pandas.Int64Dtype()), + pandas.Index( + [" Hello, World!", "_some_new_string"], + dtype=pandas.StringDtype(storage="pyarrow"), + ), + ], + names=[" 1index 1", "_1index 2"], + ) + assert bf_idx.names == pd_idx.names + pandas.testing.assert_index_equal(bf_idx.to_pandas(), pd_idx) + + @skip_legacy_pandas def test_read_pandas_multi_index_axes(): index = pandas.MultiIndex.from_arrays( diff --git a/third_party/bigframes_vendored/pandas/core/indexes/multi.py b/third_party/bigframes_vendored/pandas/core/indexes/multi.py new file mode 100644 index 0000000000..a882aa40e3 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/indexes/multi.py @@ -0,0 +1,88 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/indexes/multi.py +from __future__ import annotations + +from typing import Hashable, Iterable, Sequence + +import bigframes_vendored.pandas.core.indexes.base + +from bigframes import constants + + +class 
MultiIndex(bigframes_vendored.pandas.core.indexes.base.Index): + """ + A multi-level, or hierarchical, index object for pandas objects. + """ + + @classmethod + def from_tuples( + cls, + tuples: Iterable[tuple[Hashable, ...]], + sortorder: int | None = None, + names: Sequence[Hashable] | Hashable | None = None, + ) -> MultiIndex: + """ + Convert list of tuples to MultiIndex. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> tuples = [(1, 'red'), (1, 'blue'), + ... (2, 'red'), (2, 'blue')] + >>> bpd.MultiIndex.from_tuples(tuples, names=('number', 'color')) + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], + names=['number', 'color']) + + Args: + tuples (list / sequence of tuple-likes): + Each tuple is the index of one row/column. + sortorder (int or None): + Level of sortedness (must be lexicographically sorted by that + level). + names (list / sequence of str, optional): + Names for the levels in the index. + + Returns: + MultiIndex + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @classmethod + def from_arrays( + cls, + arrays, + sortorder: int | None = None, + names=None, + ) -> MultiIndex: + """ + Convert arrays to MultiIndex. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] + >>> bpd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], + names=['number', 'color']) + + Args: + arrays (list / sequence of array-likes): + Each array-like gives one level's value for each data point. + len(arrays) is the number of levels. + sortorder (int or None): + Level of sortedness (must be lexicographically sorted by that + level). + names (list / sequence of str, optional): + Names for the levels in the index. 
+ + Returns: + MultiIndex + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)