feat: Add transpose support for small homogeneously typed DataFrames. (#621)

TrevorBergeron · web-flow · commit 054075d448f7 · 2024-04-22T20:20:09.000-05:00
diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
@@ -35,6 +35,7 @@
 import pyarrow as pa
 
 import bigframes._config.sampling_options as sampling_options
+import bigframes.constants
 import bigframes.constants as constants
 import bigframes.core as core
 import bigframes.core.expression as ex
@@ -1542,6 +1543,10 @@ def melt(
         var_names=typing.Sequence[typing.Hashable],
         value_name: typing.Hashable = "value",
     ):
+        """
+        Unpivot columns to produce longer, narrower dataframe.
+        Arguments correspond to pandas.melt arguments.
+        """
         # TODO: Implement col_level and ignore_index
         unpivot_col_id = guid.generate_guid()
         var_col_ids = tuple([guid.generate_guid() for _ in var_names])
@@ -1570,6 +1575,52 @@ def melt(
             index_columns=[index_id],
         )
 
+    def transpose(self) -> Block:
+        """Transpose the block. Will fail if dtypes aren't coercible to a common type or too many rows"""
+        original_col_index = self.column_labels
+        original_row_index = self.index.to_pandas()
+        original_row_count = len(original_row_index)
+        if original_row_count > bigframes.constants.MAX_COLUMNS:
+            raise NotImplementedError(
+                f"Object has {original_row_count} rows and is too large to transpose."
+            )
+
+        # Add row numbers to both axes to disambiguate, clean them up later
+        block = self
+        numbered_block = block.with_column_labels(
+            utils.combine_indices(
+                block.column_labels, pd.Index(range(len(block.column_labels)))
+            )
+        )
+        numbered_block, offsets = numbered_block.promote_offsets()
+
+        stacked_block = numbered_block.melt(
+            id_vars=(offsets,),
+            var_names=(
+                *[name for name in original_col_index.names],
+                "col_offset",
+            ),
+            value_vars=block.value_columns,
+        )
+        col_labels = stacked_block.value_columns[-2 - original_col_index.nlevels : -2]
+        col_offset = stacked_block.value_columns[-2]  # disambiguator we created earlier
+        cell_values = stacked_block.value_columns[-1]
+        # Groupby source column
+        stacked_block = stacked_block.set_index(
+            [*col_labels, col_offset]
+        )  # col index is now row index
+        result = stacked_block.pivot(
+            columns=[offsets],
+            values=[cell_values],
+            columns_unique_values=tuple(range(original_row_count)),
+        )
+        # Drop the offsets from both axes before returning
+        return (
+            result.with_column_labels(original_row_index)
+            .order_by([ordering.ascending_over(result.index_columns[-1])])
+            .drop_levels([result.index_columns[-1]])
+        )
+
     def _create_stack_column(
         self, col_label: typing.Tuple, stack_labels: typing.Sequence[typing.Tuple]
     ):
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -314,6 +314,13 @@ def bqclient(self) -> bigframes.Session:
     def _session(self) -> bigframes.Session:
         return self._get_block().expr.session
 
+    @property
+    def T(self) -> DataFrame:
+        return DataFrame(self._get_block().transpose())
+
+    def transpose(self) -> DataFrame:
+        return self.T
+
     def __len__(self):
         rows, _ = self.shape
         return rows
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
@@ -2465,6 +2465,29 @@ def test_df_describe(scalars_dfs):
     ).all()
 
 
+def test_df_transpose():
+    # Include some floats to ensure type coercion
+    values = [[0, 3.5, True], [1, 4.5, False], [2, 6.5, None]]
+    # Test complex case of both axes being multi-indices with non-unique elements
+    columns = pd.Index(["A", "B", "A"], dtype=pd.StringDtype(storage="pyarrow"))
+    columns_multi = pd.MultiIndex.from_arrays([columns, columns], names=["c1", "c2"])
+    index = pd.Index(["b", "a", "a"], dtype=pd.StringDtype(storage="pyarrow"))
+    rows_multi = pd.MultiIndex.from_arrays([index, index], names=["r1", "r2"])
+
+    pd_df = pandas.DataFrame(values, index=rows_multi, columns=columns_multi)
+    bf_df = dataframe.DataFrame(values, index=rows_multi, columns=columns_multi)
+
+    pd_result = pd_df.T
+    bf_result = bf_df.T.to_pandas()
+
+    pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False)
+
+
+def test_df_transpose_error():
+    with pytest.raises(TypeError, match="Cannot coerce.*to a common type."):
+        dataframe.DataFrame([[1, "hello"], [2, "world"]]).transpose()
+
+
 @pytest.mark.parametrize(
     ("ordered"),
     [
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -93,6 +93,88 @@ def values(self) -> np.ndarray:
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    @property
+    def T(self) -> DataFrame:
+        """
+        The transpose of the DataFrame.
+
+        All columns must be the same dtype (numerics can be coerced to a common supertype).
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+            >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
+            >>> df
+               col1  col2
+            0     1     3
+            1     2     4
+            <BLANKLINE>
+            [2 rows x 2 columns]
+
+            >>> df.T
+                  0  1
+            col1  1  2
+            col2  3  4
+            <BLANKLINE>
+            [2 rows x 2 columns]
+
+        Returns:
+            DataFrame: The transposed DataFrame.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+    def transpose(self) -> DataFrame:
+        """
+        Transpose index and columns.
+
+        Reflect the DataFrame over its main diagonal by writing rows as columns
+        and vice-versa. The property :attr:`.T` is an accessor to the method
+        :meth:`transpose`.
+
+        All columns must be the same dtype (numerics can be coerced to a common supertype).
+
+        **Examples:**
+
+            **Square DataFrame with homogeneous dtype**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
+            >>> df1 = bpd.DataFrame(data=d1)
+            >>> df1
+               col1  col2
+            0     1     3
+            1     2     4
+            <BLANKLINE>
+            [2 rows x 2 columns]
+
+            >>> df1_transposed = df1.T  # or df1.transpose()
+            >>> df1_transposed
+                  0  1
+            col1  1  2
+            col2  3  4
+            <BLANKLINE>
+            [2 rows x 2 columns]
+
+            When the dtype is homogeneous in the original DataFrame, we get a
+            transposed DataFrame with the same dtype:
+
+            >>> df1.dtypes
+            col1    Int64
+            col2    Int64
+            dtype: object
+            >>> df1_transposed.dtypes
+            0    Int64
+            1    Int64
+            dtype: object
+
+        Returns:
+            DataFrame: The transposed DataFrame.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def info(
         self,
         verbose: bool | None = None,