Skip to content

Commit 054075d

Browse files
feat: Add transpose support for small homogeneously typed DataFrames. (#621)
1 parent 518d315 commit 054075d

File tree

4 files changed

+163
-0
lines changed

4 files changed

+163
-0
lines changed

bigframes/core/blocks.py

+51
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
import pyarrow as pa
3636

3737
import bigframes._config.sampling_options as sampling_options
38+
import bigframes.constants
3839
import bigframes.constants as constants
3940
import bigframes.core as core
4041
import bigframes.core.expression as ex
@@ -1542,6 +1543,10 @@ def melt(
15421543
var_names=typing.Sequence[typing.Hashable],
15431544
value_name: typing.Hashable = "value",
15441545
):
1546+
"""
1547+
Unpivot columns to produce longer, narrower dataframe.
1548+
Arguments correspond to pandas.melt arguments.
1549+
"""
15451550
# TODO: Implement col_level and ignore_index
15461551
unpivot_col_id = guid.generate_guid()
15471552
var_col_ids = tuple([guid.generate_guid() for _ in var_names])
@@ -1570,6 +1575,52 @@ def melt(
15701575
index_columns=[index_id],
15711576
)
15721577

1578+
def transpose(self) -> Block:
    """Swap the row and column axes of this block.

    The row index is materialized locally with ``to_pandas()`` because its
    labels become the column labels of the result. The values are then
    melted into long form and pivoted back with the axes exchanged.

    As stated by the original implementation, this fails when the column
    dtypes are not coercible to a common type; that failure comes from the
    operations called here, not from an explicit check in this method.

    Returns:
        Block: The transposed block.

    Raises:
        NotImplementedError: If the block has more rows than
            ``bigframes.constants.MAX_COLUMNS``.
    """
    source_col_labels = self.column_labels
    source_row_labels = self.index.to_pandas()
    row_count = len(source_row_labels)
    if row_count > bigframes.constants.MAX_COLUMNS:
        raise NotImplementedError(
            f"Object has {row_count} rows and is too large to transpose."
        )

    # Labels on either axis may repeat, so each column label is paired with
    # its position and each row gets an offset; both are removed at the end.
    column_positions = pd.Index(range(len(self.column_labels)))
    positioned_labels = utils.combine_indices(self.column_labels, column_positions)
    working_block, row_offsets = self.with_column_labels(
        positioned_labels
    ).promote_offsets()

    melt_var_names = (*source_col_labels.names, "col_offset")
    long_block = working_block.melt(
        id_vars=(row_offsets,),
        var_names=melt_var_names,
        value_vars=self.value_columns,
    )

    # The trailing value columns of the melted block are read as:
    # [<one per original column level>..., col_offset, cell values].
    long_value_cols = long_block.value_columns
    level_count = source_col_labels.nlevels
    label_cols = long_value_cols[-2 - level_count : -2]
    offset_col = long_value_cols[-2]  # positional disambiguator added above
    cell_col = long_value_cols[-1]

    # The former column index (plus its offset) becomes the row index.
    long_block = long_block.set_index([*label_cols, offset_col])
    wide_block = long_block.pivot(
        columns=[row_offsets],
        values=[cell_col],
        columns_unique_values=tuple(range(row_count)),
    )

    # Restore the original row labels as column labels, order rows by the
    # original column position, then drop that positional index level.
    offset_level = wide_block.index_columns[-1]
    relabeled = wide_block.with_column_labels(source_row_labels)
    in_column_order = relabeled.order_by([ordering.ascending_over(offset_level)])
    return in_column_order.drop_levels([offset_level])
1623+
15731624
def _create_stack_column(
15741625
self, col_label: typing.Tuple, stack_labels: typing.Sequence[typing.Tuple]
15751626
):

bigframes/dataframe.py

+7
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,13 @@ def bqclient(self) -> bigframes.Session:
314314
def _session(self) -> bigframes.Session:
    """Return the session attached to the expression of the underlying block."""
    block = self._get_block()
    return block.expr.session
316316

317+
@property
def T(self) -> DataFrame:
    """Return a new DataFrame built from the transposed underlying block."""
    transposed_block = self._get_block().transpose()
    return DataFrame(transposed_block)
320+
321+
def transpose(self) -> DataFrame:
    """Return the transposed DataFrame; method form of the ``T`` attribute."""
    return self.T
323+
317324
def __len__(self):
    """Return the number of rows, i.e. the first element of ``self.shape``."""
    row_count, _ = self.shape
    return row_count

tests/system/small/test_dataframe.py

+23
Original file line numberDiff line numberDiff line change
@@ -2465,6 +2465,29 @@ def test_df_describe(scalars_dfs):
24652465
).all()
24662466

24672467

2468+
def test_df_transpose():
    """Transpose should match pandas when both axes are non-unique MultiIndexes."""
    string_dtype = pd.StringDtype(storage="pyarrow")

    # Ints, floats and bools/None are mixed so the columns need type coercion.
    cell_values = [[0, 3.5, True], [1, 4.5, False], [2, 6.5, None]]

    # Hardest case: both axes are multi-indices containing repeated labels.
    col_level = pd.Index(["A", "B", "A"], dtype=string_dtype)
    col_index = pd.MultiIndex.from_arrays([col_level, col_level], names=["c1", "c2"])
    row_level = pd.Index(["b", "a", "a"], dtype=string_dtype)
    row_index = pd.MultiIndex.from_arrays([row_level, row_level], names=["r1", "r2"])

    expected = pd.DataFrame(cell_values, index=row_index, columns=col_index).T
    actual = dataframe.DataFrame(
        cell_values, index=row_index, columns=col_index
    ).T.to_pandas()

    pd.testing.assert_frame_equal(expected, actual, check_dtype=False)
2484+
2485+
2486+
def test_df_transpose_error():
    """Transposing columns with no common type should raise a TypeError."""
    expected_message = "Cannot coerce.*to a common type."
    with pytest.raises(TypeError, match=expected_message):
        # Construction stays inside the block, as in the original test.
        dataframe.DataFrame([[1, "hello"], [2, "world"]]).transpose()
2489+
2490+
24682491
@pytest.mark.parametrize(
24692492
("ordered"),
24702493
[

third_party/bigframes_vendored/pandas/core/frame.py

+82
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,88 @@ def values(self) -> np.ndarray:
9393
"""
9494
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
9595

96+
@property
def T(self) -> DataFrame:
    """
    The transpose of the DataFrame.

    All columns must be the same dtype (numerics can be coerced to a common supertype).

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> bpd.options.display.progress_bar = None
        >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df
           col1  col2
        0     1     3
        1     2     4
        <BLANKLINE>
        [2 rows x 2 columns]

        >>> df.T
              0  1
        col1  1  2
        col2  3  4
        <BLANKLINE>
        [2 rows x 2 columns]

    Returns:
        DataFrame: The transposed DataFrame.
    """
    # Abstract stub: this definition only carries the documentation.
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
126+
127+
def transpose(self) -> DataFrame:
    """
    Transpose index and columns.

    Reflect the DataFrame over its main diagonal by writing rows as columns
    and vice-versa. The property :attr:`.T` is an accessor to the method
    :meth:`transpose`.

    All columns must be the same dtype (numerics can be coerced to a common supertype).

    **Examples:**

    **Square DataFrame with homogeneous dtype**

        >>> import bigframes.pandas as bpd
        >>> bpd.options.display.progress_bar = None

        >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
        >>> df1 = bpd.DataFrame(data=d1)
        >>> df1
           col1  col2
        0     1     3
        1     2     4
        <BLANKLINE>
        [2 rows x 2 columns]

        >>> df1_transposed = df1.T  # or df1.transpose()
        >>> df1_transposed
              0  1
        col1  1  2
        col2  3  4
        <BLANKLINE>
        [2 rows x 2 columns]

    When the dtype is homogeneous in the original DataFrame, we get a
    transposed DataFrame with the same dtype:

        >>> df1.dtypes
        col1    Int64
        col2    Int64
        dtype: object
        >>> df1_transposed.dtypes
        0    Int64
        1    Int64
        dtype: object

    Returns:
        DataFrame: The transposed DataFrame.
    """
    # Abstract stub: this definition only carries the documentation.
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
177+
96178
def info(
97179
self,
98180
verbose: bool | None = None,

0 commit comments

Comments
 (0)