Skip to content

Commit 7dd304c

Browse files
mattyoplMatthew Laurence Chen
and
Matthew Laurence Chen
authored
feat: recover struct column from exploded Series (#904)
* feat: recover struct columns from exploded Series Fixes #357588049 internal 🦕 --------- Co-authored-by: Matthew Laurence Chen <[email protected]>
1 parent 46f2dd7 commit 7dd304c

File tree

4 files changed

+129
-0
lines changed

4 files changed

+129
-0
lines changed

bigframes/bigquery/__init__.py

+34
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,40 @@ def json_extract_array(
271271
return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path))
272272

273273

274+
def struct(value: dataframe.DataFrame) -> series.Series:
    """Pack the rows of a DataFrame into a Series of structs.

    Every row of ``value`` becomes one struct entry in the result, and every
    DataFrame column becomes a struct field named after that column's label.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> import bigframes.series as series
        >>> bpd.options.display.progress_bar = None

        >>> srs = series.Series([{"version": 1, "project": "pandas"}, {"version": 2, "project": "numpy"},])
        >>> df = srs.struct.explode()
        >>> bbq.struct(df)
        0    {'project': 'pandas', 'version': 1}
        1     {'project': 'numpy', 'version': 2}
        dtype: struct<project: string, version: int64>[pyarrow]

    Args:
        value (bigframes.dataframe.DataFrame):
            The DataFrame to be converted to a Series of structs.

    Returns:
        bigframes.series.Series: A new Series with struct entries
        representing rows of the original DataFrame.
    """
    source_block = value._block
    # Combine all value columns in one n-ary op; the column labels become the
    # struct field names, in column order.
    packed_block, struct_column_id = source_block.apply_nary_op(
        source_block.value_columns,
        ops.StructOp(column_names=tuple(source_block.column_labels)),
    )
    # Keep only the freshly built struct column so the block is Series-shaped.
    return bigframes.series.Series(packed_block.select_column(struct_column_id))
307+
274308
# Search functions defined from
275309
# https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions
276310

bigframes/core/compile/scalar_op_compiler.py

+11
Original file line numberDiff line numberDiff line change
@@ -1539,6 +1539,17 @@ def nary_remote_function_op_impl(
15391539
return result
15401540

15411541

1542+
@scalar_op_compiler.register_nary_op(ops.StructOp, pass_op=True)
def struct_op_impl(
    *values: ibis_types.Value, op: ops.StructOp
) -> ibis_types.StructValue:
    """Compile a ``StructOp`` into an ibis struct expression.

    The value at position ``i`` becomes the struct field named
    ``op.column_names[i]``.
    """
    # Index into column_names (rather than zip) so that a surplus of values
    # still surfaces as an IndexError instead of being silently dropped.
    fields = {
        op.column_names[position]: column
        for position, column in enumerate(values)
    }
    return ibis.struct(fields)
1551+
1552+
15421553
# Helpers
15431554
def is_null(value) -> bool:
15441555
# float NaN/inf should be treated as distinct from 'true' null values

bigframes/operations/__init__.py

+23
Original file line numberDiff line numberDiff line change
@@ -867,6 +867,29 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
867867
case_when_op = CaseWhenOp()
868868

869869

870+
@dataclasses.dataclass(frozen=True)
class StructOp(NaryOp):
    """N-ary operation that packs its inputs into a single struct value.

    ``column_names`` supplies one field name per input, in input order.
    """

    name: typing.ClassVar[str] = "struct"
    # Variable-length tuple: one field name for each input of the op.
    column_names: tuple[str, ...]

    def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
        """Return the Arrow struct dtype produced for the given input types.

        Args:
            *input_types: The dtype of each input (value1, value2, ...), in
                the same order as ``column_names``.

        Returns:
            A ``pd.ArrowDtype`` wrapping a ``pa.struct`` whose fields pair
            each column name with the Arrow type of the matching input.
        """
        # value1, value2, ... must line up one-to-one with the field names.
        assert len(input_types) == len(self.column_names)

        # [(name1, type1), (name2, type2), ...]
        fields = [
            (field_name, dtypes.bigframes_dtype_to_arrow_dtype(input_type))
            for field_name, input_type in zip(self.column_names, input_types)
        ]
        return pd.ArrowDtype(pa.struct(fields))
891+
892+
870893
# Just parameterless unary ops for now
871894
# TODO: Parameter mappings
872895
NUMPY_TO_OP: typing.Final = {
+61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pandas as pd
16+
import pytest
17+
18+
import bigframes.bigquery as bbq
19+
import bigframes.series as series
20+
21+
22+
@pytest.mark.parametrize(
    "columns_arg",
    [
        # Scalar fields only.
        [
            {"version": 1, "project": "pandas"},
            {"version": 2, "project": "pandas"},
            {"version": 1, "project": "numpy"},
        ],
        # A null value in one of the fields.
        [
            {"version": 1, "project": "pandas"},
            {"version": None, "project": "pandas"},
            {"version": 1, "project": "numpy"},
        ],
        # An array-valued field whose length varies between rows.
        [
            {"array": [6, 4, 6], "project": "pandas"},
            {"array": [6, 4, 7, 6], "project": "pandas"},
            {"array": [7, 2, 3], "project": "numpy"},
        ],
        # A field holding an array of nested structs.
        [
            {"struct": [{"x": 2, "y": 4}], "project": "pandas"},
            {"struct": [{"x": 9, "y": 3}], "project": "pandas"},
            {"struct": [{"x": 1, "y": 2}], "project": "numpy"},
        ],
    ],
)
def test_struct_from_dataframe(columns_arg):
    """Exploding a struct Series and re-packing it must round-trip."""
    srs = series.Series(
        columns_arg,
    )
    # explode() splits the struct into columns; bbq.struct() should rebuild
    # the same values. Index type and dtype are not compared.
    pd.testing.assert_series_equal(
        srs.to_pandas(),
        bbq.struct(srs.struct.explode()).to_pandas(),
        check_index_type=False,
        check_dtype=False,
    )

0 commit comments

Comments
 (0)