Skip to content

Commit ef76f13

Browse files
feat: allow access of struct fields with dot operators on Series (#1019)
* feat: allow access of struct fields with dot operators for Series * fix infinite recursion of __getattr__() * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * fix typing and version * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent 2d18815 commit ef76f13

File tree

5 files changed

+148
-2
lines changed

5 files changed

+148
-2
lines changed

bigframes/series.py

+24-2
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,15 @@
2222
import numbers
2323
import textwrap
2424
import typing
25-
from typing import Any, cast, Literal, Mapping, Optional, Sequence, Tuple, Union
25+
from typing import Any, cast, List, Literal, Mapping, Optional, Sequence, Tuple, Union
2626

2727
import bigframes_vendored.constants as constants
2828
import bigframes_vendored.pandas.core.series as vendored_pandas_series
2929
import google.cloud.bigquery as bigquery
3030
import numpy
3131
import pandas
3232
import pandas.core.dtypes.common
33+
import pyarrow as pa
3334
import typing_extensions
3435

3536
import bigframes.core
@@ -181,6 +182,14 @@ def _info_axis(self) -> indexes.Index:
181182
def _session(self) -> bigframes.Session:
182183
return self._get_block().expr.session
183184

185+
@property
186+
def _struct_fields(self) -> List[str]:
187+
if not bigframes.dtypes.is_struct_like(self._dtype):
188+
return []
189+
190+
struct_type = typing.cast(pa.StructType, self._dtype.pyarrow_dtype)
191+
return [struct_type.field(i).name for i in range(struct_type.num_fields)]
192+
184193
@validations.requires_ordering()
185194
def transpose(self) -> Series:
186195
return self
@@ -1096,6 +1105,9 @@ def __pos__(self) -> Series:
10961105
def __neg__(self) -> Series:
10971106
return self._apply_unary_op(ops.neg_op)
10981107

1108+
def __dir__(self) -> List[str]:
1109+
return dir(type(self)) + self._struct_fields
1110+
10991111
def eq(self, other: object) -> Series:
11001112
# TODO: enforce stricter alignment
11011113
return self._apply_binary_op(other, ops.eq_op)
@@ -1240,7 +1252,15 @@ def __getitem__(self, indexer):
12401252
__getitem__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__getitem__)
12411253

12421254
def __getattr__(self, key: str):
1243-
if hasattr(pandas.Series, key):
1255+
# Protect against recursion errors with uninitialized Series objects.
1256+
# We use "_block" attribute to check whether the instance is initialized.
1257+
# See:
1258+
# https://github.com/googleapis/python-bigquery-dataframes/issues/728
1259+
# and
1260+
# https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html
1261+
if key == "_block":
1262+
raise AttributeError(key)
1263+
elif hasattr(pandas.Series, key):
12441264
raise AttributeError(
12451265
textwrap.dedent(
12461266
f"""
@@ -1249,6 +1269,8 @@ def __getattr__(self, key: str):
12491269
"""
12501270
)
12511271
)
1272+
elif key in self._struct_fields:
1273+
return self.struct.field(key)
12521274
else:
12531275
raise AttributeError(key)
12541276

tests/data/nested_structs.jsonl

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
{"id": 1, "person": {"name": "Alice", "age":30, "address": {"city": "New York", "country": "USA"}}}
2+
{"id": 2, "person": {"name": "Bob", "age":25, "address": {"city": "London", "country": "UK"}}}

tests/data/nested_structs_schema.json

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
[
2+
{
3+
"name": "id",
4+
"type": "INTEGER",
5+
"mode": "REQUIRED"
6+
},
7+
{
8+
"name": "person",
9+
"type": "RECORD",
10+
"fields": [
11+
{
12+
"name": "name",
13+
"type": "STRING",
14+
"mode": "NULLABLE"
15+
},
16+
{
17+
"name": "age",
18+
"type": "INTEGER",
19+
"mode": "NULLABLE"
20+
},
21+
{
22+
"name": "address",
23+
"type": "RECORD",
24+
"fields": [
25+
{
26+
"name": "city",
27+
"type": "STRING",
28+
"mode": "NULLABLE"
29+
},
30+
{
31+
"name": "country",
32+
"type": "STRING",
33+
"mode": "NULLABLE"
34+
}
35+
]
36+
}
37+
]
38+
}
39+
]

tests/system/conftest.py

+44
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
import ibis.backends
3333
import numpy as np
3434
import pandas as pd
35+
import pyarrow as pa
3536
import pytest
3637
import pytz
3738
import test_utils.prefixer
@@ -290,6 +291,7 @@ def load_test_data_tables(
290291
("scalars", "scalars_schema.json", "scalars.jsonl"),
291292
("scalars_too", "scalars_schema.json", "scalars.jsonl"),
292293
("nested", "nested_schema.json", "nested.jsonl"),
294+
("nested_structs", "nested_structs_schema.json", "nested_structs.jsonl"),
293295
("repeated", "repeated_schema.json", "repeated.jsonl"),
294296
("penguins", "penguins_schema.json", "penguins.jsonl"),
295297
("time_series", "time_series_schema.json", "time_series.jsonl"),
@@ -367,6 +369,11 @@ def nested_table_id(test_data_tables) -> str:
367369
return test_data_tables["nested"]
368370

369371

372+
@pytest.fixture(scope="session")
373+
def nested_structs_table_id(test_data_tables) -> str:
374+
return test_data_tables["nested_structs"]
375+
376+
370377
@pytest.fixture(scope="session")
371378
def repeated_table_id(test_data_tables) -> str:
372379
return test_data_tables["repeated"]
@@ -412,6 +419,43 @@ def nested_pandas_df() -> pd.DataFrame:
412419
return df
413420

414421

422+
@pytest.fixture(scope="session")
423+
def nested_structs_df(
424+
nested_structs_table_id: str, session: bigframes.Session
425+
) -> bigframes.dataframe.DataFrame:
426+
"""DataFrame pointing at test data."""
427+
return session.read_gbq(nested_structs_table_id, index_col="id")
428+
429+
430+
@pytest.fixture(scope="session")
431+
def nested_structs_pandas_df() -> pd.DataFrame:
432+
"""pd.DataFrame pointing at test data."""
433+
434+
df = pd.read_json(
435+
DATA_DIR / "nested_structs.jsonl",
436+
lines=True,
437+
)
438+
df = df.set_index("id")
439+
return df
440+
441+
442+
@pytest.fixture(scope="session")
443+
def nested_structs_pandas_type() -> pd.ArrowDtype:
444+
address_struct_schema = pa.struct(
445+
[pa.field("city", pa.string()), pa.field("country", pa.string())]
446+
)
447+
448+
person_struct_schema = pa.struct(
449+
[
450+
pa.field("name", pa.string()),
451+
pa.field("age", pa.int64()),
452+
pa.field("address", address_struct_schema),
453+
]
454+
)
455+
456+
return pd.ArrowDtype(person_struct_schema)
457+
458+
415459
@pytest.fixture(scope="session")
416460
def repeated_df(
417461
repeated_table_id: str, session: bigframes.Session

tests/system/small/test_series.py

+39
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import geopandas as gpd # type: ignore
2121
import numpy
22+
from packaging.version import Version
2223
import pandas as pd
2324
import pyarrow as pa # type: ignore
2425
import pytest
@@ -3912,3 +3913,41 @@ def test_series_explode_null(data):
39123913
s.to_pandas().explode(),
39133914
check_dtype=False,
39143915
)
3916+
3917+
3918+
def test_series_struct_get_field_by_attribute(
3919+
nested_structs_df, nested_structs_pandas_df, nested_structs_pandas_type
3920+
):
3921+
if Version(pd.__version__) < Version("2.2.0"):
3922+
pytest.skip("struct accessor is not supported before pandas 2.2")
3923+
3924+
bf_series = nested_structs_df["person"]
3925+
df_series = nested_structs_pandas_df["person"].astype(nested_structs_pandas_type)
3926+
3927+
pd.testing.assert_series_equal(
3928+
bf_series.address.city.to_pandas(),
3929+
df_series.struct.field("address").struct.field("city"),
3930+
check_dtype=False,
3931+
check_index=False,
3932+
)
3933+
pd.testing.assert_series_equal(
3934+
bf_series.address.country.to_pandas(),
3935+
df_series.struct.field("address").struct.field("country"),
3936+
check_dtype=False,
3937+
check_index=False,
3938+
)
3939+
3940+
3941+
def test_series_struct_fields_in_dir(nested_structs_df):
3942+
series = nested_structs_df["person"]
3943+
3944+
assert "age" in dir(series)
3945+
assert "address" in dir(series)
3946+
assert "city" in dir(series.address)
3947+
assert "country" in dir(series.address)
3948+
3949+
3950+
def test_series_struct_class_attributes_shadow_struct_fields(nested_structs_df):
3951+
series = nested_structs_df["person"]
3952+
3953+
assert series.name == "person"

0 commit comments

Comments
 (0)