Skip to content

Commit 8eca99a

Browse files
fix: Fix case where df.peek would fail to execute even with force=True (#511)
* fix: Fix case where df.peek would fail to execute even with force=True * remove cache from peekable property * if force=True always peek after caching even if peeking inefficient
1 parent f798277 commit 8eca99a

File tree

6 files changed

+33
-42
lines changed

6 files changed

+33
-42
lines changed

bigframes/core/blocks.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
import bigframes.core.guid as guid
4242
import bigframes.core.join_def as join_defs
4343
import bigframes.core.ordering as ordering
44+
import bigframes.core.tree_properties as tree_properties
4445
import bigframes.core.utils
4546
import bigframes.core.utils as utils
4647
import bigframes.dtypes
@@ -443,8 +444,10 @@ def to_pandas(
443444
df.set_axis(self.column_labels, axis=1, copy=False)
444445
return df, query_job
445446

446-
def try_peek(self, n: int = 20) -> typing.Optional[pd.DataFrame]:
447-
if self.expr.node.peekable:
447+
def try_peek(
448+
self, n: int = 20, force: bool = False
449+
) -> typing.Optional[pd.DataFrame]:
450+
if force or tree_properties.peekable(self.expr.node):
448451
iterator, _ = self.session._peek(self.expr, n)
449452
df = self._to_dataframe(iterator)
450453
self._copy_index_to_pandas(df)

bigframes/core/nodes.py

+1-36
Original file line numberDiff line numberDiff line change
@@ -90,11 +90,6 @@ def session(self):
9090
def _node_hash(self):
9191
return hash(tuple(hash(getattr(self, field.name)) for field in fields(self)))
9292

93-
@property
94-
def peekable(self) -> bool:
95-
"""Indicates whether the node can be sampled efficiently"""
96-
return all(child.peekable for child in self.child_nodes)
97-
9893
@property
9994
def roots(self) -> typing.Set[BigFrameNode]:
10095
roots = itertools.chain.from_iterable(
@@ -143,12 +138,6 @@ def child_nodes(self) -> typing.Sequence[BigFrameNode]:
143138
def __hash__(self):
144139
return self._node_hash
145140

146-
@property
147-
def peekable(self) -> bool:
148-
children_peekable = all(child.peekable for child in self.child_nodes)
149-
single_root = len(self.roots) == 1
150-
return children_peekable and single_root
151-
152141
@functools.cached_property
153142
def schema(self) -> schemata.ArraySchema:
154143
def join_mapping_to_schema_item(mapping: JoinColumnMapping):
@@ -204,10 +193,6 @@ class ReadLocalNode(BigFrameNode):
204193
def __hash__(self):
205194
return self._node_hash
206195

207-
@property
208-
def peekable(self) -> bool:
209-
return True
210-
211196
@property
212197
def roots(self) -> typing.Set[BigFrameNode]:
213198
return {self}
@@ -233,10 +218,6 @@ def session(self):
233218
def __hash__(self):
234219
return self._node_hash
235220

236-
@property
237-
def peekable(self) -> bool:
238-
return True
239-
240221
@property
241222
def roots(self) -> typing.Set[BigFrameNode]:
242223
return {self}
@@ -261,13 +242,9 @@ class PromoteOffsetsNode(UnaryNode):
261242
def __hash__(self):
262243
return self._node_hash
263244

264-
@property
265-
def peekable(self) -> bool:
266-
return False
267-
268245
@property
269246
def non_local(self) -> bool:
270-
return False
247+
return True
271248

272249
@property
273250
def schema(self) -> schemata.ArraySchema:
@@ -371,10 +348,6 @@ def row_preserving(self) -> bool:
371348
def __hash__(self):
372349
return self._node_hash
373350

374-
@property
375-
def peekable(self) -> bool:
376-
return False
377-
378351
@property
379352
def non_local(self) -> bool:
380353
return True
@@ -407,10 +380,6 @@ class WindowOpNode(UnaryNode):
407380
def __hash__(self):
408381
return self._node_hash
409382

410-
@property
411-
def peekable(self) -> bool:
412-
return False
413-
414383
@property
415384
def non_local(self) -> bool:
416385
return True
@@ -459,10 +428,6 @@ def row_preserving(self) -> bool:
459428
def non_local(self) -> bool:
460429
return True
461430

462-
@property
463-
def peekable(self) -> bool:
464-
return False
465-
466431
@functools.cached_property
467432
def schema(self) -> schemata.ArraySchema:
468433
def infer_dtype(

bigframes/core/traversal.py renamed to bigframes/core/tree_properties.py

+11
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,11 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
1516
import bigframes.core.nodes as nodes
1617

18+
# TODO: Convert these functions to iterative or enforce hard limit on tree depth. The below algorithms can cause stack to exceed limit.
19+
1720

1821
def is_trivially_executable(node: nodes.BigFrameNode) -> bool:
1922
if local_only(node):
@@ -25,3 +28,11 @@ def is_trivially_executable(node: nodes.BigFrameNode) -> bool:
2528

2629
def local_only(node: nodes.BigFrameNode) -> bool:
2730
return all(isinstance(node, nodes.ReadLocalNode) for node in node.roots)
31+
32+
33+
def peekable(node: nodes.BigFrameNode) -> bool:
34+
if local_only(node):
35+
return True
36+
children_peekable = all(peekable(child) for child in node.child_nodes)
37+
self_peekable = not node.non_local
38+
return children_peekable and self_peekable

bigframes/dataframe.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1111,7 +1111,7 @@ def peek(self, n: int = 5, *, force: bool = True) -> pandas.DataFrame:
11111111
if maybe_result is None:
11121112
if force:
11131113
self._cached()
1114-
maybe_result = self._block.try_peek(n)
1114+
maybe_result = self._block.try_peek(n, force=True)
11151115
assert maybe_result is not None
11161116
else:
11171117
raise ValueError(

bigframes/session/__init__.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,8 @@
8282
import bigframes.core.guid as guid
8383
from bigframes.core.ordering import IntegerEncoding
8484
import bigframes.core.ordering as order
85-
import bigframes.core.traversal as traversals
85+
import bigframes.core.tree_properties as traversals
86+
import bigframes.core.tree_properties as tree_properties
8687
import bigframes.core.utils as utils
8788
import bigframes.dtypes
8889
import bigframes.formatting_helpers as formatting_helpers
@@ -1848,8 +1849,8 @@ def _peek(
18481849
self, array_value: core.ArrayValue, n_rows: int
18491850
) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]:
18501851
"""A 'peek' efficiently accesses a small number of rows in the dataframe."""
1851-
if not array_value.node.peekable:
1852-
raise NotImplementedError("cannot efficient peek this dataframe")
1852+
if not tree_properties.peekable(array_value.node):
1853+
warnings.warn("Peeking this value cannot be done efficiently.")
18531854
sql = self._compile_unordered(array_value).peek_sql(n_rows)
18541855
return self._start_query(
18551856
sql=sql,

tests/system/small/test_dataframe.py

+11
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,17 @@ def test_df_peek_force_default(scalars_dfs):
494494
assert len(peek_result) == 3
495495

496496

497+
def test_df_peek_reset_index(scalars_dfs):
498+
scalars_df, scalars_pandas_df = scalars_dfs
499+
peek_result = (
500+
scalars_df[["int64_col", "int64_too"]].reset_index(drop=True).peek(n=3)
501+
)
502+
pd.testing.assert_index_equal(
503+
scalars_pandas_df[["int64_col", "int64_too"]].columns, peek_result.columns
504+
)
505+
assert len(peek_result) == 3
506+
507+
497508
def test_repr_w_all_rows(scalars_dfs):
498509
scalars_df, scalars_pandas_df = scalars_dfs
499510

0 commit comments

Comments
 (0)