fix: Fix __repr__ caching with partial ordering (#1016)

TrevorBergeron · web-flow · commit 208a98475389 · 2024-09-24T13:32:17.000-05:00
diff --git a/bigframes/core/tree_properties.py b/bigframes/core/tree_properties.py
@@ -44,8 +44,8 @@ def can_fast_head(node: nodes.BigFrameNode) -> bool:
     """Can get head fast if can push head operator down to leafs and operators preserve rows."""
     if isinstance(node, nodes.LeafNode):
         return node.supports_fast_head
-    if isinstance(node, nodes.UnaryNode):
-        return node.row_preserving and can_fast_head(node.child)
+    if isinstance(node, (nodes.ProjectionNode, nodes.SelectionNode)):
+        return can_fast_head(node.child)
     return False
 
 
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -643,7 +643,6 @@ def __repr__(self) -> str:
         if opts.repr_mode == "deferred":
             return formatter.repr_query_job(self._compute_dry_run())
 
-        self._cached()
         # TODO(swast): pass max_columns and get the true column count back. Maybe
         # get 1 more column than we have requested so that pandas can add the
         # ... for us?
diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py
@@ -360,11 +360,6 @@ def _cache_with_cluster_cols(
 
     def _cache_with_offsets(self, array_value: bigframes.core.ArrayValue):
         """Executes the query and uses the resulting table to rewrite future executions."""
-
-        if not self.strictly_ordered:
-            raise ValueError(
-                "Caching with offsets only supported in strictly ordered mode."
-            )
         offset_column = bigframes.core.guid.generate_guid("bigframes_offsets")
         w_offsets, offset_column = array_value.promote_offsets()
         sql = self.compiler.compile_unordered(self._get_optimized_plan(w_offsets.node))
diff --git a/tests/system/conftest.py b/tests/system/conftest.py
@@ -154,9 +154,9 @@ def session_load() -> Generator[bigframes.Session, None, None]:
     session.close()  # close generated session at cleanup time
 
 
-@pytest.fixture(scope="session", params=["ordered", "unordered"])
+@pytest.fixture(scope="session", params=["strict", "partial"])
 def maybe_ordered_session(request) -> Generator[bigframes.Session, None, None]:
-    context = bigframes.BigQueryOptions(location="US", ordering_mode="partial")
+    context = bigframes.BigQueryOptions(location="US", ordering_mode=request.param)
     session = bigframes.Session(context=context)
     yield session
     session.close()  # close generated session at cleanup type
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
@@ -567,6 +567,30 @@ def test_repr_w_all_rows(scalars_dfs):
     assert actual == expected
 
 
+def test_join_repr(scalars_dfs_maybe_ordered):
+    scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered
+
+    scalars_df = (
+        scalars_df[["int64_col"]]
+        .join(scalars_df.set_index("int64_col")[["int64_too"]])
+        .sort_index()
+    )
+    scalars_pandas_df = (
+        scalars_pandas_df[["int64_col"]]
+        .join(scalars_pandas_df.set_index("int64_col")[["int64_too"]])
+        .sort_index()
+    )
+    # The index name pandas gives a join result seems to depend on the index values in a way that bigframes can't match exactly, so clear it before comparing reprs
+    scalars_pandas_df.index.name = None
+
+    actual = repr(scalars_df)
+
+    with display_options.pandas_repr(bigframes.options.display):
+        expected = repr(scalars_pandas_df)
+
+    assert actual == expected
+
+
 def test_repr_html_w_all_rows(scalars_dfs):
     scalars_df, _ = scalars_dfs
     # get a pandas df of the expected format