Skip to content

Commit 3a2761f

Browse files
fix: all sort operation are now stable (#195)
1 parent 4a02cac commit 3a2761f

File tree

10 files changed

+32
-55
lines changed

10 files changed

+32
-55
lines changed

bigframes/core/__init__.py

+2-6
Original file line numberDiff line numberDiff line change
@@ -200,12 +200,8 @@ def filter(self, predicate_id: str, keep_null: bool = False) -> ArrayValue:
200200
)
201201
)
202202

203-
def order_by(
204-
self, by: Sequence[OrderingColumnReference], stable: bool = False
205-
) -> ArrayValue:
206-
return ArrayValue(
207-
nodes.OrderByNode(child=self.node, by=tuple(by), stable=stable)
208-
)
203+
def order_by(self, by: Sequence[OrderingColumnReference]) -> ArrayValue:
204+
return ArrayValue(nodes.OrderByNode(child=self.node, by=tuple(by)))
209205

210206
def reversed(self) -> ArrayValue:
211207
return ArrayValue(nodes.ReversedNode(child=self.node))

bigframes/core/block_transforms.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -509,7 +509,7 @@ def nsmallest(
509509
)
510510
for col_id in column_ids
511511
]
512-
block = block.order_by(order_refs, stable=True)
512+
block = block.order_by(order_refs)
513513
if keep in ("first", "last"):
514514
return block.slice(0, n)
515515
else: # keep == "all":
@@ -541,7 +541,7 @@ def nlargest(
541541
)
542542
for col_id in column_ids
543543
]
544-
block = block.order_by(order_refs, stable=True)
544+
block = block.order_by(order_refs)
545545
if keep in ("first", "last"):
546546
return block.slice(0, n)
547547
else: # keep == "all":

bigframes/core/blocks.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -235,10 +235,9 @@ def cols_matching_label(self, partial_label: Label) -> typing.Sequence[str]:
235235
def order_by(
236236
self,
237237
by: typing.Sequence[ordering.OrderingColumnReference],
238-
stable: bool = False,
239238
) -> Block:
240239
return Block(
241-
self._expr.order_by(by, stable=stable),
240+
self._expr.order_by(by),
242241
index_columns=self.index_columns,
243242
column_labels=self.column_labels,
244243
index_labels=self.index.names,
@@ -1596,7 +1595,6 @@ def merge(
15961595
# sort uses coalesced join keys always
15971596
joined_expr = joined_expr.order_by(
15981597
[ordering.OrderingColumnReference(col_id) for col_id in coalesced_ids],
1599-
stable=True,
16001598
)
16011599

16021600
joined_expr = joined_expr.select_columns(result_columns)

bigframes/core/compile/compiled.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -753,11 +753,9 @@ def builder(self) -> OrderedIR.Builder:
753753
predicates=self._predicates,
754754
)
755755

756-
def order_by(
757-
self, by: Sequence[OrderingColumnReference], stable: bool = False
758-
) -> OrderedIR:
756+
def order_by(self, by: Sequence[OrderingColumnReference]) -> OrderedIR:
759757
expr_builder = self.builder()
760-
expr_builder.ordering = self._ordering.with_ordering_columns(by, stable=stable)
758+
expr_builder.ordering = self._ordering.with_ordering_columns(by)
761759
return expr_builder.build()
762760

763761
def reversed(self) -> OrderedIR:

bigframes/core/compile/compiler.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def compile_filter(node: nodes.FilterNode, ordered: bool = True):
129129
@_compile_node.register
130130
def compile_orderby(node: nodes.OrderByNode, ordered: bool = True):
131131
if ordered:
132-
return compile_ordered(node.child).order_by(node.by, node.stable)
132+
return compile_ordered(node.child).order_by(node.by)
133133
else:
134134
return compile_unordered(node.child)
135135

bigframes/core/groupby/__init__.py

-4
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,6 @@ def rolling(self, window: int, min_periods=None) -> windows.Window:
217217
)
218218
block = self._block.order_by(
219219
[order.OrderingColumnReference(col) for col in self._by_col_ids],
220-
stable=True,
221220
)
222221
return windows.Window(
223222
block, window_spec, self._selected_cols, drop_null_groups=self._dropna
@@ -231,7 +230,6 @@ def expanding(self, min_periods: int = 1) -> windows.Window:
231230
)
232231
block = self._block.order_by(
233232
[order.OrderingColumnReference(col) for col in self._by_col_ids],
234-
stable=True,
235233
)
236234
return windows.Window(
237235
block, window_spec, self._selected_cols, drop_null_groups=self._dropna
@@ -552,7 +550,6 @@ def rolling(self, window: int, min_periods=None) -> windows.Window:
552550
)
553551
block = self._block.order_by(
554552
[order.OrderingColumnReference(col) for col in self._by_col_ids],
555-
stable=True,
556553
)
557554
return windows.Window(
558555
block,
@@ -570,7 +567,6 @@ def expanding(self, min_periods: int = 1) -> windows.Window:
570567
)
571568
block = self._block.order_by(
572569
[order.OrderingColumnReference(col) for col in self._by_col_ids],
573-
stable=True,
574570
)
575571
return windows.Window(
576572
block,

bigframes/core/nodes.py

-1
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,6 @@ class FilterNode(UnaryNode):
145145
@dataclass(frozen=True)
146146
class OrderByNode(UnaryNode):
147147
by: Tuple[OrderingColumnReference, ...]
148-
stable: bool = False
149148

150149

151150
@dataclass(frozen=True)

bigframes/core/ordering.py

+22-25
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,6 @@
2828
# Sufficient to store any value up to 2^63
2929
DEFAULT_ORDERING_ID_LENGTH: int = math.ceil(63 * math.log(2, ORDERING_ID_STRING_BASE))
3030

31-
STABLE_SORTS = ["mergesort", "stable"]
32-
3331

3432
class OrderingDirection(Enum):
3533
ASC = 1
@@ -113,47 +111,46 @@ def with_non_sequential(self):
113111
def with_ordering_columns(
114112
self,
115113
ordering_value_columns: Sequence[OrderingColumnReference] = (),
116-
stable: bool = False,
117114
) -> ExpressionOrdering:
118115
"""Creates a new ordering that reorders by the given columns.
119116
120117
Args:
121118
ordering_value_columns:
122119
In decreasing precedence order, the values used to sort the ordering
123-
stable:
124-
If True, will use apply a stable sorting, using the old ordering where
125-
the new ordering produces ties. Otherwise, ties will be resolved in
126-
a performance maximizing way,
127120
128121
Returns:
129122
Modified ExpressionOrdering
130123
"""
131124
col_ids_new = [
132125
ordering_ref.column_id for ordering_ref in ordering_value_columns
133126
]
134-
if stable:
135-
# Only reference each column once, so discard old referenc if there is a new reference
136-
old_ordering_keep = [
137-
ordering_ref
138-
for ordering_ref in self.ordering_value_columns
139-
if ordering_ref.column_id not in col_ids_new
140-
]
141-
else:
142-
# New ordering needs to keep all total ordering columns no matter what.
143-
# All other old ordering references can be discarded as does not need
144-
# to be a stable sort.
145-
old_ordering_keep = [
146-
ordering_ref
147-
for ordering_ref in self.ordering_value_columns
148-
if (ordering_ref.column_id not in col_ids_new)
149-
and (ordering_ref.column_id in self.total_ordering_columns)
150-
]
151-
new_ordering = (*ordering_value_columns, *old_ordering_keep)
127+
old_ordering_keep = [
128+
ordering_ref
129+
for ordering_ref in self.ordering_value_columns
130+
if ordering_ref.column_id not in col_ids_new
131+
]
132+
133+
# Truncate to remove any unneded col references after all total order cols included
134+
new_ordering = self._truncate_ordering(
135+
(*ordering_value_columns, *old_ordering_keep)
136+
)
152137
return ExpressionOrdering(
153138
new_ordering,
154139
total_ordering_columns=self.total_ordering_columns,
155140
)
156141

142+
def _truncate_ordering(
143+
self, order_refs: tuple[OrderingColumnReference, ...]
144+
) -> tuple[OrderingColumnReference, ...]:
145+
total_order_cols_remaining = set(self.total_ordering_columns)
146+
for i in range(len(order_refs)):
147+
column = order_refs[i].column_id
148+
if column in total_order_cols_remaining:
149+
total_order_cols_remaining.remove(column)
150+
if len(total_order_cols_remaining) == 0:
151+
return order_refs[: i + 1]
152+
raise ValueError("Ordering did not contain all total_order_cols")
153+
157154
def with_reverse(self):
158155
"""Reverses the ordering."""
159156
return ExpressionOrdering(

bigframes/dataframe.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -1262,9 +1262,7 @@ def sort_values(
12621262
column_id, direction=direction, na_last=na_last
12631263
)
12641264
)
1265-
return DataFrame(
1266-
self._block.order_by(ordering, stable=kind in order.STABLE_SORTS)
1267-
)
1265+
return DataFrame(self._block.order_by(ordering))
12681266

12691267
def value_counts(
12701268
self,

bigframes/series.py

+1-6
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,7 @@
3535
import bigframes.core.groupby as groupby
3636
import bigframes.core.indexers
3737
import bigframes.core.indexes as indexes
38-
from bigframes.core.ordering import (
39-
OrderingColumnReference,
40-
OrderingDirection,
41-
STABLE_SORTS,
42-
)
38+
from bigframes.core.ordering import OrderingColumnReference, OrderingDirection
4339
import bigframes.core.scalar as scalars
4440
import bigframes.core.utils as utils
4541
import bigframes.core.window
@@ -1067,7 +1063,6 @@ def sort_values(
10671063
na_last=(na_position == "last"),
10681064
)
10691065
],
1070-
stable=kind in STABLE_SORTS,
10711066
)
10721067
return Series(block)
10731068

0 commit comments

Comments
 (0)