Skip to content

Commit e764f20

Browse files
committed
specialize reassociate to do ilp
1 parent 1eb7a04 commit e764f20

File tree

2 files changed

+21
-71
lines changed

2 files changed

+21
-71
lines changed

src/cmd/compile/internal/ssa/compile.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,6 @@ var passes = [...]pass{
465465
{name: "pre-opt deadcode", fn: deadcode},
466466
{name: "opt", fn: opt, required: true}, // NB: some generic rules know the name of the opt pass. TODO: split required rules and optimizing rules
467467
{name: "zero arg cse", fn: zcse, required: true}, // required to merge OpSB values
468-
{name: "reassociate", fn: reassociate},
469468
{name: "opt deadcode", fn: deadcode, required: true}, // remove any blocks orphaned during opt
470469
{name: "generic cse", fn: cse},
471470
{name: "phiopt", fn: phiopt},
@@ -484,6 +483,7 @@ var passes = [...]pass{
484483
{name: "late fuse", fn: fuseLate},
485484
{name: "dse", fn: dse},
486485
{name: "memcombine", fn: memcombine},
486+
{name: "ilp", fn: ilp},
487487
{name: "writebarrier", fn: writebarrier, required: true}, // expand write barrier ops
488488
{name: "insert resched checks", fn: insertLoopReschedChecks,
489489
disabled: !buildcfg.Experiment.PreemptibleLoops}, // insert resched checks in loops.
@@ -586,6 +586,8 @@ var passOrder = [...]constraint{
586586
{"late fuse", "memcombine"},
587587
// memcombine is an arch-independent pass.
588588
{"memcombine", "lower"},
589+
// ilp works best after ORs have been combined into loads
590+
{"memcombine", "ilp"},
589591
}
590592

591593
func init() {

src/cmd/compile/internal/ssa/reassociate.go renamed to src/cmd/compile/internal/ssa/ilp.go

Lines changed: 18 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44

55
package ssa
66

7-
// reassociate balances trees of commutative computation
8-
// to better group expressions to expose easy optimizations in
9-
// cse, cancelling/counting/factoring expressions, etc.
10-
func reassociate(f *Func) {
7+
// ilp pass (Instruction Level Parallelism) balances trees of commutative computation
8+
// to help the CPU pipeline instructions more efficiently. It only works block by block
9+
// so that it doesn't end up pulling loop-invariant expressions into tight loops.
10+
func ilp(f *Func) {
1111
visited := f.newSparseSet(f.NumValues())
1212

1313
for _, b := range f.Postorder() {
@@ -22,20 +22,10 @@ func reassociate(f *Func) {
2222
// It doesn't truly balance the tree in the sense of a BST, rather it
2323
// prioritizes pairing up innermost (rightmost) expressions and their results and only
2424
// pairing results of outermost (leftmost) expressions up with them when no other nice pairing exists
25-
func balanceExprTree(v *Value, visited *sparseSet, nodes, leaves []*Value) {
26-
// reset all arguments of nodes to help rebalancing
27-
for i, n := range nodes {
25+
func balanceExprTree(nodes, leaves []*Value) {
26+
// reset all arguments of nodes to reuse them
27+
for _, n := range nodes {
2828
n.reset(n.Op)
29-
30-
// sometimes nodes in the tree are in different blocks
31-
// so pull them in into a common block (v's block)
32-
// to make sure nodes don't end up dominating their leaves TODO(ryan-berger), not necessary
33-
if v.Block != n.Block {
34-
copied := n.copyInto(v.Block)
35-
n.Op = OpInvalid
36-
visited.add(copied.ID) // "revisit" the copied node
37-
nodes[i] = copied
38-
}
3929
}
4030

4131
// we bfs'ed through the nodes in reverse topological order
@@ -60,7 +50,7 @@ func balanceExprTree(v *Value, visited *sparseSet, nodes, leaves []*Value) {
6050
}
6151

6252
for j := start; j < len(subTrees)-1; j += 2 {
63-
nodes[i].AddArg2(subTrees[j], subTrees[j+1])
53+
nodes[i].AddArgs(subTrees[j], subTrees[j+1])
6454
nextSubTrees = append(nextSubTrees, nodes[i])
6555
i++
6656
}
@@ -69,60 +59,20 @@ func balanceExprTree(v *Value, visited *sparseSet, nodes, leaves []*Value) {
6959
}
7060
}
7161

72-
func isOr(op Op) bool {
73-
switch op {
74-
case OpOr8, OpOr16, OpOr32, OpOr64:
75-
return true
76-
default:
77-
return false
78-
}
79-
}
80-
81-
// probablyMemcombine helps find a pattern of leaves that form
82-
// a load that can be widened which looks like:
83-
//
84-
// (l | l << 8 | l << 18 | l << 24)
85-
//
86-
// which cannot be rebalanced or else it won't fire load widening rewrite rules
87-
func probablyMemcombine(op Op, leaves []*Value) bool {
88-
if !isOr(op) {
89-
return false
90-
}
91-
92-
lshCount := 0
93-
for _, l := range leaves {
94-
switch l.Op {
95-
case OpLsh8x8, OpLsh8x16, OpLsh8x32, OpLsh8x64,
96-
OpLsh16x8, OpLsh16x16, OpLsh16x32, OpLsh16x64,
97-
OpLsh32x8, OpLsh32x16, OpLsh32x32, OpLsh32x64,
98-
OpLsh64x8, OpLsh64x16, OpLsh64x32, OpLsh64x64:
99-
lshCount++
100-
}
101-
}
102-
103-
// there are a few algorithms in the std lib expressed as two 32 bit loads
104-
// which can get turned into a 64 bit load
105-
// conservatively estimate that if there are more shifts than not then it is
106-
// some sort of load waiting to be widened
107-
return lshCount > len(leaves)/2
108-
}
109-
11062
// rebalance balances associative computation to better help CPU instruction pipelining (#49331)
111-
// and groups constants together catch more constant folding opportunities.
11263
//
11364
// a + b + c + d compiles to v1:(a + v2:(b + v3:(c + d))), which is an unbalanced expression tree
11465
// Which is suboptimal since it requires the CPU to compute v3 before it can use its result in
11566
// v2, and v2 before its use in v1
11667
//
11768
// This optimization rebalances this expression tree to look like (a + b) + (c + d) ,
11869
// which removes such dependencies and frees up the CPU pipeline.
119-
//
120-
// The above optimization is also a good starting point for other sorts of operations such as
121-
// turning a + a + a => 3*a, cancelling pairs a + (-a), collecting up common factors TODO(ryan-berger)
12270
func rebalance(v *Value, visited *sparseSet) {
123-
// We cannot apply this optimization to non-commutative operations,
71+
// We cannot apply this optimization to non-commutative operations.
72+
// We also exclude 3+ arg ops because there are 0 opportunities in the std lib,
73+
// and the benefit for maintenance cost is not currently worth it.
12474
// Try and save time by not revisiting nodes
125-
if visited.contains(v.ID) || !opcodeTable[v.Op].commutative {
75+
if visited.contains(v.ID) || !opcodeTable[v.Op].commutative || len(v.Args) > 2{
12676
return
12777
}
12878

@@ -142,26 +92,24 @@ func rebalance(v *Value, visited *sparseSet) {
14292
visited.add(v.ID)
14393

14494
for _, a := range needle.Args {
145-
// If the ops aren't the same or have more than one use it must be a leaf.
146-
if a.Op != v.Op || a.Uses != 1 {
95+
// If the arg's op differs from v's, it has more than one use, or it is not in the same basic block, it must be a leaf.
96+
if a.Op != v.Op || a.Uses != 1 || a.Block != v.Block {
14797
leaves = append(leaves, a)
14898
continue
14999
}
150100

151101
// nodes in the tree now hold the invariants that:
152102
// - they are of a common associative operation as the rest of the tree
153-
// - they have only a single use (this invariant could be removed with further analysis TODO(ryan-berger))
103+
// - they have only a single use
104+
// - they are in the same basic block
154105
haystack = append(haystack, a)
155106
}
156107
}
157108

158-
minLeaves := len(v.Args) * len(v.Args)
159-
160109
// we need at least args^2 leaves (4 for two-argument ops) for this expression to be rebalanceable
161-
// and we can't balance a potential load widening (see memcombine)
162-
if len(leaves) < minLeaves || probablyMemcombine(v.Op, leaves) {
110+
if len(leaves) < 4 {
163111
return
164112
}
165113

166-
balanceExprTree(v, visited, nodes, leaves)
114+
balanceExprTree(nodes, leaves)
167115
}

0 commit comments

Comments
 (0)