4
4
5
5
package ssa
6
6
7
- // reassociate balances trees of commutative computation
8
- // to better group expressions to expose easy optimizations in
9
- // cse, cancelling/counting/factoring expressions, etc.
10
- func reassociate (f * Func ) {
7
+ // ilp pass (Instruction Level Parallelism) balances trees of commutative computation
8
+ // to help the CPU pipeline instructions more efficiently. It only works block by block
9
+ // so that it doesn't end up pulling loop-invariant expressions into tight loops.
10
+ func ilp (f * Func ) {
11
11
visited := f .newSparseSet (f .NumValues ())
12
12
13
13
for _ , b := range f .Postorder () {
@@ -22,20 +22,10 @@ func reassociate(f *Func) {
22
22
// It doesn't truly balance the tree in the sense of a BST, rather it
23
23
// prioritizes pairing up innermost (rightmost) expressions and their results and only
24
24
// pairing results of outermost (leftmost) expressions up with them when no other nice pairing exists
25
- func balanceExprTree (v * Value , visited * sparseSet , nodes , leaves []* Value ) {
26
- // reset all arguments of nodes to help rebalancing
27
- for i , n := range nodes {
25
+ func balanceExprTree (nodes , leaves []* Value ) {
26
+ // reset all arguments of nodes to reuse them
27
+ for _ , n := range nodes {
28
28
n .reset (n .Op )
29
-
30
- // sometimes nodes in the tree are in different blocks
31
- // so pull them in into a common block (v's block)
32
- // to make sure nodes don't end up dominating their leaves TODO(ryan-berger), not necessary
33
- if v .Block != n .Block {
34
- copied := n .copyInto (v .Block )
35
- n .Op = OpInvalid
36
- visited .add (copied .ID ) // "revisit" the copied node
37
- nodes [i ] = copied
38
- }
39
29
}
40
30
41
31
// we bfs'ed through the nodes in reverse topological order
@@ -60,7 +50,7 @@ func balanceExprTree(v *Value, visited *sparseSet, nodes, leaves []*Value) {
60
50
}
61
51
62
52
for j := start ; j < len (subTrees )- 1 ; j += 2 {
63
- nodes [i ].AddArg2 (subTrees [j ], subTrees [j + 1 ])
53
+ nodes [i ].AddArgs (subTrees [j ], subTrees [j + 1 ])
64
54
nextSubTrees = append (nextSubTrees , nodes [i ])
65
55
i ++
66
56
}
@@ -69,60 +59,20 @@ func balanceExprTree(v *Value, visited *sparseSet, nodes, leaves []*Value) {
69
59
}
70
60
}
71
61
72
- func isOr (op Op ) bool {
73
- switch op {
74
- case OpOr8 , OpOr16 , OpOr32 , OpOr64 :
75
- return true
76
- default :
77
- return false
78
- }
79
- }
80
-
81
- // probablyMemcombine helps find a pattern of leaves that form
82
- // a load that can be widened which looks like:
83
- //
84
- // (l | l << 8 | l << 16 | l << 24)
85
- //
86
- // which cannot be rebalanced or else it won't fire load widening rewrite rules
87
- func probablyMemcombine (op Op , leaves []* Value ) bool {
88
- if ! isOr (op ) {
89
- return false
90
- }
91
-
92
- lshCount := 0
93
- for _ , l := range leaves {
94
- switch l .Op {
95
- case OpLsh8x8 , OpLsh8x16 , OpLsh8x32 , OpLsh8x64 ,
96
- OpLsh16x8 , OpLsh16x16 , OpLsh16x32 , OpLsh16x64 ,
97
- OpLsh32x8 , OpLsh32x16 , OpLsh32x32 , OpLsh32x64 ,
98
- OpLsh64x8 , OpLsh64x16 , OpLsh64x32 , OpLsh64x64 :
99
- lshCount ++
100
- }
101
- }
102
-
103
- // there are a few algorithms in the std lib expressed as two 32 bit loads
104
- // which can get turned into a 64 bit load
105
- // conservatively estimate that if there are more shifts than not then it is
106
- // some sort of load waiting to be widened
107
- return lshCount > len (leaves )/ 2
108
- }
109
-
110
62
// rebalance balances associative computation to better help CPU instruction pipelining (#49331)
111
- // and groups constants together catch more constant folding opportunities.
112
63
//
113
64
// a + b + c + d compiles to v1:(a + v2:(b + v3:(c + d))), which is an unbalanced expression tree
114
65
// This is suboptimal since it requires the CPU to finish computing v3 before it can use that result in
115
66
// v2, and v2 before its use in v1
116
67
//
117
68
// This optimization rebalances this expression tree to look like (a + b) + (c + d) ,
118
69
// which removes such dependencies and frees up the CPU pipeline.
119
- //
120
- // The above optimization is also a good starting point for other sorts of operations such as
121
- // turning a + a + a => 3*a, cancelling pairs a + (-a), collecting up common factors TODO(ryan-berger)
122
70
func rebalance (v * Value , visited * sparseSet ) {
123
- // We cannot apply this optimization to non-commutative operations,
71
+ // We cannot apply this optimization to non-commutative operations.
72
+ // We also exclude 3+ arg ops because there are 0 opportunities in the std lib,
73
+ // and the benefit for maintenance cost is not currently worth it.
124
74
// Try and save time by not revisiting nodes
125
- if visited .contains (v .ID ) || ! opcodeTable [v .Op ].commutative {
75
+ if visited .contains (v .ID ) || ! opcodeTable [v .Op ].commutative || len ( v . Args ) > 2 {
126
76
return
127
77
}
128
78
@@ -142,26 +92,24 @@ func rebalance(v *Value, visited *sparseSet) {
142
92
visited .add (v .ID )
143
93
144
94
for _ , a := range needle .Args {
145
- // If the ops aren't the same or have more than one use it must be a leaf.
146
- if a .Op != v .Op || a .Uses != 1 {
95
+ // If the arg's op differs, it has more than one use, or it is not in the same basic block, it must be a leaf.
96
+ if a .Op != v .Op || a .Uses != 1 || a . Block != v . Block {
147
97
leaves = append (leaves , a )
148
98
continue
149
99
}
150
100
151
101
// nodes in the tree now hold the invariants that:
152
102
// - they have the same associative operation as the rest of the tree
153
- // - they have only a single use (this invariant could be removed with further analysis TODO(ryan-berger))
103
+ // - they have only a single use
104
+ // - they are in the same basic block
154
105
haystack = append (haystack , a )
155
106
}
156
107
}
157
108
158
- minLeaves := len (v .Args ) * len (v .Args )
159
-
160
109
// we need at least args^2 leaves (4 for a binary op) for this expression to be rebalanceable
161
- // and we can't balance a potential load widening (see memcombine)
162
- if len (leaves ) < minLeaves || probablyMemcombine (v .Op , leaves ) {
110
+ if len (leaves ) < 4 {
163
111
return
164
112
}
165
113
166
- balanceExprTree (v , visited , nodes , leaves )
114
+ balanceExprTree (nodes , leaves )
167
115
}
0 commit comments