Improve RP code generation for negation.

Previously, we emitted `0 - x` for all negation. This was fine, but can never use our immediate ops. Now, we emit negation for floats as `x ^ int(0x80000000)`, and negation for ints as `x * -1`. This allows scalar negation to use a single immediate-op instruction, and sometimes allows multi-slot negation to reuse an existing value on the stack. This never seems to be a downgrade. Change-Id: I427f6e2932370c56cd7076535e082d938a645820 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/681516 Reviewed-by: Brian Osman <[email protected]> Commit-Queue: Brian Osman <[email protected]> Auto-Submit: John Stiles <[email protected]> Commit-Queue: John Stiles <[email protected]>
armansito · Apr 26, 2023 · 3fea885 · 3fea885
1 parent b59f0fd
commit 3fea885
Show file tree

Hide file tree

Showing 12 changed files with 271 additions and 283 deletions.
diff --git a/src/sksl/codegen/SkSLRasterPipelineCodeGenerator.cpp b/src/sksl/codegen/SkSLRasterPipelineCodeGenerator.cpp
@@ -3367,23 +3367,30 @@ bool Generator::pushPrefixExpression(Operator op, const Expression& expr) {
             fBuilder.binary_op(BuilderOp::bitwise_xor_n_ints, expr.type().slotCount());
             return true;
 
-        case OperatorKind::MINUS:
-            // Handle negation as a componentwise `0 - expr`.
-            fBuilder.push_zeros(expr.type().slotCount());
+        case OperatorKind::MINUS: {
             if (!this->pushExpression(expr)) {
                 return unsupported();
             }
-            return this->binaryOp(expr.type(), kSubtractOps);
-
+            if (expr.type().componentType().isFloat()) {
+                // Handle float negation as an integer `x ^ 0x80000000`. This toggles the sign bit.
+                fBuilder.push_constant_u(0x80000000, expr.type().slotCount());
+                fBuilder.binary_op(BuilderOp::bitwise_xor_n_ints, expr.type().slotCount());
+            } else {
+                // Handle integer negation as a componentwise `expr * -1`.
+                fBuilder.push_constant_i(-1, expr.type().slotCount());
+                fBuilder.binary_op(BuilderOp::mul_n_ints, expr.type().slotCount());
+            }
+            return true;
+        }
         case OperatorKind::PLUSPLUS: {
             // Rewrite as `expr += 1`.
             Literal oneLiteral{Position{}, 1.0, &expr.type().componentType()};
             return this->pushBinaryExpression(expr, OperatorKind::PLUSEQ, oneLiteral);
         }
         case OperatorKind::MINUSMINUS: {
-            // Rewrite as `expr -= 1`.
-            Literal oneLiteral{Position{}, 1.0, &expr.type().componentType()};
-            return this->pushBinaryExpression(expr, OperatorKind::MINUSEQ, oneLiteral);
+            // Rewrite as `expr += -1`.
+            Literal minusOneLiteral{expr.fPosition, -1.0, &expr.type().componentType()};
+            return this->pushBinaryExpression(expr, OperatorKind::PLUSEQ, minusOneLiteral);
         }
         default:
             break;

diff --git a/tests/sksl/folding/MatrixNoOpFolding.skrp b/tests/sksl/folding/MatrixNoOpFolding.skrp
@@ -9,9 +9,8 @@ copy_4_uniforms                $0..3 = testMatrix2x2
 copy_4_slots_unmasked          _0_m = $0..3
 copy_4_uniforms                $0..3 = testMatrix2x2
 copy_4_slots_unmasked          _0_m = $0..3
-splat_4_constants              $0..3 = 0
-copy_4_slots_unmasked          $4..7 = _0_m
-sub_4_floats                   $0..3 -= $4..7
+splat_4_constants              $4..7 = 0x80000000 (-0.0)
+bitwise_xor_4_ints             $0..3 ^= $4..7
 copy_4_slots_unmasked          _0_m = $0..3
 splat_2_constants              $0..1 = 0
 swizzle_4                      $0..3 = ($0..3).yxxy
@@ -22,9 +21,9 @@ copy_4_slots_unmasked          _1_mm = $0..3
 store_condition_mask           $49 = CondMask
 store_condition_mask           $78 = CondMask
 copy_4_slots_unmasked          $79..82 = _0_m
-splat_4_constants              $83..86 = 0
-copy_4_uniforms                $87..90 = testMatrix2x2
-sub_4_floats                   $83..86 -= $87..90
+copy_4_uniforms                $83..86 = testMatrix2x2
+splat_4_constants              $87..90 = 0x80000000 (-0.0)
+bitwise_xor_4_ints             $83..86 ^= $87..90
 cmpeq_4_floats                 $79..82 = equal($79..82, $83..86)
 bitwise_and_2_ints             $79..80 &= $81..82
 bitwise_and_int                $79 &= $80
@@ -36,7 +35,7 @@ bitwise_and_int                $80 &= $81
 bitwise_and_int                $79 &= $80
 copy_constant                  $50 = 0
 merge_condition_mask           CondMask = $78 & $79
-branch_if_no_lanes_active      branch_if_no_lanes_active +71 (label 2 at #110)
+branch_if_no_lanes_active      branch_if_no_lanes_active +68 (label 2 at #106)
 splat_4_constants              m(0..3) = 0
 splat_4_constants              m(4..7) = 0
 splat_4_constants              m(8), mm(0..2) = 0
@@ -59,13 +58,10 @@ copy_uniform                   $59 = testMatrix3x3(8)
 copy_4_slots_masked            m(0..3) = Mask($51..54)
 copy_4_slots_masked            m(4..7) = Mask($55..58)
 copy_slot_masked               m(8) = Mask($59)
-splat_4_constants              $51..54 = 0
-splat_4_constants              $55..58 = 0
-copy_constant                  $59 = 0
-copy_4_slots_unmasked          $60..63 = m(0..3)
-copy_4_slots_unmasked          $64..67 = m(4..7)
-copy_slot_unmasked             $68 = m(8)
-sub_n_floats                   $51..59 -= $60..68
+splat_4_constants              $60..63 = 0x80000000 (-0.0)
+splat_4_constants              $64..67 = 0x80000000 (-0.0)
+copy_constant                  $68 = 0x80000000 (-0.0)
+bitwise_xor_n_ints             $51..59 ^= $60..68
 copy_4_slots_masked            m(0..3) = Mask($51..54)
 copy_4_slots_masked            m(4..7) = Mask($55..58)
 copy_slot_masked               m(8) = Mask($59)
@@ -82,13 +78,13 @@ copy_slot_masked               mm(8) = Mask($59)
 copy_4_slots_unmasked          $51..54 = m(0..3)
 copy_4_slots_unmasked          $55..58 = m(4..7)
 copy_slot_unmasked             $59 = m(8)
-splat_4_constants              $60..63 = 0
-splat_4_constants              $64..67 = 0
-copy_constant                  $68 = 0
-copy_4_uniforms                $69..72 = testMatrix3x3(0..3)
-copy_4_uniforms                $73..76 = testMatrix3x3(4..7)
-copy_uniform                   $77 = testMatrix3x3(8)
-sub_n_floats                   $60..68 -= $69..77
+copy_4_uniforms                $60..63 = testMatrix3x3(0..3)
+copy_4_uniforms                $64..67 = testMatrix3x3(4..7)
+copy_uniform                   $68 = testMatrix3x3(8)
+splat_4_constants              $69..72 = 0x80000000 (-0.0)
+splat_4_constants              $73..76 = 0x80000000 (-0.0)
+copy_constant                  $77 = 0x80000000 (-0.0)
+bitwise_xor_n_ints             $60..68 ^= $69..77
 cmpeq_n_floats                 $51..59 = equal($51..59, $60..68)
 bitwise_and_4_ints             $52..55 &= $56..59
 bitwise_and_2_ints             $52..53 &= $54..55
@@ -111,7 +107,7 @@ label                          label 0x00000002
 load_condition_mask            CondMask = $78
 copy_constant                  $0 = 0
 merge_condition_mask           CondMask = $49 & $50
-branch_if_no_lanes_active      branch_if_no_lanes_active +96 (label 1 at #210)
+branch_if_no_lanes_active      branch_if_no_lanes_active +92 (label 1 at #202)
 copy_4_uniforms                testMatrix4x4(0..3) = testInputs
 copy_4_uniforms                testMatrix4x4(4..7) = testInputs
 copy_4_uniforms                testMatrix4x4(8..11) = testInputs
@@ -146,15 +142,11 @@ copy_4_slots_masked            m₁(0..3) = Mask($1..4)
 copy_4_slots_masked            m₁(4..7) = Mask($5..8)
 copy_4_slots_masked            m₁(8..11) = Mask($9..12)
 copy_4_slots_masked            m₁(12..15) = Mask($13..16)
-splat_4_constants              $1..4 = 0
-splat_4_constants              $5..8 = 0
-splat_4_constants              $9..12 = 0
-splat_4_constants              $13..16 = 0
-copy_4_slots_unmasked          $17..20 = m₁(0..3)
-copy_4_slots_unmasked          $21..24 = m₁(4..7)
-copy_4_slots_unmasked          $25..28 = m₁(8..11)
-copy_4_slots_unmasked          $29..32 = m₁(12..15)
-sub_n_floats                   $1..16 -= $17..32
+splat_4_constants              $17..20 = 0x80000000 (-0.0)
+splat_4_constants              $21..24 = 0x80000000 (-0.0)
+splat_4_constants              $25..28 = 0x80000000 (-0.0)
+splat_4_constants              $29..32 = 0x80000000 (-0.0)
+bitwise_xor_n_ints             $1..16 ^= $17..32
 copy_4_slots_masked            m₁(0..3) = Mask($1..4)
 copy_4_slots_masked            m₁(4..7) = Mask($5..8)
 copy_4_slots_masked            m₁(8..11) = Mask($9..12)
@@ -175,15 +167,15 @@ copy_4_slots_unmasked          $1..4 = m₁(0..3)
 copy_4_slots_unmasked          $5..8 = m₁(4..7)
 copy_4_slots_unmasked          $9..12 = m₁(8..11)
 copy_4_slots_unmasked          $13..16 = m₁(12..15)
-splat_4_constants              $17..20 = 0
-splat_4_constants              $21..24 = 0
-splat_4_constants              $25..28 = 0
-splat_4_constants              $29..32 = 0
-copy_4_slots_unmasked          $33..36 = testMatrix4x4(0..3)
-copy_4_slots_unmasked          $37..40 = testMatrix4x4(4..7)
-copy_4_slots_unmasked          $41..44 = testMatrix4x4(8..11)
-copy_4_slots_unmasked          $45..48 = testMatrix4x4(12..15)
-sub_n_floats                   $17..32 -= $33..48
+copy_4_slots_unmasked          $17..20 = testMatrix4x4(0..3)
+copy_4_slots_unmasked          $21..24 = testMatrix4x4(4..7)
+copy_4_slots_unmasked          $25..28 = testMatrix4x4(8..11)
+copy_4_slots_unmasked          $29..32 = testMatrix4x4(12..15)
+splat_4_constants              $33..36 = 0x80000000 (-0.0)
+splat_4_constants              $37..40 = 0x80000000 (-0.0)
+splat_4_constants              $41..44 = 0x80000000 (-0.0)
+splat_4_constants              $45..48 = 0x80000000 (-0.0)
+bitwise_xor_n_ints             $17..32 ^= $33..48
 cmpeq_n_floats                 $1..16 = equal($1..16, $17..32)
 bitwise_and_4_ints             $9..12 &= $13..16
 bitwise_and_4_ints             $5..8 &= $9..12

diff --git a/tests/sksl/folding/MatrixScalarNoOpFolding.skrp b/tests/sksl/folding/MatrixScalarNoOpFolding.skrp
@@ -49,15 +49,15 @@ copy_constant                  $185 = 0
 copy_slot_masked               [test_no_op_scalar_X_mat2].result = Mask($185)
 mask_off_return_mask           RetMask &= ~(CondMask & LoopMask & RetMask)
 load_condition_mask            CondMask = $183
-splat_4_constants              $183..186 = 0
-copy_4_slots_unmasked          $187..190 = m
-sub_4_floats                   $183..186 -= $187..190
+copy_4_slots_unmasked          $183..186 = m
+splat_4_constants              $187..190 = 0x80000000 (-0.0)
+bitwise_xor_4_ints             $183..186 ^= $187..190
 copy_4_slots_masked            m = Mask($183..186)
 store_condition_mask           $183 = CondMask
 copy_4_slots_unmasked          $184..187 = m
-splat_4_constants              $188..191 = 0
-copy_4_uniforms                $192..195 = testMatrix2x2
-sub_4_floats                   $188..191 -= $192..195
+copy_4_uniforms                $188..191 = testMatrix2x2
+splat_4_constants              $192..195 = 0x80000000 (-0.0)
+bitwise_xor_4_ints             $188..191 ^= $192..195
 cmpne_4_floats                 $184..187 = notEqual($184..187, $188..191)
 bitwise_or_2_ints              $184..185 |= $186..187
 bitwise_or_int                 $184 |= $185
@@ -157,27 +157,27 @@ copy_constant                  $155 = 0
 copy_slot_masked               [test_no_op_scalar_X_mat3].result = Mask($155)
 mask_off_return_mask           RetMask &= ~(CondMask & LoopMask & RetMask)
 load_condition_mask            CondMask = $153
-splat_4_constants              $153..156 = 0
-splat_4_constants              $157..160 = 0
-copy_constant                  $161 = 0
-copy_4_slots_unmasked          $162..165 = m₁(0..3)
-copy_4_slots_unmasked          $166..169 = m₁(4..7)
-copy_slot_unmasked             $170 = m₁(8)
-sub_n_floats                   $153..161 -= $162..170
+copy_4_slots_unmasked          $153..156 = m₁(0..3)
+copy_4_slots_unmasked          $157..160 = m₁(4..7)
+copy_slot_unmasked             $161 = m₁(8)
+splat_4_constants              $162..165 = 0x80000000 (-0.0)
+splat_4_constants              $166..169 = 0x80000000 (-0.0)
+copy_constant                  $170 = 0x80000000 (-0.0)
+bitwise_xor_n_ints             $153..161 ^= $162..170
 copy_4_slots_masked            m₁(0..3) = Mask($153..156)
 copy_4_slots_masked            m₁(4..7) = Mask($157..160)
 copy_slot_masked               m₁(8) = Mask($161)
 store_condition_mask           $153 = CondMask
 copy_4_slots_unmasked          $154..157 = m₁(0..3)
 copy_4_slots_unmasked          $158..161 = m₁(4..7)
 copy_slot_unmasked             $162 = m₁(8)
-splat_4_constants              $163..166 = 0
-splat_4_constants              $167..170 = 0
-copy_constant                  $171 = 0
-copy_4_uniforms                $172..175 = testMatrix3x3(0..3)
-copy_4_uniforms                $176..179 = testMatrix3x3(4..7)
-copy_uniform                   $180 = testMatrix3x3(8)
-sub_n_floats                   $163..171 -= $172..180
+copy_4_uniforms                $163..166 = testMatrix3x3(0..3)
+copy_4_uniforms                $167..170 = testMatrix3x3(4..7)
+copy_uniform                   $171 = testMatrix3x3(8)
+splat_4_constants              $172..175 = 0x80000000 (-0.0)
+splat_4_constants              $176..179 = 0x80000000 (-0.0)
+copy_constant                  $180 = 0x80000000 (-0.0)
+bitwise_xor_n_ints             $163..171 ^= $172..180
 cmpne_n_floats                 $154..162 = notEqual($154..162, $163..171)
 bitwise_or_4_ints              $155..158 |= $159..162
 bitwise_or_2_ints              $155..156 |= $157..158
@@ -311,15 +311,15 @@ copy_constant                  $103 = 0
 copy_slot_masked               [test_no_op_scalar_X_mat4].result = Mask($103)
 mask_off_return_mask           RetMask &= ~(CondMask & LoopMask & RetMask)
 load_condition_mask            CondMask = $101
-splat_4_constants              $101..104 = 0
-splat_4_constants              $105..108 = 0
-splat_4_constants              $109..112 = 0
-splat_4_constants              $113..116 = 0
-copy_4_slots_unmasked          $117..120 = m₂(0..3)
-copy_4_slots_unmasked          $121..124 = m₂(4..7)
-copy_4_slots_unmasked          $125..128 = m₂(8..11)
-copy_4_slots_unmasked          $129..132 = m₂(12..15)
-sub_n_floats                   $101..116 -= $117..132
+copy_4_slots_unmasked          $101..104 = m₂(0..3)
+copy_4_slots_unmasked          $105..108 = m₂(4..7)
+copy_4_slots_unmasked          $109..112 = m₂(8..11)
+copy_4_slots_unmasked          $113..116 = m₂(12..15)
+splat_4_constants              $117..120 = 0x80000000 (-0.0)
+splat_4_constants              $121..124 = 0x80000000 (-0.0)
+splat_4_constants              $125..128 = 0x80000000 (-0.0)
+splat_4_constants              $129..132 = 0x80000000 (-0.0)
+bitwise_xor_n_ints             $101..116 ^= $117..132
 copy_4_slots_masked            m₂(0..3) = Mask($101..104)
 copy_4_slots_masked            m₂(4..7) = Mask($105..108)
 copy_4_slots_masked            m₂(8..11) = Mask($109..112)
@@ -329,15 +329,15 @@ copy_4_slots_unmasked          $102..105 = m₂(0..3)
 copy_4_slots_unmasked          $106..109 = m₂(4..7)
 copy_4_slots_unmasked          $110..113 = m₂(8..11)
 copy_4_slots_unmasked          $114..117 = m₂(12..15)
-splat_4_constants              $118..121 = 0
-splat_4_constants              $122..125 = 0
-splat_4_constants              $126..129 = 0
-splat_4_constants              $130..133 = 0
-copy_4_slots_unmasked          $134..137 = testMatrix4x4(0..3)
-copy_4_slots_unmasked          $138..141 = testMatrix4x4(4..7)
-copy_4_slots_unmasked          $142..145 = testMatrix4x4(8..11)
-copy_4_slots_unmasked          $146..149 = testMatrix4x4(12..15)
-sub_n_floats                   $118..133 -= $134..149
+copy_4_slots_unmasked          $118..121 = testMatrix4x4(0..3)
+copy_4_slots_unmasked          $122..125 = testMatrix4x4(4..7)
+copy_4_slots_unmasked          $126..129 = testMatrix4x4(8..11)
+copy_4_slots_unmasked          $130..133 = testMatrix4x4(12..15)
+splat_4_constants              $134..137 = 0x80000000 (-0.0)
+splat_4_constants              $138..141 = 0x80000000 (-0.0)
+splat_4_constants              $142..145 = 0x80000000 (-0.0)
+splat_4_constants              $146..149 = 0x80000000 (-0.0)
+bitwise_xor_n_ints             $118..133 ^= $134..149
 cmpne_n_floats                 $102..117 = notEqual($102..117, $118..133)
 bitwise_or_4_ints              $110..113 |= $114..117
 bitwise_or_4_ints              $106..109 |= $110..113
@@ -465,12 +465,12 @@ sub_4_floats                   $85..88 -= $89..92
 copy_4_slots_masked            m₃ = Mask($85..88)
 store_condition_mask           $85 = CondMask
 copy_4_slots_unmasked          $86..89 = m₃
-splat_4_constants              $90..93 = 0
-copy_slot_unmasked             $94 = scalar
-copy_slot_unmasked             $95 = scalar
-copy_slot_unmasked             $96 = scalar
-copy_slot_unmasked             $97 = scalar
-sub_4_floats                   $90..93 -= $94..97
+copy_slot_unmasked             $90 = scalar
+copy_slot_unmasked             $91 = scalar
+copy_slot_unmasked             $92 = scalar
+copy_slot_unmasked             $93 = scalar
+splat_4_constants              $94..97 = 0x80000000 (-0.0)
+bitwise_xor_4_ints             $90..93 ^= $94..97
 cmpne_4_floats                 $86..89 = notEqual($86..89, $90..93)
 bitwise_or_2_ints              $86..87 |= $88..89
 bitwise_or_int                 $86 |= $87
@@ -640,13 +640,13 @@ store_condition_mask           $54 = CondMask
 copy_4_slots_unmasked          $55..58 = m₄(0..3)
 copy_4_slots_unmasked          $59..62 = m₄(4..7)
 copy_slot_unmasked             $63 = m₄(8)
-splat_4_constants              $64..67 = 0
-splat_4_constants              $68..71 = 0
-copy_constant                  $72 = 0
-copy_3_slots_unmasked          $73..75 = scalar3
-copy_3_slots_unmasked          $76..78 = scalar3
-copy_3_slots_unmasked          $79..81 = scalar3
-sub_n_floats                   $64..72 -= $73..81
+copy_3_slots_unmasked          $64..66 = scalar3
+copy_3_slots_unmasked          $67..69 = scalar3
+copy_3_slots_unmasked          $70..72 = scalar3
+splat_4_constants              $73..76 = 0x80000000 (-0.0)
+splat_4_constants              $77..80 = 0x80000000 (-0.0)
+copy_constant                  $81 = 0x80000000 (-0.0)
+bitwise_xor_n_ints             $64..72 ^= $73..81
 cmpne_n_floats                 $55..63 = notEqual($55..63, $64..72)
 bitwise_or_4_ints              $56..59 |= $60..63
 bitwise_or_2_ints              $56..57 |= $58..59
@@ -856,15 +856,15 @@ copy_4_slots_unmasked          $3..6 = m₅(0..3)
 copy_4_slots_unmasked          $7..10 = m₅(4..7)
 copy_4_slots_unmasked          $11..14 = m₅(8..11)
 copy_4_slots_unmasked          $15..18 = m₅(12..15)
-splat_4_constants              $19..22 = 0
-splat_4_constants              $23..26 = 0
-splat_4_constants              $27..30 = 0
-splat_4_constants              $31..34 = 0
-copy_4_slots_unmasked          $35..38 = scalar4
-copy_4_slots_unmasked          $39..42 = scalar4
-copy_4_slots_unmasked          $43..46 = scalar4
-copy_4_slots_unmasked          $47..50 = scalar4
-sub_n_floats                   $19..34 -= $35..50
+copy_4_slots_unmasked          $19..22 = scalar4
+copy_4_slots_unmasked          $23..26 = scalar4
+copy_4_slots_unmasked          $27..30 = scalar4
+copy_4_slots_unmasked          $31..34 = scalar4
+splat_4_constants              $35..38 = 0x80000000 (-0.0)
+splat_4_constants              $39..42 = 0x80000000 (-0.0)
+splat_4_constants              $43..46 = 0x80000000 (-0.0)
+splat_4_constants              $47..50 = 0x80000000 (-0.0)
+bitwise_xor_n_ints             $19..34 ^= $35..50
 cmpne_n_floats                 $3..18 = notEqual($3..18, $19..34)
 bitwise_or_4_ints              $11..14 |= $15..18
 bitwise_or_4_ints              $7..10 |= $11..14