@@ -700,6 +700,67 @@ void lowerBrentKungPrefixTree(OpBuilder &builder, Location loc,
700700 });
701701}
702702
703+ // TODO: Generalize to other parallel prefix trees.
704+ class LazyKoggeStonePrefixTree {
705+ public:
706+ LazyKoggeStonePrefixTree (OpBuilder &builder, Location loc, int64_t width,
707+ ArrayRef<Value> pPrefix, ArrayRef<Value> gPrefix )
708+ : builder(builder), loc(loc), width(width) {
709+ assert (width > 0 && " width must be positive" );
710+ for (size_t i = 0 ; i < static_cast <size_t >(width); ++i)
711+ prefixCache[{0 , i}] = {pPrefix[i], gPrefix [i]};
712+ }
713+
714+ // Get the final group and propagate values for bit i.
715+ std::pair<Value, Value> getFinal (int64_t i) {
716+ assert (i >= 0 && i < width && " i out of bounds" );
717+ // Final level is ceil(log2(width)) in Kogge-Stone.
718+ return getGroupAndPropagate (llvm::Log2_64_Ceil (width), i);
719+ }
720+
721+ private:
722+ // Recursively get the group and propagate values for bit i at level `level`.
723+ // Level 0 is the initial level with the input propagate and generate values.
724+ // Level n computes the group and propagate values for a stride of 2^(n-1).
725+ // Uses memoization to cache intermediate results.
726+ std::pair<Value, Value> getGroupAndPropagate (int64_t level, int64_t i);
727+ OpBuilder &builder;
728+ Location loc;
729+ int64_t width;
730+ DenseMap<std::pair<int64_t , int64_t >, std::pair<Value, Value>> prefixCache;
731+ };
732+
733+ std::pair<Value, Value>
734+ LazyKoggeStonePrefixTree::getGroupAndPropagate (int64_t level, int64_t i) {
735+ assert (i < static_cast <int64_t >(width) && " i out of bounds" );
736+ auto key = std::make_pair (level, i);
737+ auto it = prefixCache.find (key);
738+ if (it != prefixCache.end ())
739+ return it->second ;
740+
741+ assert (level > 0 && " level must be positive" );
742+
743+ int64_t previousStride = 1ULL << (level - 1 );
744+ if (i < previousStride) {
745+ // No dependency, just copy from the previous level.
746+ auto [propagateI, generateI] = getGroupAndPropagate (level - 1 , i);
747+ prefixCache[key] = {propagateI, generateI};
748+ return prefixCache[key];
749+ }
750+ // Get the dependency index.
751+ int64_t j = i - previousStride;
752+ auto [propagateI, generateI] = getGroupAndPropagate (level - 1 , i);
753+ auto [propagateJ, generateJ] = getGroupAndPropagate (level - 1 , j);
754+ // Group generate: g_i OR (p_i AND g_j)
755+ Value andPG = comb::AndOp::create (builder, loc, propagateI, generateJ);
756+ Value newGenerate = comb::OrOp::create (builder, loc, generateI, andPG);
757+ // Group propagate: p_i AND p_j
758+ Value newPropagate =
759+ comb::AndOp::create (builder, loc, propagateI, propagateJ);
760+ prefixCache[key] = {newPropagate, newGenerate};
761+ return prefixCache[key];
762+ }
763+
703764template <bool lowerToMIG>
704765struct CombAddOpConversion : OpConversionPattern<AddOp> {
705766 using OpConversionPattern<AddOp>::OpConversionPattern;
@@ -1080,37 +1141,49 @@ struct CombICmpOpConversion : OpConversionPattern<ICmpOp> {
10801141 // need the final result. Optimizing this to skip intermediate computations
10811142 // is non-trivial because each iteration depends on results from previous
10821143 // iterations. We rely on DCE passes to remove unused operations.
1083- // TODO: Lazily compute only the required prefix values.
1144+ // TODO: Lazily compute only the required prefix values. Kogge-Stone is
1145+ // already implemented in a lazy manner below, but other architectures can
1146+ // also be optimized.
10841147 static Value computePrefixComparison (ConversionPatternRewriter &rewriter,
10851148 Location loc, SmallVector<Value> pPrefix,
10861149 SmallVector<Value> gPrefix ,
10871150 bool includeEq, AdderArchitecture arch) {
10881151 auto width = pPrefix.size ();
1152+ Value finalGroup, finalPropagate;
10891153 // Apply the appropriate prefix tree algorithm
10901154 switch (arch) {
10911155 case AdderArchitecture::RippleCarry:
10921156 llvm_unreachable (" Ripple-Carry should be handled separately" );
10931157 break ;
1094- case AdderArchitecture::Sklanskey:
1158+ case AdderArchitecture::Sklanskey: {
10951159 lowerSklanskeyPrefixTree (rewriter, loc, pPrefix, gPrefix );
1160+ finalGroup = gPrefix [width - 1 ];
1161+ finalPropagate = pPrefix[width - 1 ];
10961162 break ;
1163+ }
10971164 case AdderArchitecture::KoggeStone:
1098- lowerKoggeStonePrefixTree (rewriter, loc, pPrefix, gPrefix );
1165+ // Use lazy Kogge-Stone implementation to avoid computing all
1166+ // intermediate prefix values.
1167+ std::tie (finalPropagate, finalGroup) =
1168+ LazyKoggeStonePrefixTree (rewriter, loc, width, pPrefix, gPrefix )
1169+ .getFinal (width - 1 );
10991170 break ;
1100- case AdderArchitecture::BrentKung:
1171+ case AdderArchitecture::BrentKung: {
11011172 lowerBrentKungPrefixTree (rewriter, loc, pPrefix, gPrefix );
1173+ finalGroup = gPrefix [width - 1 ];
1174+ finalPropagate = pPrefix[width - 1 ];
11021175 break ;
11031176 }
1177+ }
11041178
11051179 // Final result: gPrefix[width-1] gives us "a < b"
11061180 if (includeEq) {
11071181 // a <= b iff (a < b) OR (a == b)
11081182 // a == b iff pPrefix[width-1] (all bits are equal)
1109- return comb::OrOp::create (rewriter, loc, gPrefix [width - 1 ],
1110- pPrefix[width - 1 ]);
1183+ return comb::OrOp::create (rewriter, loc, finalGroup, finalPropagate);
11111184 }
11121185 // a < b iff gPrefix[width-1]
1113- return gPrefix [width - 1 ] ;
1186+ return finalGroup ;
11141187 }
11151188
11161189 // Construct an unsigned comparator using either ripple-carry or
0 commit comments