graph/db: add zombie channels cleanup routine #10015

Open · wants to merge 3 commits into `master`

18 changes: 3 additions & 15 deletions docs/release-notes/release-notes-0.20.0.md
@@ -33,6 +33,9 @@
known TLV fields were incorrectly encoded into the `ExtraData` field of
messages in the dynamic commitment set.

- Fixed a [zombie channel leak](https://github.com/lightningnetwork/lnd/pull/10015)
  in the in-memory graph cache that resulted in incorrect path finding and
  increased memory usage.

# New Features

## Functional Enhancements
@@ -110,7 +113,6 @@ circuit. The indices are only available for forwarding events saved after v0.20.
* Add graph SQL migration logic:
* [1](https://github.com/lightningnetwork/lnd/pull/10036)
* [2](https://github.com/lightningnetwork/lnd/pull/10050)
* [3](https://github.com/lightningnetwork/lnd/pull/10038)

## RPC Updates
* Previously the `RoutingPolicy` would return the inbound fee record in its
@@ -119,17 +121,8 @@ circuit. The indices are only available for forwarding events saved after v0.20.
[fixed](https://github.com/lightningnetwork/lnd/pull/9572), the affected RPCs
are `SubscribeChannelGraph`, `GetChanInfo`, `GetNodeInfo` and `DescribeGraph`.

* [Fix a bug](https://github.com/lightningnetwork/lnd/pull/10064) where the
`GetChanInfo` was not returning the correct gRPC status code in the cases
where the channel is unknown to the node. The same is done for `LookupInvoice`
for the case where the DB is kvdb backed and no invoices have yet been added
to the database.

## lncli Updates
* Previously, users could only specify one `outgoing_chan_id` when calling the
`lncli queryroutes` or the QueryRoutes RPC. With this change, multiple
`outgoing_chan_id` can be passed during the call.


## Code Health

@@ -163,10 +156,6 @@ reader of a payment request.
| [`routerrpc.SendPaymentV2`](https://lightning.engineering/api-docs/api/lnd/router/send-payment-v2/index.html) | ✅ | `POST` | `/v2/router/send` |
| [`routerrpc.TrackPaymentV2`](https://lightning.engineering/api-docs/api/lnd/router/track-payment-v2/) | ✅ | `GET` | `/v2/router/track/{payment_hash}` |

* We are deprecating `OutgoingChanId` in favour of `OutgoingChanIds` in the
`QueryRoutes` RPC. This [transition](https://github.com/lightningnetwork/lnd/pull/10057) allows us to specify more than one outgoing channel
the pathfinder should use when finding a route.

# Technical and Architectural Updates
## BOLT Spec Updates

@@ -206,6 +195,5 @@ reader of a payment request.
* Funyug
* Mohamed Awnallah
* Pins
* Torkel Rogstad
* Yong Yu
* Ziggie
5 changes: 5 additions & 0 deletions graph/db/graph.go
@@ -79,6 +79,7 @@ func (c *ChannelGraph) Start() error {
return fmt.Errorf("could not populate the graph "+
"cache: %w", err)
}
c.graphCache.Start()
}

c.wg.Add(1)
@@ -96,6 +97,10 @@ func (c *ChannelGraph) Stop() error {
log.Debugf("ChannelGraph shutting down...")
defer log.Debug("ChannelGraph shutdown complete")

if c.graphCache != nil {
c.graphCache.Stop()
}

close(c.quit)
c.wg.Wait()

112 changes: 110 additions & 2 deletions graph/db/graph_cache.go
@@ -3,6 +3,7 @@ package graphdb
import (
"fmt"
"sync"
"time"

"github.com/btcsuite/btcd/btcutil"
"github.com/lightningnetwork/lnd/graph/db/models"
@@ -66,7 +67,18 @@ type GraphCache struct {
nodeChannels map[route.Vertex]map[uint64]*DirectedChannel
nodeFeatures map[route.Vertex]*lnwire.FeatureVector

mtx sync.RWMutex
// zombieIndex tracks channels that would otherwise be leaked during
// removal. Since the caller of RemoveChannel may not know the node IDs,
// these channels are recorded here and removed later by a separate
// cleanup loop.
zombieIndex map[uint64]struct{}

// zombieCleanerInterval is the interval at which the zombie cleaner
// runs to clean up channels that are still missing their nodes.
zombieCleanerInterval time.Duration

mtx sync.RWMutex
quit chan struct{}
wg sync.WaitGroup
}

// NewGraphCache creates a new graphCache.
@@ -83,6 +95,9 @@ func NewGraphCache(preAllocNumNodes int) *GraphCache {
map[route.Vertex]*lnwire.FeatureVector,
preAllocNumNodes,
),
zombieIndex: make(map[uint64]struct{}),
zombieCleanerInterval: time.Hour,
quit: make(chan struct{}),
}
}

@@ -100,6 +115,81 @@ func (c *GraphCache) Stats() string {
numChannels)
}

// Start launches the background goroutine that periodically cleans up zombie
// channels.
func (c *GraphCache) Start() {
c.wg.Add(1)
go c.zombieCleaner()
}

// Stop signals the background cleaner to shut down and waits for it to exit.
func (c *GraphCache) Stop() {
close(c.quit)
c.wg.Wait()
}

// zombieCleaner periodically iterates over the zombie index and removes
// channels that are still missing their nodes.
//
// NOTE: must be run as a goroutine.
func (c *GraphCache) zombieCleaner() {
defer c.wg.Done()

ticker := time.NewTicker(c.zombieCleanerInterval)
defer ticker.Stop()

for {
select {
case <-ticker.C:
c.cleanupZombies()
case <-c.quit:
return
}
}
}

// cleanupZombies attempts to prune channels tracked in the zombie index. If the
// nodes for a channel still cannot be resolved, the channel is deleted from the
// cache.
func (c *GraphCache) cleanupZombies() {
c.mtx.Lock()
defer c.mtx.Unlock()

if len(c.zombieIndex) == 0 {
log.Debug("no zombie channels to clean up")
return
}

// Go through all nodes and their channels once to check if any are
// marked as zombies. This is faster than checking every node for each
// zombie channel, since there are usually many more nodes than zombie
// channels.
for node, chans := range c.nodeChannels {
for cid, ch := range chans {
// If the channel isn't a zombie, skip it.
if _, ok := c.zombieIndex[cid]; !ok {
continue
}

// Delete the channel from the peer's side if it exists.
c.removeChannelIfFound(ch.OtherNode, cid)

// Delete the channel from our side.
delete(chans, cid)
}

// If all channels were deleted for this node, clean up the map
// entry entirely.
if len(chans) == 0 {
delete(c.nodeChannels, node)
}
}

// Now that we have removed all channels that were zombies, we can
// clear the zombie index.
c.zombieIndex = make(map[uint64]struct{})
}

// AddNodeFeatures adds a graph node and its features to the cache.
func (c *GraphCache) AddNodeFeatures(node route.Vertex,
features *lnwire.FeatureVector) {
@@ -251,6 +341,15 @@ func (c *GraphCache) RemoveChannel(node1, node2 route.Vertex, chanID uint64) {
// Remove that one channel from both sides.
c.removeChannelIfFound(node1, chanID)
c.removeChannelIfFound(node2, chanID)

zeroVertex := route.Vertex{}
if node1 == zeroVertex || node2 == zeroVertex {
// If one of the nodes is the zero vertex, we don't have the node ID
// needed to remove the channel from the cache, which would leak it in
// memory. Add it to the zombie index so it can be removed later.
c.zombieIndex[chanID] = struct{}{}

Member: Since we know it's a zombie, why can't we just remove it here?

Contributor (author): related to:

> does it cost more if we just remove it directly inside RemoveChannel?

The reason is that we only have the channel ID, not the node ID. To find the node ID, we need to traverse the nodeChannels map, which is more expensive. If there were a way to access the node ID directly, we might be able to avoid this extra zombieCleaner. I am not entirely sure though. @ellemouton, could you help clarify this?

Member:

> we need to traverse the nodeChannels map, which is more expensive.

How much more expensive? I think it's about tradeoffs here - like how often we call RemoveChannel, and how large the nodeChannels map can be. And if we don't remove it in real time, what's the implication for pathfinding?

Contributor (author): The cost is looping over all nodes and retrieving their channels. It's about 16k nodes and requires a read-write lock on the map for 50-150ms, depending on the node's infrastructure (a guess based on a benchmark with 50k nodes).

Without this PR, zombie channels could be returned during pathfinding, leading to incorrect route attempts. This PR introduces a low-cost check during node-channel retrieval from the cache (used in graph traversal) that filters zombie channels out of the returned slice.

As a result, pathfinding is no longer affected. The zombieCleaner mainly serves to free memory, without needing to lock the map on every read.
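
To make that cost concrete, eager removal inside `RemoveChannel` would have to look roughly like the sketch below. This is illustrative only and not part of this diff; `removeZombieEagerly` is a hypothetical helper that assumes the caller already holds the write lock on the existing `mtx`. The full scan of `nodeChannels` is the cost the 50-150ms estimate above refers to.

```go
// removeZombieEagerly removes a channel for which only the channel ID is
// known by scanning every node's channel map for both endpoints.
//
// NOTE: the caller must hold c.mtx in write mode.
func (c *GraphCache) removeZombieEagerly(chanID uint64) {
	for node, chans := range c.nodeChannels {
		if _, ok := chans[chanID]; !ok {
			continue
		}

		// This node still references the channel, so drop it and
		// remove the node entry entirely if it has no channels left.
		delete(chans, chanID)
		if len(chans) == 0 {
			delete(c.nodeChannels, node)
		}
	}
}
```

The deferred `zombieCleaner` pays that scan once per interval instead of on every `RemoveChannel` call that is missing a node.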

}
}

// removeChannelIfFound removes a single channel from one side.
@@ -290,7 +389,13 @@ func (c *GraphCache) getChannels(node route.Vertex) []*DirectedChannel {

i := 0
channelsCopy := make([]*DirectedChannel, len(channels))
for _, channel := range channels {
for cid, channel := range channels {
if _, ok := c.zombieIndex[cid]; ok {

Collaborator: Any reason not to delete the channel at this point?

Contributor (author): It's because at this point we're holding the mutex in read mode, so multiple reads can happen at once. If we want to delete the channel, we'd need to switch to a write lock, which might slow things down a bit. Another idea is to send the deletion request to a channel and have a separate goroutine handle it when it can grab the write lock.

Have you thought of any other ways we could handle this?
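
A minimal sketch of that "hand it to another goroutine" idea, assuming a new `zombieDeleteReq` type and a handler wired into `Start`/`Stop` (none of which exist in this PR): `getChannels` keeps its read lock and only sends a request, while a dedicated goroutine applies the deletion under the write lock.

```go
// zombieDeleteReq asks the background handler to remove a single zombie
// channel from one node's channel map.
type zombieDeleteReq struct {
	node   route.Vertex
	chanID uint64
}

// handleZombieDeletes drains deletion requests and applies them under the
// write lock, so readers such as getChannels never need to upgrade their
// read lock.
//
// NOTE: must be run as a goroutine.
func (c *GraphCache) handleZombieDeletes(reqs <-chan zombieDeleteReq) {
	defer c.wg.Done()

	for {
		select {
		case req := <-reqs:
			c.mtx.Lock()
			c.removeChannelIfFound(req.node, req.chanID)
			c.mtx.Unlock()

		case <-c.quit:
			return
		}
	}
}
```

`getChannels` could then do a non-blocking send (a `select` with a `default` case) so a full queue never stalls the read path.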

// If this channel is a zombie, we don't want to return
// it, so we skip it.
continue
}

// We need to copy the channel and policy to avoid it being
// updated in the cache if the path finding algorithm sets
// fields on it (currently only the ToNodeFeatures of the
Expand All @@ -305,6 +410,9 @@ func (c *GraphCache) getChannels(node route.Vertex) []*DirectedChannel {
i++
}

// Trim the slice to drop the unused pre-allocated tail entries.
channelsCopy = channelsCopy[:i]

return channelsCopy
}
