fix: make service creation timeouts in a cluster more robust (CloudNetService#1085)

derklaro · web-flow · commit 637243964353 · 2023-01-26T18:01:37.000+01:00
### Motivation The current handling of service starts has a small issue in the way services are started: ```mermaid flowchart TB A["Head Node"] -->|Start Request| B["Node to start on"] B -- Create Result --> A B -->|Publish| C["All other components"] ``` The head node waits 5 seconds for the target node to start the service, and if nothing happened, it will try to re-create the service or continue with the normal ticking (which might trigger a new service start attempt). However, the remote node is unaware that the service creation timed out and will still register the service locally and publish its info to the cluster, which might lead to duplicate service creations. ### Modification The new system is more aware of delays and handles creation mistakes much better: ```mermaid flowchart TB A1["Head Node"] -->Z1["Start request with timeout (20 seconds)"] Z1-->|To target node| A2 C1["Head Node Register Try"] C1 -->|"Success (Sent by Head Node to Target Node)"| D2 C1 -->|Failure| G2 Z1 -->|Timeout| G2 C2-.->|"TTL exceeded"| G2 A2["Request received"]-->B2["Service created"]-->C2["Register to waiting services (TTL: 1 Minute)"]-->|Responds with Create Result| C1 D2["Removal from unaccepted services"] D2-->|Success| E2["Register as local Service"] D2-->|TTL exceeded| H2["Unregister from Head Node"] G2["Auto remove"] E2-->|Publish of Service Info| F2["All other components"] ``` That way, the head node takes full control over service creations and no longer allows a node to do things independently of the head node. This allows us to ensure that service registrations in the cluster happen once, and only once, without any side effects for later service starts which are requested by the head node. ### Result Services should no longer get registered as "ghosts" but are fully controlled by the head node and removed properly in case a service creation timeout occurs. ##### Other context Fixes CloudNetService#994
diff --git a/node/src/main/java/eu/cloudnetservice/node/network/listener/message/ServiceChannelMessageListener.java b/node/src/main/java/eu/cloudnetservice/node/network/listener/message/ServiceChannelMessageListener.java
@@ -72,6 +72,14 @@ public void handleChannelMessage(@NonNull ChannelMessageReceiveEvent event) {
               .build()));
         }
 
+        // feedback from a node that a service which should have been moved to accepted
+        // is no longer registered as unaccepted and not yet moved to registered, which
+        // means that the cache ttl on the target node exceeded
+        case "node_to_head_node_unaccepted_service_ttl_exceeded" -> {
+          var serviceUniqueId = event.content().readUniqueId();
+          this.serviceManager.forceRemoveRegisteredService(serviceUniqueId);
+        }
+
         // request to start a service on the local node
         case "head_node_to_node_start_service" -> {
           var configuration = event.content().readObject(ServiceConfiguration.class);
@@ -80,6 +88,31 @@ public void handleChannelMessage(@NonNull ChannelMessageReceiveEvent event) {
           event.binaryResponse(DataBuf.empty().writeObject(ServiceCreateResult.created(service.serviceInfo())));
         }
 
+        // publish the service info of a created service to the cluster
+        case "head_node_to_node_finish_service_registration" -> {
+          var serviceUniqueId = event.content().readUniqueId();
+          var service = this.serviceManager.takeUnacceptedService(serviceUniqueId);
+
+          if (service != null) {
+            // service is still locally present, finish the registration of it
+            service.handleServiceRegister();
+          } else {
+            // service is no longer locally present as unaccepted
+            // re-check if the service was already moved to registered
+            var registeredService = this.serviceManager.localCloudService(serviceUniqueId);
+            if (registeredService == null) {
+              // send this as feedback to the head node in order to remove the registered service from there as well
+              ChannelMessage.builder()
+                .target(event.sender().toTarget())
+                .channel(NetworkConstants.INTERNAL_MSG_CHANNEL)
+                .message("node_to_head_node_unaccepted_service_ttl_exceeded")
+                .buffer(DataBuf.empty().writeUniqueId(serviceUniqueId))
+                .build()
+                .send();
+            }
+          }
+        }
+
         // update of a service in the network
         case "update_service_info" -> {
           var snapshot = event.content().readObject(ServiceInfoSnapshot.class);
diff --git a/node/src/main/java/eu/cloudnetservice/node/service/CloudService.java b/node/src/main/java/eu/cloudnetservice/node/service/CloudService.java
@@ -71,6 +71,9 @@ public interface CloudService extends SpecificCloudServiceProvider {
 
   void publishServiceInfoSnapshot();
 
+  @ApiStatus.Internal
+  void handleServiceRegister();
+
   @ApiStatus.Internal
   void updateServiceInfoSnapshot(@NonNull ServiceInfoSnapshot serviceInfoSnapshot);
 }
diff --git a/node/src/main/java/eu/cloudnetservice/node/service/CloudServiceManager.java b/node/src/main/java/eu/cloudnetservice/node/service/CloudServiceManager.java
@@ -31,7 +31,6 @@
 import lombok.NonNull;
 import org.jetbrains.annotations.ApiStatus;
 import org.jetbrains.annotations.Nullable;
-import org.jetbrains.annotations.UnknownNullability;
 import org.jetbrains.annotations.Unmodifiable;
 import org.jetbrains.annotations.UnmodifiableView;
 
@@ -95,7 +94,21 @@ void addServicePreparer(
   void unregisterLocalService(@NonNull CloudService service);
 
   @ApiStatus.Internal
-  void handleServiceUpdate(@NonNull ServiceInfoSnapshot snapshot, @UnknownNullability NetworkChannel source);
+  void registerUnacceptedService(@NonNull CloudService service);
+
+  @ApiStatus.Internal
+  @Nullable CloudService takeUnacceptedService(@NonNull UUID serviceUniqueId);
+
+  @ApiStatus.Internal
+  void forceRemoveRegisteredService(@NonNull UUID uniqueId);
+
+  @ApiStatus.Internal
+  @Nullable SpecificCloudServiceProvider registerService(
+    @NonNull ServiceInfoSnapshot snapshot,
+    @NonNull NetworkChannel source);
+
+  @ApiStatus.Internal
+  void handleServiceUpdate(@NonNull ServiceInfoSnapshot snapshot, @Nullable NetworkChannel source);
 
   @ApiStatus.Internal
   @NonNull CloudService createLocalCloudService(@NonNull ServiceConfiguration serviceConfiguration);
diff --git a/node/src/main/java/eu/cloudnetservice/node/service/defaults/AbstractService.java b/node/src/main/java/eu/cloudnetservice/node/service/defaults/AbstractService.java
@@ -153,10 +153,10 @@ protected AbstractService(
       -1,
       ServiceLifeCycle.PREPARED,
       configuration.properties().clone());
-    this.pushServiceInfoSnapshotUpdate(ServiceLifeCycle.PREPARED);
+    this.pushServiceInfoSnapshotUpdate(ServiceLifeCycle.PREPARED, false);
 
-    manager.registerLocalService(this);
-    eventManager.callEvent(new CloudServiceCreateEvent(this));
+    // register the service locally for now
+    manager.registerUnacceptedService(this);
   }
 
   protected static @NonNull Path resolveServicePath(
@@ -531,6 +531,19 @@ public void publishServiceInfoSnapshot() {
       .send();
   }
 
+  @Override
+  public void handleServiceRegister() {
+    // just ensure that this service is removed from the cache & moved to a "real" registered local service
+    this.cloudServiceManager.registerLocalService(this);
+    this.cloudServiceManager.takeUnacceptedService(this.serviceId().uniqueId());
+
+    // publish the initial service info to the cluster
+    this.pushServiceInfoSnapshotUpdate(ServiceLifeCycle.PREPARED);
+
+    // notify the local listeners that this service was created
+    this.eventManager.callEvent(new CloudServiceCreateEvent(this));
+  }
+
   @Override
   public void updateServiceInfoSnapshot(@NonNull ServiceInfoSnapshot serviceInfoSnapshot) {
     this.lastServiceInfo = this.currentServiceInfo;
diff --git a/node/src/main/java/eu/cloudnetservice/node/service/defaults/DefaultCloudServiceManager.java b/node/src/main/java/eu/cloudnetservice/node/service/defaults/DefaultCloudServiceManager.java
@@ -16,6 +16,8 @@
 
 package eu.cloudnetservice.node.service.defaults;
 
+import com.github.benmanes.caffeine.cache.Cache;
+import com.github.benmanes.caffeine.cache.Caffeine;
 import com.google.common.collect.ComparisonChain;
 import dev.derklaro.aerogel.PostConstruct;
 import dev.derklaro.aerogel.auto.Provides;
@@ -61,6 +63,7 @@
 import jakarta.inject.Named;
 import jakarta.inject.Singleton;
 import java.nio.file.Path;
+import java.time.Duration;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
@@ -73,7 +76,6 @@
 import java.util.stream.Collectors;
 import lombok.NonNull;
 import org.jetbrains.annotations.Nullable;
-import org.jetbrains.annotations.UnknownNullability;
 import org.jetbrains.annotations.Unmodifiable;
 import org.jetbrains.annotations.UnmodifiableView;
 
@@ -96,6 +98,10 @@ public class DefaultCloudServiceManager implements CloudServiceManager {
   protected final CloudServiceFactory cloudServiceFactory;
 
   protected final Map<UUID, SpecificCloudServiceProvider> knownServices = new ConcurrentHashMap<>();
+  protected final Cache<UUID, CloudService> localUnacceptedServices = Caffeine.newBuilder()
+    .expireAfterWrite(Duration.ofMinutes(1))
+    .build();
+
   protected final Map<String, LocalCloudServiceFactory> cloudServiceFactories = new ConcurrentHashMap<>();
   protected final Map<ServiceEnvironmentType, ServiceConfigurationPreparer> preparers = new ConcurrentHashMap<>();
 
@@ -441,7 +447,48 @@ public void unregisterLocalService(@NonNull CloudService service) {
   }
 
   @Override
-  public void handleServiceUpdate(@NonNull ServiceInfoSnapshot snapshot, @UnknownNullability NetworkChannel source) {
+  public void registerUnacceptedService(@NonNull CloudService service) {
+    this.localUnacceptedServices.put(service.serviceId().uniqueId(), service);
+  }
+
+  @Override
+  public @Nullable CloudService takeUnacceptedService(@NonNull UUID serviceUniqueId) {
+    // this is the correct way to invalidate & get the value associated with the id in the cache
+    // see https://stackoverflow.com/a/67994912/13008679
+    return this.localUnacceptedServices.asMap().remove(serviceUniqueId);
+  }
+
+  @Override
+  public void forceRemoveRegisteredService(@NonNull UUID uniqueId) {
+    this.knownServices.remove(uniqueId);
+  }
+
+  @Override
+  public @Nullable SpecificCloudServiceProvider registerService(
+    @NonNull ServiceInfoSnapshot snapshot,
+    @NonNull NetworkChannel source
+  ) {
+    // check if the service provider is already registered, return null to indicate that we didn't register the service
+    var serviceUniqueId = snapshot.serviceId().uniqueId();
+    if (this.knownServices.containsKey(serviceUniqueId)) {
+      return null;
+    }
+
+    // build the service provider for the newly added service
+    var serviceProvider = this.sender.factory().generateRPCChainBasedApi(
+      this.sender,
+      "serviceProvider",
+      SpecificCloudServiceProvider.class,
+      GenerationContext.forClass(RemoteNodeCloudServiceProvider.class).channelSupplier(() -> source).build()
+    ).newInstance(new Object[]{snapshot}, new Object[]{snapshot.serviceId().uniqueId()});
+
+    // register the service and return the new provider, unless some other thread registered the service
+    var knownProvider = this.knownServices.putIfAbsent(serviceUniqueId, serviceProvider);
+    return knownProvider == null ? serviceProvider : null;
+  }
+
+  @Override
+  public void handleServiceUpdate(@NonNull ServiceInfoSnapshot snapshot, @Nullable NetworkChannel source) {
     // deleted services were removed on the other node - remove it here too
     if (snapshot.lifeCycle() == ServiceLifeCycle.DELETED) {
       this.knownServices.remove(snapshot.serviceId().uniqueId());
@@ -450,14 +497,9 @@ public void handleServiceUpdate(@NonNull ServiceInfoSnapshot snapshot, @UnknownN
       // register the service if the provider is available
       var provider = this.knownServices.get(snapshot.serviceId().uniqueId());
       if (provider == null) {
-        this.knownServices.putIfAbsent(
-          snapshot.serviceId().uniqueId(),
-          this.sender.factory().generateRPCChainBasedApi(
-            this.sender,
-            "serviceProvider",
-            SpecificCloudServiceProvider.class,
-            GenerationContext.forClass(RemoteNodeCloudServiceProvider.class).channelSupplier(() -> source).build()
-          ).newInstance(new Object[]{snapshot}, new Object[]{snapshot.serviceId().uniqueId()}));
+        // this is the only point where the channel has to be present
+        Objects.requireNonNull(source, "Node Network Channel has to be present to register service");
+        this.registerService(snapshot, source);
         LOGGER.fine("Registered remote service %s", null, snapshot.serviceId());
       } else if (provider instanceof RemoteNodeCloudServiceProvider remoteProvider) {
         // update the provider if possible - we need only to handle remote node providers as local providers will update
diff --git a/node/src/main/java/eu/cloudnetservice/node/service/defaults/NodeCloudServiceFactory.java b/node/src/main/java/eu/cloudnetservice/node/service/defaults/NodeCloudServiceFactory.java
@@ -18,6 +18,8 @@
 
 import dev.derklaro.aerogel.PostConstruct;
 import dev.derklaro.aerogel.auto.Provides;
+import eu.cloudnetservice.common.log.LogManager;
+import eu.cloudnetservice.common.log.Logger;
 import eu.cloudnetservice.driver.channel.ChannelMessage;
 import eu.cloudnetservice.driver.channel.ChannelMessageTarget;
 import eu.cloudnetservice.driver.event.EventManager;
@@ -31,6 +33,7 @@
 import eu.cloudnetservice.driver.service.ServiceConfiguration;
 import eu.cloudnetservice.driver.service.ServiceCreateResult;
 import eu.cloudnetservice.driver.service.ServiceCreateRetryConfiguration;
+import eu.cloudnetservice.node.cluster.NodeServer;
 import eu.cloudnetservice.node.cluster.NodeServerProvider;
 import eu.cloudnetservice.node.event.service.CloudServiceNodeSelectEvent;
 import eu.cloudnetservice.node.network.listener.message.ServiceChannelMessageListener;
@@ -51,6 +54,8 @@
 @Provides(CloudServiceFactory.class)
 public class NodeCloudServiceFactory implements CloudServiceFactory {
 
+  private static final Logger LOGGER = LogManager.logger(NodeCloudServiceFactory.class);
+
   private final EventManager eventManager;
   private final CloudServiceManager serviceManager;
   private final NodeServerProvider nodeServerProvider;
@@ -128,10 +133,10 @@ private void registerServiceChannelListener() {
             "head_node_to_node_start_service",
             nodeServer.info().uniqueId(),
             serviceConfiguration);
+
+          // process the service creation result and return it if the creation was successful
+          createResult = this.processServiceStartResponse(createResult, nodeServer);
           if (createResult.state() == ServiceCreateResult.State.CREATED) {
-            // register the service locally in case the registration packet was not sent before a response to this
-            // packet was received
-            this.serviceManager.handleServiceUpdate(createResult.serviceInfo(), nodeServer.channel());
             return createResult;
           }
 
@@ -140,9 +145,12 @@ private void registerServiceChannelListener() {
             maybeServiceConfiguration.retryConfiguration(),
             serviceConfiguration);
         } else {
-          // start on the current node
-          var serviceInfo = this.serviceManager.createLocalCloudService(serviceConfiguration).serviceInfo();
-          return ServiceCreateResult.created(serviceInfo);
+          // start on the current node & publish the service snapshot to all components
+          var createdService = this.serviceManager.createLocalCloudService(serviceConfiguration);
+          createdService.handleServiceRegister();
+
+          // construct the create result
+          return ServiceCreateResult.created(createdService.serviceInfo());
         }
       } finally {
         this.serviceCreationLock.unlock();
@@ -156,6 +164,45 @@ private void registerServiceChannelListener() {
     }
   }
 
+  protected @NonNull ServiceCreateResult processServiceStartResponse(
+    @NonNull ServiceCreateResult result,
+    @NonNull NodeServer associatedNode
+  ) {
+    // if the service creation failed we don't need to check anything
+    if (result.state() != ServiceCreateResult.State.CREATED) {
+      return result;
+    }
+
+    // check if the node is still connected
+    var nodeChannel = associatedNode.channel();
+    if (nodeChannel == null || !associatedNode.available()) {
+      LOGGER.fine(
+        "Unable to register service on node %s as the node is no longer connected",
+        null,
+        associatedNode.info().uniqueId());
+      return ServiceCreateResult.FAILED;
+    }
+
+    // try to register the created service locally
+    var serviceUniqueId = result.serviceInfo().serviceId().uniqueId();
+    var serviceProvider = this.serviceManager.registerService(result.serviceInfo(), nodeChannel);
+    if (serviceProvider != null) {
+      // service registered successfully, finish the registration of the service on the other node
+      ChannelMessage.builder()
+        .channel(NetworkConstants.INTERNAL_MSG_CHANNEL)
+        .message("head_node_to_node_finish_service_registration")
+        .buffer(DataBuf.empty().writeUniqueId(serviceUniqueId))
+        .target(ChannelMessageTarget.Type.NODE, associatedNode.info().uniqueId())
+        .build()
+        .send();
+      return result;
+    } else {
+      // a service with the id already exists, just let the unaccepted service
+      // time out on the other node... ¯\_(ツ)_/¯
+      return ServiceCreateResult.FAILED;
+    }
+  }
+
   protected @NonNull ServiceCreateResult sendNodeServerStartRequest(
     @NonNull String message,
     @NonNull String targetNode,
@@ -169,7 +216,7 @@ private void registerServiceChannelListener() {
       .buffer(DataBuf.empty().writeObject(configuration))
       .build()
       .sendSingleQueryAsync()
-      .get(5, TimeUnit.SECONDS, null);
+      .get(20, TimeUnit.SECONDS, null);
 
     // read the result service info from the buffer, if the there was no response then we need to fail (only the head
     // node should queue start requests)

Original file line number	Diff line number	Diff line change
`@@ -71,6 +71,9 @@ public interface CloudService extends SpecificCloudServiceProvider {`
`71`	`71`
`72`	`72`	`void publishServiceInfoSnapshot();`
`73`	`73`
	`74`	`+ @ApiStatus.Internal`
	`75`	`+ void handleServiceRegister();`
	`76`	`+`
`74`	`77`	`@ApiStatus.Internal`
`75`	`78`	`void updateServiceInfoSnapshot(@NonNull ServiceInfoSnapshot serviceInfoSnapshot);`
`76`	`79`	`}`