WIP - Cluster bootstrap remoting probe method #546

Open · wants to merge 1 commit into base: main
20 changes: 19 additions & 1 deletion cluster-bootstrap/src/main/resources/reference.conf
@@ -99,9 +99,14 @@ akka.management {
# Configures how we communicate with the contact point once it is discovered
contact-point {

# The probe method. Valid values are akka-management and remoting. With akka-management, the
# Akka Management HTTP endpoint is used to discover seed nodes; with remoting, Akka remoting is used.
probe-method = "akka-management"
Member:
why do we want to support both methods?

Contributor (author):
You won't be able to do a rolling upgrade to the new method if you don't support both concurrently.
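
For illustration, a node would opt into the new probe method with a configuration override like the following (a sketch of an application.conf fragment, using only the option added in this diff):

    akka.management.cluster.bootstrap.contact-point {
      # Probe discovered contact points via Akka remoting instead of the
      # Akka Management HTTP endpoint.
      probe-method = "remoting"
    }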


# If no port is discovered along with the host/ip of a contact point this port will be used as fallback
# Also, when no port-name is used and multiple results are returned for a given service, this port is
-    # used to disambiguate. When set to <fallback-port>, defaults to the value of akka.management.http.port
+    # used to disambiguate. When set to <fallback-port>, defaults to the value of akka.management.http.port.
+    # Only used by the akka-management probe method; with remoting, the cluster's remoting port is used.
fallback-port = "<fallback-port>" # port pun, it "complements" 2552 which is often used for Akka remoting

# If some discovered seed node will keep failing to connect for specified period of time,
@@ -126,3 +131,16 @@
}

}

akka.actor {
serializers {
akka-management-cluster-bootstrap = "akka.management.cluster.bootstrap.internal.BootstrapProtocolSerializer"
}
serialization-bindings {
"akka.management.cluster.bootstrap.contactpoint.HttpBootstrapJsonProtocol$SeedNodes" = akka-management-cluster-bootstrap
"akka.management.cluster.bootstrap.internal.RemotingContactPoint$GetSeedNodes$" = akka-management-cluster-bootstrap
}
serialization-identifiers {
"akka.management.cluster.bootstrap.internal.BootstrapProtocolSerializer" = 8788
}
}
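
The BootstrapProtocolSerializer bound above is not part of this excerpt. Judging from the configuration, it follows Akka's standard SerializerWithStringManifest contract; a minimal sketch of that shape (not the PR's actual implementation -- in particular, the encoding of the SeedNodes reply is elided):

    import akka.actor.ExtendedActorSystem
    import akka.serialization.SerializerWithStringManifest

    final class BootstrapProtocolSerializer(system: ExtendedActorSystem) extends SerializerWithStringManifest {
      private val GetSeedNodesManifest = "GSN" // hypothetical manifest string

      // Must match akka.actor.serialization-identifiers above.
      override def identifier: Int = 8788

      override def manifest(o: AnyRef): String = o match {
        case RemotingContactPoint.GetSeedNodes => GetSeedNodesManifest
        case other => throw new IllegalArgumentException(s"Cannot serialize ${other.getClass}")
      }

      // GetSeedNodes is a parameterless request, so empty bytes suffice for it;
      // the real serializer must also handle the SeedNodes reply.
      override def toBinary(o: AnyRef): Array[Byte] = o match {
        case RemotingContactPoint.GetSeedNodes => Array.emptyByteArray
        case other => throw new IllegalArgumentException(s"Cannot serialize ${other.getClass}")
      }

      override def fromBinary(bytes: Array[Byte], manifest: String): AnyRef = manifest match {
        case GetSeedNodesManifest => RemotingContactPoint.GetSeedNodes
        case other => throw new IllegalArgumentException(s"Unknown manifest: $other")
      }
    }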
cluster-bootstrap/src/main/scala/akka/management/cluster/bootstrap/ClusterBootstrap.scala
@@ -7,29 +7,28 @@ package akka.management.cluster.bootstrap
import java.util.concurrent.atomic.AtomicReference

import akka.AkkaVersion

import scala.concurrent.Future
import scala.concurrent.Promise

import akka.actor.ActorSystem
import akka.actor.ExtendedActorSystem
import akka.actor.Extension
import akka.actor.ExtensionId
import akka.actor.ExtensionIdProvider
import akka.annotation.InternalApi
import akka.cluster.Cluster
- import akka.discovery.{ Discovery, ServiceDiscovery }
+ import akka.discovery.{Discovery, ServiceDiscovery}
import akka.event.Logging
import akka.http.scaladsl.model.Uri
import akka.http.scaladsl.server.Route
import akka.management.cluster.bootstrap.contactpoint.HttpClusterBootstrapRoutes
- import akka.management.cluster.bootstrap.internal.BootstrapCoordinator
+ import akka.management.cluster.bootstrap.internal.{BootstrapCoordinator, RemotingContactPoint}
import akka.management.scaladsl.ManagementRouteProviderSettings
import akka.management.scaladsl.ManagementRouteProvider

final class ClusterBootstrap(implicit system: ExtendedActorSystem) extends Extension with ManagementRouteProvider {

import ClusterBootstrap.Internal._
import system.dispatcher

private val log = Logging(system, classOf[ClusterBootstrap])

@@ -59,11 +58,18 @@ final class ClusterBootstrap(implicit system: ExtendedActorSystem) extends Exten
.get
}

-  private[this] val _selfContactPointUri: Promise[Uri] = Promise()
+  private[this] val _selfContactPointUri: Promise[(String, Int)] = settings.contactPoint.probeMethod match {
+    case BootstrapCoordinator.ProbeMethodRemoting =>
+      val self = Cluster(system).selfAddress
+      Promise.successful((self.host.getOrElse(sys.error("No host")), self.port.getOrElse(sys.error("No port"))))
+    case _ => Promise()
+  }

override def routes(routeProviderSettings: ManagementRouteProviderSettings): Route = {
log.info(s"Using self contact point address: ${routeProviderSettings.selfBaseUri}")
this.setSelfContactPoint(routeProviderSettings.selfBaseUri)
if (settings.contactPoint.probeMethod == BootstrapCoordinator.ProbeMethodAkkaManagement) {
log.info(s"Using self contact point address: ${routeProviderSettings.selfBaseUri}")
this.setSelfContactPoint(routeProviderSettings.selfBaseUri)
}

new HttpClusterBootstrapRoutes(settings).routes
}
@@ -80,6 +86,10 @@ final class ClusterBootstrap(implicit system: ExtendedActorSystem) extends Exten

val bootstrapProps = BootstrapCoordinator.props(discovery, joinDecider, settings)
val bootstrap = system.systemActorOf(bootstrapProps, "bootstrapCoordinator")
if (settings.contactPoint.probeMethod == BootstrapCoordinator.ProbeMethodRemoting) {
system.systemActorOf(RemotingContactPoint.props(settings), RemotingContactPoint.RemotingContactPointActorName)
}

// Bootstrap already logs in several other execution points when it can't form a cluster, and why.
bootstrap ! BootstrapCoordinator.Protocol.InitiateBootstrapping
} else log.warning("Bootstrap already initiated, yet start() method was called again. Ignoring.")
@@ -96,13 +106,11 @@ final class ClusterBootstrap(implicit system: ExtendedActorSystem) extends Exten
*/
@InternalApi
private[akka] def setSelfContactPoint(baseUri: Uri): Unit =
-    _selfContactPointUri.success(baseUri)
+    _selfContactPointUri.success((baseUri.authority.host.toString, baseUri.authority.port))

/** INTERNAL API */
@InternalApi private[akka] def selfContactPoint: Future[(String, Int)] =
-    _selfContactPointUri.future.map { uri =>
-      (uri.authority.host.toString, uri.authority.port)
-    }
+    _selfContactPointUri.future
}

object ClusterBootstrap extends ExtensionId[ClusterBootstrap] with ExtensionIdProvider {
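Taken together, these changes make selfContactPoint yield a (host, port) pair directly: pre-completed from Cluster(system).selfAddress when the remoting probe method is active, and completed later by routes(...) under the default akka-management method. The underlying pattern in isolation (a hypothetical, self-contained sketch):

    import scala.concurrent.Promise

    // Complete the promise eagerly when the answer is known at construction
    // time (remoting: the cluster's own address); otherwise leave it pending
    // until the management HTTP endpoint binds.
    def selfContactPoint(knownUpFront: Option[(String, Int)]): Promise[(String, Int)] =
      knownUpFront.fold(Promise[(String, Int)]())(Promise.successful)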
cluster-bootstrap/src/main/scala/akka/management/cluster/bootstrap/ClusterBootstrapSettings.scala
@@ -10,8 +10,10 @@ import java.util.concurrent.TimeUnit

import akka.actor.ActorSystem
import akka.event.LoggingAdapter
import akka.management.cluster.bootstrap.internal.BootstrapCoordinator
import com.typesafe.config.Config
- import scala.concurrent.duration.{ FiniteDuration, _ }

+ import scala.concurrent.duration.{FiniteDuration, _}
import scala.compat.java8.OptionConverters._
import akka.util.JavaDurationConverters._

@@ -122,6 +124,10 @@ final class ClusterBootstrapSettings(config: Config, log: LoggingAdapter) {
object contactPoint {
private val contactPointConfig = bootConfig.getConfig("contact-point")

val probeMethod: String = contactPointConfig.getString("probe-method")

require(BootstrapCoordinator.ValidProbeMethods.contains(probeMethod),
  "Probe method must be one of: " + BootstrapCoordinator.ValidProbeMethods.mkString(", "))

val fallbackPort: Int =
contactPointConfig
.optDefinedValue("fallback-port")
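ProbeMethodAkkaManagement, ProbeMethodRemoting and ValidProbeMethods are referenced from BootstrapCoordinator but not shown in this excerpt; judging from the values documented in reference.conf, they presumably amount to constants along these lines (a sketch):

    object BootstrapCoordinator {
      val ProbeMethodAkkaManagement = "akka-management"
      val ProbeMethodRemoting = "remoting"
      val ValidProbeMethods: Set[String] = Set(ProbeMethodAkkaManagement, ProbeMethodRemoting)
    }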
cluster-bootstrap/src/main/scala/akka/management/cluster/bootstrap/contactpoint/HttpClusterBootstrapRoutes.scala
@@ -5,46 +5,24 @@
package akka.management.cluster.bootstrap.contactpoint

import scala.concurrent.duration._

import akka.actor.ActorSystem
import akka.cluster.Cluster
import akka.cluster.Member
import akka.event.Logging
import akka.event.LoggingAdapter
import akka.http.javadsl.server.directives.RouteAdapter
import akka.http.scaladsl.model.HttpRequest
import akka.http.scaladsl.model.Uri
import akka.http.scaladsl.server.Route
import akka.management.cluster.bootstrap.ClusterBootstrapSettings
import akka.management.cluster.bootstrap.contactpoint.HttpBootstrapJsonProtocol.ClusterMember
import akka.management.cluster.bootstrap.contactpoint.HttpBootstrapJsonProtocol.SeedNodes
import akka.management.cluster.bootstrap.internal.ContactPoint

final class HttpClusterBootstrapRoutes(settings: ClusterBootstrapSettings) extends HttpBootstrapJsonProtocol {

import akka.http.scaladsl.server.Directives._

private def routeGetSeedNodes: Route = extractClientIP { clientIp ⇒
extractActorSystem { implicit system ⇒
-      import akka.cluster.MemberStatus
-      val cluster = Cluster(system)
-
-      def memberToClusterMember(m: Member): ClusterMember =
-        ClusterMember(m.uniqueAddress.address, m.uniqueAddress.longUid, m.status.toString, m.roles)
-
-      val state = cluster.state
-
-      // TODO shuffle the members so in a big deployment nodes start joining different ones and not all the same?
-      val members = state.members
-        .diff(state.unreachable)
-        .filter(
-          m => m.status == MemberStatus.up || m.status == MemberStatus.weaklyUp || m.status == MemberStatus.joining)
-        .take(settings.contactPoint.httpMaxSeedNodesToExpose)
-        .map(memberToClusterMember)
-
-      val info = SeedNodes(cluster.selfMember.uniqueAddress.address, members)
-      log.info("Bootstrap request from {}: Contact Point returning {} seed-nodes ([{}])", clientIp, members.size,
-        members)
-      complete(info)
+      val contactPoint = new ContactPoint(system, settings, log)
+      complete(contactPoint.seedNodes(clientIp.toString))
}
}

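The shared ContactPoint helper that replaces the inline logic above is not included in this excerpt; it exists so that the HTTP route and the new remoting contact point serve identical seed-node answers. Reassembling the removed lines, a plausible sketch of it:

    package akka.management.cluster.bootstrap.internal

    import akka.actor.ActorSystem
    import akka.cluster.{Cluster, Member, MemberStatus}
    import akka.event.LoggingAdapter
    import akka.management.cluster.bootstrap.ClusterBootstrapSettings
    import akka.management.cluster.bootstrap.contactpoint.HttpBootstrapJsonProtocol.{ClusterMember, SeedNodes}

    final class ContactPoint(system: ActorSystem, settings: ClusterBootstrapSettings, log: LoggingAdapter) {

      private val cluster = Cluster(system)

      private def memberToClusterMember(m: Member): ClusterMember =
        ClusterMember(m.uniqueAddress.address, m.uniqueAddress.longUid, m.status.toString, m.roles)

      def seedNodes(requester: String): SeedNodes = {
        val state = cluster.state

        // TODO shuffle the members so in a big deployment nodes start joining different ones and not all the same?
        val members = state.members
          .diff(state.unreachable)
          .filter(m =>
            m.status == MemberStatus.up || m.status == MemberStatus.weaklyUp || m.status == MemberStatus.joining)
          .take(settings.contactPoint.httpMaxSeedNodesToExpose)
          .toSet[Member] // plain Set: avoids needing an Ordering[ClusterMember] when mapping a SortedSet
          .map(memberToClusterMember)

        val info = SeedNodes(cluster.selfMember.uniqueAddress.address, members)
        log.info("Bootstrap request from {}: Contact Point returning {} seed-nodes ([{}])",
          requester, members.size, members)
        info
      }
    }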
cluster-bootstrap/src/main/scala/akka/management/cluster/bootstrap/internal/AbstractContactPointBootstrap.scala (new file)
@@ -0,0 +1,122 @@
/*
* Copyright (C) 2017-2018 Lightbend Inc. <https://www.lightbend.com>
*/

package akka.management.cluster.bootstrap.internal

import java.time.LocalDateTime
import java.util.concurrent.ThreadLocalRandom

import akka.actor.{Actor, ActorLogging, DeadLetterSuppression, Status, Timers}
import akka.annotation.InternalApi
import akka.discovery.ServiceDiscovery.ResolvedTarget
import akka.management.cluster.bootstrap.ClusterBootstrapSettings
import akka.util.Timeout
import akka.pattern.pipe

import scala.concurrent.Future
import scala.concurrent.duration._

@InternalApi
private[bootstrap] object AbstractContactPointBootstrap {

private case object ProbeTick extends DeadLetterSuppression
private val ProbingTimerKey = "probing-key"
}


/**
* Intended to be spawned as child actor by a higher-level Bootstrap coordinator that manages obtaining of the URIs.
*
 * This additional step may at first seem superfluous -- after all, we already have some addresses of the nodes
 * that we'll want to join -- however it is not optional. By communicating with the actual nodes before joining their
 * cluster we're able to inquire about their status, double-check whether they are already part of an existing cluster
 * that we should join, or even coordinate rolling upgrades or more advanced patterns.
*/
@InternalApi
private[bootstrap] abstract class AbstractContactPointBootstrap(
settings: ClusterBootstrapSettings,
contactPoint: ResolvedTarget
) extends Actor
with ActorLogging
with Timers {

import AbstractContactPointBootstrap.ProbeTick
import AbstractContactPointBootstrap.ProbingTimerKey
import akka.management.cluster.bootstrap.contactpoint.HttpBootstrapJsonProtocol._
import context.dispatcher

private val probeInterval = settings.contactPoint.probeInterval
private implicit val probingFailureTimeout: Timeout = Timeout(settings.contactPoint.probingFailureTimeout)

/**
   * If probing keeps failing until the deadline triggers, we notify the parent
   * so that it can initiate discovery again.
*/
private var probingKeepFailingDeadline: Deadline = settings.contactPoint.probingFailureTimeout.fromNow

private def resetProbingKeepFailingWithinDeadline(): Unit =
probingKeepFailingDeadline = settings.contactPoint.probingFailureTimeout.fromNow

override final def preStart(): Unit =
self ! ProbeTick

override final def receive: Receive = {
case ProbeTick ⇒
log.debug("Probing [{}] for seed nodes...", uri)
probe() pipeTo self

case Status.Failure(cause) =>
log.warning("Probing [{}] failed due to: {}", uri, cause.getMessage)
if (probingKeepFailingDeadline.isOverdue()) {
log.error("Overdue of probing-failure-timeout, stop probing, signaling that it's failed")
context.parent ! BootstrapCoordinator.Protocol.ProbingFailed(contactPoint, cause)
context.stop(self)
} else {
// keep probing, hoping the request will eventually succeed
scheduleNextContactPointProbing()
}

case response: SeedNodes ⇒
notifyParentAboutSeedNodes(response)
resetProbingKeepFailingWithinDeadline()
// we keep probing and looking if maybe a cluster does form after all
// (technically could be long polling or web-sockets, but that would need reconnect logic, so this is simpler)
scheduleNextContactPointProbing()
}

/**
* Probe the contact point.
*
   * @param probingFailureTimeout If no reply arrives within this timeout, the returned Future should fail.
* @return A future of the seed nodes.
*/
protected def probe()(implicit probingFailureTimeout: Timeout): Future[SeedNodes]

/**
* Render the URI of the contact point as a string.
*
* This is used for logging purposes.
*/
protected def uri: String

private def notifyParentAboutSeedNodes(members: SeedNodes): Unit = {
val seedAddresses = members.seedNodes.map(_.node)
context.parent ! BootstrapCoordinator.Protocol.ObtainedHttpSeedNodesObservation(timeNow(), contactPoint,
members.selfNode, seedAddresses)
}

private def scheduleNextContactPointProbing(): Unit =
timers.startSingleTimer(ProbingTimerKey, ProbeTick, effectiveProbeInterval())

/** Duration with configured jitter applied */
private def effectiveProbeInterval(): FiniteDuration =
probeInterval + jitter(probeInterval)

def jitter(d: FiniteDuration): FiniteDuration =
(d.toMillis * settings.contactPoint.probeIntervalJitter * ThreadLocalRandom.current().nextDouble()).millis

protected def timeNow(): LocalDateTime =
LocalDateTime.now()

}
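
The concrete probe implementations that extend this abstract class (the HTTP one, and the remoting one that talks to RemotingContactPoint) are not included in this excerpt. Against the contract above, a remoting-based probe could look roughly like this (a sketch; the actor-path and port details are assumptions, not the PR's actual code):

    package akka.management.cluster.bootstrap.internal

    import akka.actor.ActorSelection
    import akka.discovery.ServiceDiscovery.ResolvedTarget
    import akka.management.cluster.bootstrap.ClusterBootstrapSettings
    import akka.management.cluster.bootstrap.contactpoint.HttpBootstrapJsonProtocol.SeedNodes
    import akka.pattern.ask
    import akka.util.Timeout

    import scala.concurrent.Future

    final class RemotingContactPointBootstrap(
        settings: ClusterBootstrapSettings,
        contactPoint: ResolvedTarget)
        extends AbstractContactPointBootstrap(settings, contactPoint) {

      // Assumptions: every node runs the same ActorSystem name (a cluster
      // requirement), classic remoting over "akka.tcp", and the discovered
      // port is the remoting port (2552 as a stand-in default).
      private val remotePort = contactPoint.port.getOrElse(2552)

      override protected def uri: String =
        s"akka.tcp://${context.system.name}@${contactPoint.host}:$remotePort"

      private def remoteContactPoint: ActorSelection =
        context.actorSelection(s"$uri/system/${RemotingContactPoint.RemotingContactPointActorName}")

      override protected def probe()(implicit probingFailureTimeout: Timeout): Future[SeedNodes] =
        (remoteContactPoint ? RemotingContactPoint.GetSeedNodes).mapTo[SeedNodes]
    }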