WIP - Cluster bootstrap remoting probe method #546

Open · wants to merge 1 commit into base: main
20 changes: 19 additions & 1 deletion cluster-bootstrap/src/main/resources/reference.conf
@@ -99,9 +99,14 @@ akka.management {
# Configures how we communicate with the contact point once it is discovered
contact-point {

# The probe method. Valid values are akka-management and remoting. With akka-management, the
# Akka Management HTTP endpoint is used to discover seed nodes; with remoting, Akka remoting is used.
probe-method = "akka-management"
Member:
why do we want to support both methods?

Contributor (author):
You won't be able to do a rolling upgrade to the new method if you don't support both concurrently.
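
For illustration, a node would opt into the new probe method with a configuration override like the following (a sketch of an application.conf fragment, using only the option added in this diff):

    akka.management.cluster.bootstrap.contact-point {
      # Probe discovered contact points via Akka remoting instead of the
      # Akka Management HTTP endpoint.
      probe-method = "remoting"
    }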


# If no port is discovered along with the host/ip of a contact point this port will be used as fallback
# Also, when no port-name is used and multiple results are returned for a given service, this port is
-    # used to disambiguate. When set to <fallback-port>, defaults to the value of akka.management.http.port
+    # used to disambiguate. When set to <fallback-port>, defaults to the value of akka.management.http.port.
+    # Only used by the akka-management probe method; with remoting, the cluster's remoting port is used.
fallback-port = "<fallback-port>" # port pun, it "complements" 2552 which is often used for Akka remoting

# If some discovered seed node will keep failing to connect for specified period of time,
@@ -126,3 +131,16 @@
}

}

akka.actor {
serializers {
akka-management-cluster-bootstrap = "akka.management.cluster.bootstrap.internal.BootstrapProtocolSerializer"
}
serialization-bindings {
"akka.management.cluster.bootstrap.contactpoint.HttpBootstrapJsonProtocol$SeedNodes" = akka-management-cluster-bootstrap
"akka.management.cluster.bootstrap.internal.RemotingContactPoint$GetSeedNodes$" = akka-management-cluster-bootstrap
}
serialization-identifiers {
"akka.management.cluster.bootstrap.internal.BootstrapProtocolSerializer" = 8788
}
}
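
The BootstrapProtocolSerializer bound above is not part of this excerpt. Judging from the configuration, it follows Akka's standard SerializerWithStringManifest contract; a minimal sketch of that shape (not the PR's actual implementation -- in particular, the encoding of the SeedNodes reply is elided):

    import akka.actor.ExtendedActorSystem
    import akka.serialization.SerializerWithStringManifest

    final class BootstrapProtocolSerializer(system: ExtendedActorSystem) extends SerializerWithStringManifest {
      private val GetSeedNodesManifest = "GSN" // hypothetical manifest string

      // Must match akka.actor.serialization-identifiers above.
      override def identifier: Int = 8788

      override def manifest(o: AnyRef): String = o match {
        case RemotingContactPoint.GetSeedNodes => GetSeedNodesManifest
        case other => throw new IllegalArgumentException(s"Cannot serialize ${other.getClass}")
      }

      // GetSeedNodes is a parameterless request, so empty bytes suffice for it;
      // the real serializer must also handle the SeedNodes reply.
      override def toBinary(o: AnyRef): Array[Byte] = o match {
        case RemotingContactPoint.GetSeedNodes => Array.emptyByteArray
        case other => throw new IllegalArgumentException(s"Cannot serialize ${other.getClass}")
      }

      override def fromBinary(bytes: Array[Byte], manifest: String): AnyRef = manifest match {
        case GetSeedNodesManifest => RemotingContactPoint.GetSeedNodes
        case other => throw new IllegalArgumentException(s"Unknown manifest: $other")
      }
    }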
cluster-bootstrap/src/main/scala/akka/management/cluster/bootstrap/ClusterBootstrap.scala
@@ -7,29 +7,28 @@ package akka.management.cluster.bootstrap
import java.util.concurrent.atomic.AtomicReference

import akka.AkkaVersion

import scala.concurrent.Future
import scala.concurrent.Promise

import akka.actor.ActorSystem
import akka.actor.ExtendedActorSystem
import akka.actor.Extension
import akka.actor.ExtensionId
import akka.actor.ExtensionIdProvider
import akka.annotation.InternalApi
import akka.cluster.Cluster
- import akka.discovery.{ Discovery, ServiceDiscovery }
+ import akka.discovery.{Discovery, ServiceDiscovery}
import akka.event.Logging
import akka.http.scaladsl.model.Uri
import akka.http.scaladsl.server.Route
import akka.management.cluster.bootstrap.contactpoint.HttpClusterBootstrapRoutes
- import akka.management.cluster.bootstrap.internal.BootstrapCoordinator
+ import akka.management.cluster.bootstrap.internal.{BootstrapCoordinator, RemotingContactPoint}
import akka.management.scaladsl.ManagementRouteProviderSettings
import akka.management.scaladsl.ManagementRouteProvider

final class ClusterBootstrap(implicit system: ExtendedActorSystem) extends Extension with ManagementRouteProvider {

import ClusterBootstrap.Internal._
import system.dispatcher

private val log = Logging(system, classOf[ClusterBootstrap])

@@ -59,11 +58,18 @@ final class ClusterBootstrap(implicit system: ExtendedActorSystem) extends Exten
.get
}

-  private[this] val _selfContactPointUri: Promise[Uri] = Promise()
+  private[this] val _selfContactPointUri: Promise[(String, Int)] = settings.contactPoint.probeMethod match {
+    case BootstrapCoordinator.ProbeMethodRemoting =>
+      val self = Cluster(system).selfAddress
+      Promise.successful((self.host.getOrElse(sys.error("No host")), self.port.getOrElse(sys.error("No port"))))
+    case _ => Promise()
+  }

override def routes(routeProviderSettings: ManagementRouteProviderSettings): Route = {
log.info(s"Using self contact point address: ${routeProviderSettings.selfBaseUri}")
this.setSelfContactPoint(routeProviderSettings.selfBaseUri)
if (settings.contactPoint.probeMethod == BootstrapCoordinator.ProbeMethodAkkaManagement) {
log.info(s"Using self contact point address: ${routeProviderSettings.selfBaseUri}")
this.setSelfContactPoint(routeProviderSettings.selfBaseUri)
}

new HttpClusterBootstrapRoutes(settings).routes
}
@@ -80,6 +86,10 @@ final class ClusterBootstrap(implicit system: ExtendedActorSystem) extends Exten

val bootstrapProps = BootstrapCoordinator.props(discovery, joinDecider, settings)
val bootstrap = system.systemActorOf(bootstrapProps, "bootstrapCoordinator")
if (settings.contactPoint.probeMethod == BootstrapCoordinator.ProbeMethodRemoting) {
system.systemActorOf(RemotingContactPoint.props(settings), RemotingContactPoint.RemotingContactPointActorName)
}

// Bootstrap already logs in several other execution points when it can't form a cluster, and why.
bootstrap ! BootstrapCoordinator.Protocol.InitiateBootstrapping
} else log.warning("Bootstrap already initiated, yet start() method was called again. Ignoring.")
@@ -96,13 +106,11 @@ final class ClusterBootstrap(implicit system: ExtendedActorSystem) extends Exten
*/
@InternalApi
private[akka] def setSelfContactPoint(baseUri: Uri): Unit =
-    _selfContactPointUri.success(baseUri)
+    _selfContactPointUri.success((baseUri.authority.host.toString, baseUri.authority.port))

/** INTERNAL API */
@InternalApi private[akka] def selfContactPoint: Future[(String, Int)] =
-    _selfContactPointUri.future.map { uri =>
-      (uri.authority.host.toString, uri.authority.port)
-    }
+    _selfContactPointUri.future
}

object ClusterBootstrap extends ExtensionId[ClusterBootstrap] with ExtensionIdProvider {
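Taken together, these changes make selfContactPoint yield a (host, port) pair directly: pre-completed from Cluster(system).selfAddress when the remoting probe method is active, and completed later by routes(...) under the default akka-management method. The underlying pattern in isolation (a hypothetical, self-contained sketch):

    import scala.concurrent.Promise

    // Complete the promise eagerly when the answer is known at construction
    // time (remoting: the cluster's own address); otherwise leave it pending
    // until the management HTTP endpoint binds.
    def selfContactPoint(knownUpFront: Option[(String, Int)]): Promise[(String, Int)] =
      knownUpFront.fold(Promise[(String, Int)]())(Promise.successful)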
cluster-bootstrap/src/main/scala/akka/management/cluster/bootstrap/ClusterBootstrapSettings.scala
@@ -10,8 +10,10 @@ import java.util.concurrent.TimeUnit

import akka.actor.ActorSystem
import akka.event.LoggingAdapter
import akka.management.cluster.bootstrap.internal.BootstrapCoordinator
import com.typesafe.config.Config
- import scala.concurrent.duration.{ FiniteDuration, _ }

+ import scala.concurrent.duration.{FiniteDuration, _}
import scala.compat.java8.OptionConverters._
import akka.util.JavaDurationConverters._

@@ -122,6 +124,10 @@ final class ClusterBootstrapSettings(config: Config, log: LoggingAdapter) {
object contactPoint {
private val contactPointConfig = bootConfig.getConfig("contact-point")

val probeMethod: String = contactPointConfig.getString("probe-method")

require(BootstrapCoordinator.ValidProbeMethods.contains(probeMethod),
  "Probe method must be one of: " + BootstrapCoordinator.ValidProbeMethods.mkString(", "))

val fallbackPort: Int =
contactPointConfig
.optDefinedValue("fallback-port")
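ProbeMethodAkkaManagement, ProbeMethodRemoting and ValidProbeMethods are referenced from BootstrapCoordinator but not shown in this excerpt; judging from the values documented in reference.conf, they presumably amount to constants along these lines (a sketch):

    object BootstrapCoordinator {
      val ProbeMethodAkkaManagement = "akka-management"
      val ProbeMethodRemoting = "remoting"
      val ValidProbeMethods: Set[String] = Set(ProbeMethodAkkaManagement, ProbeMethodRemoting)
    }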
cluster-bootstrap/src/main/scala/akka/management/cluster/bootstrap/contactpoint/HttpClusterBootstrapRoutes.scala
@@ -5,46 +5,24 @@
package akka.management.cluster.bootstrap.contactpoint

import scala.concurrent.duration._

import akka.actor.ActorSystem
import akka.cluster.Cluster
import akka.cluster.Member
import akka.event.Logging
import akka.event.LoggingAdapter
import akka.http.javadsl.server.directives.RouteAdapter
import akka.http.scaladsl.model.HttpRequest
import akka.http.scaladsl.model.Uri
import akka.http.scaladsl.server.Route
import akka.management.cluster.bootstrap.ClusterBootstrapSettings
import akka.management.cluster.bootstrap.contactpoint.HttpBootstrapJsonProtocol.ClusterMember
import akka.management.cluster.bootstrap.contactpoint.HttpBootstrapJsonProtocol.SeedNodes
import akka.management.cluster.bootstrap.internal.ContactPoint

final class HttpClusterBootstrapRoutes(settings: ClusterBootstrapSettings) extends HttpBootstrapJsonProtocol {

import akka.http.scaladsl.server.Directives._

private def routeGetSeedNodes: Route = extractClientIP { clientIp ⇒
extractActorSystem { implicit system ⇒
-      import akka.cluster.MemberStatus
-      val cluster = Cluster(system)
-
-      def memberToClusterMember(m: Member): ClusterMember =
-        ClusterMember(m.uniqueAddress.address, m.uniqueAddress.longUid, m.status.toString, m.roles)
-
-      val state = cluster.state
-
-      // TODO shuffle the members so in a big deployment nodes start joining different ones and not all the same?
-      val members = state.members
-        .diff(state.unreachable)
-        .filter(
-          m => m.status == MemberStatus.up || m.status == MemberStatus.weaklyUp || m.status == MemberStatus.joining)
-        .take(settings.contactPoint.httpMaxSeedNodesToExpose)
-        .map(memberToClusterMember)
-
-      val info = SeedNodes(cluster.selfMember.uniqueAddress.address, members)
-      log.info("Bootstrap request from {}: Contact Point returning {} seed-nodes ([{}])", clientIp, members.size,
-        members)
-      complete(info)
+      val contactPoint = new ContactPoint(system, settings, log)
+      complete(contactPoint.seedNodes(clientIp.toString))
}
}

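The shared ContactPoint helper that replaces the inline logic above is not included in this excerpt; it exists so that the HTTP route and the new remoting contact point serve identical seed-node answers. Reassembling the removed lines, a plausible sketch of it:

    package akka.management.cluster.bootstrap.internal

    import akka.actor.ActorSystem
    import akka.cluster.{Cluster, Member, MemberStatus}
    import akka.event.LoggingAdapter
    import akka.management.cluster.bootstrap.ClusterBootstrapSettings
    import akka.management.cluster.bootstrap.contactpoint.HttpBootstrapJsonProtocol.{ClusterMember, SeedNodes}

    final class ContactPoint(system: ActorSystem, settings: ClusterBootstrapSettings, log: LoggingAdapter) {

      private val cluster = Cluster(system)

      private def memberToClusterMember(m: Member): ClusterMember =
        ClusterMember(m.uniqueAddress.address, m.uniqueAddress.longUid, m.status.toString, m.roles)

      def seedNodes(requester: String): SeedNodes = {
        val state = cluster.state

        // TODO shuffle the members so in a big deployment nodes start joining different ones and not all the same?
        val members = state.members
          .diff(state.unreachable)
          .filter(m =>
            m.status == MemberStatus.up || m.status == MemberStatus.weaklyUp || m.status == MemberStatus.joining)
          .take(settings.contactPoint.httpMaxSeedNodesToExpose)
          .toSet[Member] // plain Set: avoids needing an Ordering[ClusterMember] when mapping a SortedSet
          .map(memberToClusterMember)

        val info = SeedNodes(cluster.selfMember.uniqueAddress.address, members)
        log.info("Bootstrap request from {}: Contact Point returning {} seed-nodes ([{}])",
          requester, members.size, members)
        info
      }
    }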
cluster-bootstrap/src/main/scala/akka/management/cluster/bootstrap/internal/AbstractContactPointBootstrap.scala (new file)
@@ -0,0 +1,122 @@
/*
* Copyright (C) 2017-2018 Lightbend Inc. <https://www.lightbend.com>
*/

package akka.management.cluster.bootstrap.internal

import java.time.LocalDateTime
import java.util.concurrent.ThreadLocalRandom

import akka.actor.{Actor, ActorLogging, DeadLetterSuppression, Status, Timers}
import akka.annotation.InternalApi
import akka.discovery.ServiceDiscovery.ResolvedTarget
import akka.management.cluster.bootstrap.ClusterBootstrapSettings
import akka.util.Timeout
import akka.pattern.pipe

import scala.concurrent.Future
import scala.concurrent.duration._

@InternalApi
private[bootstrap] object AbstractContactPointBootstrap {

private case object ProbeTick extends DeadLetterSuppression
private val ProbingTimerKey = "probing-key"
}


/**
* Intended to be spawned as child actor by a higher-level Bootstrap coordinator that manages obtaining of the URIs.
*
 * This additional step may at first seem superfluous -- after all, we already have some addresses of the nodes
 * that we'll want to join -- however it is not optional. By communicating with the actual nodes before joining their
 * cluster we're able to inquire about their status, double-check whether they are already part of an existing cluster
 * that we should join, or even coordinate rolling upgrades or more advanced patterns.
*/
@InternalApi
private[bootstrap] abstract class AbstractContactPointBootstrap(
settings: ClusterBootstrapSettings,
contactPoint: ResolvedTarget
) extends Actor
with ActorLogging
with Timers {

import AbstractContactPointBootstrap.ProbeTick
import AbstractContactPointBootstrap.ProbingTimerKey
import akka.management.cluster.bootstrap.contactpoint.HttpBootstrapJsonProtocol._
import context.dispatcher

private val probeInterval = settings.contactPoint.probeInterval
private implicit val probingFailureTimeout: Timeout = Timeout(settings.contactPoint.probingFailureTimeout)

/**
   * If probing keeps failing until the deadline triggers, we notify the parent
   * so that it can initiate discovery again.
*/
private var probingKeepFailingDeadline: Deadline = settings.contactPoint.probingFailureTimeout.fromNow

private def resetProbingKeepFailingWithinDeadline(): Unit =
probingKeepFailingDeadline = settings.contactPoint.probingFailureTimeout.fromNow

override final def preStart(): Unit =
self ! ProbeTick

override final def receive: Receive = {
case ProbeTick ⇒
log.debug("Probing [{}] for seed nodes...", uri)
probe() pipeTo self

case Status.Failure(cause) =>
log.warning("Probing [{}] failed due to: {}", uri, cause.getMessage)
if (probingKeepFailingDeadline.isOverdue()) {
log.error("Overdue of probing-failure-timeout, stop probing, signaling that it's failed")
context.parent ! BootstrapCoordinator.Protocol.ProbingFailed(contactPoint, cause)
context.stop(self)
} else {
// keep probing, hoping the request will eventually succeed
scheduleNextContactPointProbing()
}

case response: SeedNodes ⇒
notifyParentAboutSeedNodes(response)
resetProbingKeepFailingWithinDeadline()
// we keep probing and looking if maybe a cluster does form after all
// (technically could be long polling or web-sockets, but that would need reconnect logic, so this is simpler)
scheduleNextContactPointProbing()
}

/**
* Probe the contact point.
*
   * @param probingFailureTimeout If no reply arrives within this timeout, the returned Future should fail.
* @return A future of the seed nodes.
*/
protected def probe()(implicit probingFailureTimeout: Timeout): Future[SeedNodes]

/**
* Render the URI of the contact point as a string.
*
* This is used for logging purposes.
*/
protected def uri: String

private def notifyParentAboutSeedNodes(members: SeedNodes): Unit = {
val seedAddresses = members.seedNodes.map(_.node)
context.parent ! BootstrapCoordinator.Protocol.ObtainedHttpSeedNodesObservation(timeNow(), contactPoint,
members.selfNode, seedAddresses)
}

private def scheduleNextContactPointProbing(): Unit =
timers.startSingleTimer(ProbingTimerKey, ProbeTick, effectiveProbeInterval())

/** Duration with configured jitter applied */
private def effectiveProbeInterval(): FiniteDuration =
probeInterval + jitter(probeInterval)

def jitter(d: FiniteDuration): FiniteDuration =
(d.toMillis * settings.contactPoint.probeIntervalJitter * ThreadLocalRandom.current().nextDouble()).millis

protected def timeNow(): LocalDateTime =
LocalDateTime.now()

}
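
The concrete probe implementations that extend this abstract class (the HTTP one, and the remoting one that talks to RemotingContactPoint) are not included in this excerpt. Against the contract above, a remoting-based probe could look roughly like this (a sketch; the actor-path and port details are assumptions, not the PR's actual code):

    package akka.management.cluster.bootstrap.internal

    import akka.actor.ActorSelection
    import akka.discovery.ServiceDiscovery.ResolvedTarget
    import akka.management.cluster.bootstrap.ClusterBootstrapSettings
    import akka.management.cluster.bootstrap.contactpoint.HttpBootstrapJsonProtocol.SeedNodes
    import akka.pattern.ask
    import akka.util.Timeout

    import scala.concurrent.Future

    final class RemotingContactPointBootstrap(
        settings: ClusterBootstrapSettings,
        contactPoint: ResolvedTarget)
        extends AbstractContactPointBootstrap(settings, contactPoint) {

      // Assumptions: every node runs the same ActorSystem name (a cluster
      // requirement), classic remoting over "akka.tcp", and the discovered
      // port is the remoting port (2552 as a stand-in default).
      private val remotePort = contactPoint.port.getOrElse(2552)

      override protected def uri: String =
        s"akka.tcp://${context.system.name}@${contactPoint.host}:$remotePort"

      private def remoteContactPoint: ActorSelection =
        context.actorSelection(s"$uri/system/${RemotingContactPoint.RemotingContactPointActorName}")

      override protected def probe()(implicit probingFailureTimeout: Timeout): Future[SeedNodes] =
        (remoteContactPoint ? RemotingContactPoint.GetSeedNodes).mapTo[SeedNodes]
    }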