diff --git a/.github/workflows/build-test-dev.yml b/.github/workflows/build-test-dev.yml index 0812ed3..bf8b94b 100644 --- a/.github/workflows/build-test-dev.yml +++ b/.github/workflows/build-test-dev.yml @@ -39,7 +39,7 @@ jobs: - name: Upload Test coverage Reports if: ${{ always() }} - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: code-coverage-report path: | diff --git a/.github/workflows/prep-release.yml b/.github/workflows/prep-release.yml index 7a154d8..6083856 100644 --- a/.github/workflows/prep-release.yml +++ b/.github/workflows/prep-release.yml @@ -4,7 +4,7 @@ on: release_tag: description: 'Release tag' required: true - default: '1.0.0-dev' + default: '1.0.1-dev' prep_internal_release: # Need to distinguish between internal and external releases # Internal release: Will use default internal location for created images (ghcr.io) and will tag and push operator candidate there diff --git a/Dockerfile b/Dockerfile index 9a2a284..870d386 100644 --- a/Dockerfile +++ b/Dockerfile @@ -23,8 +23,8 @@ FROM registry.access.redhat.com/ubi8/ubi-minimal:latest LABEL name="solace/pubsubplus-eventbroker-operator" LABEL vendor="Solace Corporation" -LABEL version="1.0.0" -LABEL release="1.0.0" +LABEL version="1.0.1" +LABEL release="1.0.1" LABEL summary="Solace PubSub+ Event Broker Kubernetes Operator" LABEL description="The Solace PubSub+ Event Broker Kubernetes Operator deploys and manages the lifecycle of PubSub+ Event Brokers" diff --git a/Makefile b/Makefile index 495bd91..4642dd3 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ # To re-generate a bundle for another specific version without changing the standard setup, you can: # - use the VERSION as arg of the bundle target (e.g make bundle VERSION=0.0.2) # - use environment variables to overwrite this value (e.g export VERSION=0.0.2) -VERSION ?= 1.0.0-dev +VERSION ?= 1.0.1-dev # API_VERSION defines the API version for the PubSubPlusEventBroker CRD API_VERSION ?= v1beta1 diff --git 
a/bundle/manifests/pubsubplus-eventbroker-operator.clusterserviceversion.yaml b/bundle/manifests/pubsubplus-eventbroker-operator.clusterserviceversion.yaml index 4cbd579..356145d 100644 --- a/bundle/manifests/pubsubplus-eventbroker-operator.clusterserviceversion.yaml +++ b/bundle/manifests/pubsubplus-eventbroker-operator.clusterserviceversion.yaml @@ -20,8 +20,8 @@ metadata: certified: "true" com.redhat.delivery.operator.bundle: "true" com.redhat.openshift.versions: v4.10 - containerImage: docker.io/solace/pubsubplus-eventbroker-operator:1.0.0 - createdAt: "2023-04-19T16:00:24Z" + containerImage: docker.io/solace/pubsubplus-eventbroker-operator:1.0.1 + createdAt: "2023-09-13T10:40:30Z" description: The Solace PubSub+ Event Broker Operator deploys and manages the lifecycle of PubSub+ Event Brokers operators.openshift.io/valid-subscription: '[]' @@ -29,7 +29,7 @@ metadata: operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 repository: https://github.com/SolaceProducts/pubsubplus-kubernetes-quickstart support: Solace Products - name: pubsubplus-eventbroker-operator.v1.0.0 + name: pubsubplus-eventbroker-operator.v1.0.1 namespace: placeholder spec: apiservicedefinitions: {} @@ -296,7 +296,7 @@ spec: valueFrom: fieldRef: fieldPath: metadata.annotations['olm.targetNamespaces'] - image: docker.io/solace/pubsubplus-eventbroker-operator:1.0.0 + image: docker.io/solace/pubsubplus-eventbroker-operator:1.0.1 imagePullPolicy: Always livenessProbe: httpGet: @@ -411,4 +411,4 @@ spec: provider: name: Solace Corporation url: www.solace.com - version: 1.0.0 + version: 1.0.1 diff --git a/bundle/manifests/pubsubplus.solace.com_pubsubpluseventbrokers.yaml b/bundle/manifests/pubsubplus.solace.com_pubsubpluseventbrokers.yaml index 9cacbb3..a7b5141 100644 --- a/bundle/manifests/pubsubplus.solace.com_pubsubpluseventbrokers.yaml +++ b/bundle/manifests/pubsubplus.solace.com_pubsubpluseventbrokers.yaml @@ -4,7 +4,7 @@ metadata: annotations: 
controller-gen.kubebuilder.io/version: v0.11.3 labels: - app.kubernetes.io/version: v1.0.0 + app.kubernetes.io/version: v1.0.1 name: pubsubpluseventbrokers.pubsubplus.solace.com spec: group: pubsubplus.solace.com diff --git a/bundle/metadata/annotations.yaml b/bundle/metadata/annotations.yaml index d4ab5aa..fb59147 100644 --- a/bundle/metadata/annotations.yaml +++ b/bundle/metadata/annotations.yaml @@ -15,4 +15,4 @@ annotations: operators.operatorframework.io.test.config.v1: tests/scorecard/ # Required by RedHat certification - com.redhat.openshift.versions: "v4.10" + com.redhat.openshift.versions: "v4.11" diff --git a/ci/whitesource/whitesource-agent.config b/ci/whitesource/whitesource-agent.config index 7945775..1a776b4 100644 --- a/ci/whitesource/whitesource-agent.config +++ b/ci/whitesource/whitesource-agent.config @@ -45,7 +45,7 @@ projectVersion= projectToken= productName=pubsubplus-kubernetes-operator -productVersion=v1.0.0 +productVersion=v1.0.1 productToken= updateType=OVERRIDE #requesterEmail=user@provider.com diff --git a/controllers/brokerscripts/init.sh b/controllers/brokerscripts/init.sh index 96f3256..36e7d09 100644 --- a/controllers/brokerscripts/init.sh +++ b/controllers/brokerscripts/init.sh @@ -15,15 +15,15 @@ if [ "${BROKER_TLS_ENABLED}" = "true" ]; then cat /mnt/disks/certs/server/${BROKER_CERT_FILENAME} /mnt/disks/certs/server/${BROKER_CERTKEY_FILENAME} > /dev/shm/server.cert export tls_servercertificate_filepath="/dev/shm/server.cert" fi +# Deal with the fact we cannot accept "-" in router names +export routername=$(echo $(hostname) | sed 's/-//g') if [ "${BROKER_REDUNDANCY}" = "true" ]; then IFS='-' read -ra host_array <<< $(hostname) is_monitor=$([ ${host_array[-2]} = "m" ] && echo 1 || echo 0) is_backup=$([ ${host_array[-2]} = "b" ] && echo 1 || echo 0) namespace=$(echo $STATEFULSET_NAMESPACE) service=${BROKERSERVICES_NAME} - # Deal with the fact we cannot accept "-" in broker names service_name=$(echo ${service} | sed 's/-//g') - export 
routername=$(echo $(hostname) | sed 's/-//g') export redundancy_enable=yes export configsync_enable=yes export redundancy_authentication_presharedkey_key=$(cat /mnt/disks/secrets/presharedauthkey/preshared_auth_key | base64) @@ -37,18 +37,18 @@ if [ "${BROKER_REDUNDANCY}" = "true" ]; then # Non Monitor Nodes if [ "${is_monitor}" = "0" ]; then - case ${is_backup} in - 0) - export nodetype=message_routing - export redundancy_matelink_connectvia=${service}-b-0.${service}-discovery.${namespace}.svc - export redundancy_activestandbyrole=primary - ;; - 1) - export nodetype=message_routing - export redundancy_matelink_connectvia=${service}-p-0.${service}-discovery.${namespace}.svc - export redundancy_activestandbyrole=backup - ;; - esac + case ${is_backup} in + 0) + export nodetype=message_routing + export redundancy_matelink_connectvia=${service}-b-0.${service}-discovery.${namespace}.svc + export redundancy_activestandbyrole=primary + ;; + 1) + export nodetype=message_routing + export redundancy_matelink_connectvia=${service}-p-0.${service}-discovery.${namespace}.svc + export redundancy_activestandbyrole=backup + ;; + esac else export nodetype=monitoring fi diff --git a/controllers/brokerscripts/readiness_check.sh b/controllers/brokerscripts/readiness_check.sh index f80ecf7..6e4827c 100644 --- a/controllers/brokerscripts/readiness_check.sh +++ b/controllers/brokerscripts/readiness_check.sh @@ -19,28 +19,39 @@ set_label () { #Prevent overdriving Kubernetes infra, don't set activity state to same as previous state previous_state=$(get_label "active") if [ "${2}" = "${previous_state}" ]; then - #echo "$(date) INFO: ${APP}-Current and Previous state match (${2}), not updating pod label" - : + #echo "$(date) INFO: ${APP}-Current and Previous state match (${2}), not updating pod label" + : else - echo "$(date) INFO: ${APP}-Updating pod label using K8s API from ${previous_state} to ${2}" - echo "[{\"op\": \"add\", \"path\": \"/metadata/labels/${1}\", \"value\": \"${2}\" }]" > 
/tmp/patch_label.json - K8S=https://kubernetes.default.svc.cluster.local:$KUBERNETES_SERVICE_PORT - KUBE_TOKEN=$(&2 - rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1 + echo "$(date) INFO: ${APP}-Updating pod label using K8s API from ${previous_state} to ${2}" + echo "[{\"op\": \"add\", \"path\": \"/metadata/labels/${1}\", \"value\": \"${2}\" }]" > /tmp/patch_label.json + K8S=https://kubernetes.default.svc.cluster.local:$KUBERNETES_SERVICE_PORT + KUBE_TOKEN=$(&2 + rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1 + fi fi fi - fi +} + + +# Function to get remote sync state +get_router_remote_config_state() { + # Params: $1 is property of config to return for router + routerresults=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ + -q "" \ + -v "/rpc-reply/rpc/show/config-sync/database/remote/tables/table[1]/source-router/${1}") + routerremotesync_result=$(echo ${routerresults} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) + echo $routerremotesync_result } # Main logic: note that there are no re-tries here, if check fails then return not ready. @@ -50,157 +61,159 @@ if [ "${BROKER_REDUNDANCY}" = "true" ]; then is_monitor=$([ ${host_array[-2]} = "m" ] && echo 1 || echo 0) is_backup=$([ ${host_array[-2]} = "b" ] && echo 1 || echo 0) password=$(cat /mnt/disks/secrets/admin/username_admin_password) - # For update (includes SolOS upgrade) purposes, additional checks are required for readiness state when the pod has been started - # This is an update if the LASTVERSION_FILE with K8s controller-revision-hash exists and contents differ from current value - LASTVERSION_FILE=/var/lib/solace/var/lastConfigRevisionBeforeReboot - if [ ! -f ${LASTVERSION_FILE} ] || [[ $(cat ${LASTVERSION_FILE}) != $(get_label "controller-revision-hash") ]] ; then - echo "$(date) INFO: ${APP}-Initial startup or Upgrade detected, running additional checks..." - # Check redundancy - echo "$(date) INFO: ${APP}-Running checks. 
Redundancy state check started..." - results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ - -q "" \ - -v "/rpc-reply/rpc/show/redundancy/redundancy-status") - redundancystatus_results=$(echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) - if [ "${redundancystatus_results}" != "Up" ]; then - echo "$(date) INFO: ${APP}-Redundancy state is not yet up." - rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1 - fi - # Additionally check config-sync status for non-monitoring nodes - echo "$(date) INFO: ${APP}-Running checks. Config-sync state check started..." - if [ "${is_monitor}" = "0" ]; then - results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ - -q "" \ - -v "/rpc-reply/rpc/show/config-sync/status/oper-status") - confsyncstatus_results=$(echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) - if [ "${confsyncstatus_results}" != "Up" ]; then - echo "$(date) INFO: ${APP}-Config-sync state is not yet up." - rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1 - fi - fi - fi - # Record current version in LASTVERSION_FILE - echo $(get_label "controller-revision-hash") > ${LASTVERSION_FILE} # For monitor node just check for redundancy; active label will never be set if [ "${is_monitor}" = "1" ]; then - # Check redundancy - echo "$(date) INFO: ${APP}-Running checks. Redundancy state check started..." - results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ - -q "" \ - -v "/rpc-reply/rpc/show/redundancy/redundancy-status") - redundancystatus_results=$(echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) - if [ "${redundancystatus_results}" != "Up" ]; then - echo "$(date) INFO: ${APP}-Redundancy state is not yet up." - rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1 - fi - if [ ! 
-f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} ]; then - echo "$(date) INFO: ${APP}-All nodes online, monitor node is redundancy ready" - touch ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} - echo "$(date) INFO: ${APP}-Server status check complete for this broker node" - exit 1 - fi - exit 0 + # Check redundancy + results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ + -q "" \ + -v "/rpc-reply/rpc/show/redundancy/redundancy-status") + redundancystatus_results=$(echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) + if [ "${redundancystatus_results}" != "Up" ]; then + echo "$(date) INFO: ${APP}-Waiting for redundancy up, redundancy state is not yet up." + rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1 + fi + if [ ! -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} ]; then + echo "$(date) INFO: ${APP}-All nodes online, monitor node is redundancy ready" + touch ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} + echo "$(date) INFO: ${APP}-Server status check complete for this broker node" + exit 1 + fi + exit 0 fi # End Monitor Node + # From here only message routing nodes. # For Primary or Backup nodes set both service readiness (active label) and k8s readiness (exit return value) health_result=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:5550/health-check/guaranteed-active) case "${health_result}" in - "200") - if [ ! 
-f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} ]; then - echo "$(date) INFO: ${APP}-HA Event Broker health check reported 200, message spool is up" - touch ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} - echo "$(date) INFO: ${APP}-Server status check complete for this broker node" - echo "$(date) INFO: ${APP}-Changing pod label to active" - #exit 1 Removing as this may delay activity switch by 5 seconds - fi - set_label "active" "true" - exit 0 - ;; - "503") - if [[ $(get_label "active") = "true" ]]; then echo "$(date) INFO: ${APP}-HA Event Broker health check reported 503"; fi - set_label "active" "false" - # Further check is required to determine readiness - ;; - *) - echo "$(date) WARN: ${APP}-HA Event Broker health check reported unexpected ${health_result}" - set_label "active" "false" - echo "$(date) INFO: ${APP}-Changing pod label to inactive" - rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1 + "200") + if [ ! -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} ]; then + echo "$(date) INFO: ${APP}-HA Event Broker health check reported 200, message spool is up" + touch ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} + echo "$(date) INFO: ${APP}-Server status check complete for this broker node" + echo "$(date) INFO: ${APP}-Changing pod label to active" + #exit 1 Removing as this may delay activity switch by 5 seconds + fi + set_label "active" "true" + exit 0 + ;; + "503") + if [[ $(get_label "active") = "true" ]]; then echo "$(date) INFO: ${APP}-HA Event Broker health check reported 503"; fi + set_label "active" "false" + # Further check is required to determine readiness + ;; + *) + echo "$(date) WARN: ${APP}-HA Event Broker health check reported unexpected ${health_result}" + set_label "active" "false" + echo "$(date) INFO: ${APP}-Changing pod label to inactive" + rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1 esac # At this point analyzing readiness after health check returned 503 - checking if Event Broker is Standby case "${is_backup}" in - "0") - config_role="primary" - ;; 
- "1") - config_role="backup" - ;; + "0") + config_role="primary" + ;; + "1") + config_role="backup" + ;; esac online_results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ - -q "" \ - -v "/rpc-reply/rpc/show/redundancy/virtual-routers/${config_role}/status/activity[text()]") + -q "" \ + -v "/rpc-reply/rpc/show/redundancy/virtual-routers/${config_role}/status/activity[text()]") local_activity=$(echo ${online_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) case "${local_activity}" in - "Mate Active") - # Check redundancy - results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ - -q "" \ - -v "/rpc-reply/rpc/show/redundancy/redundancy-status") - redundancystatus_results=$(echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) - if [ "${redundancystatus_results}" != "Up" ]; then - echo "$(date) INFO: ${APP}-Running checks.Redundancy state is not yet up." - rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1 - fi - # Additionally check config-sync status for non-monitoring nodes - if [ "${node_ordinal}" != "2" ]; then - results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ - -q "" \ - -v "/rpc-reply/rpc/show/config-sync/status/oper-status") - confsyncstatus_results=$(echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) - if [ "${confsyncstatus_results}" != "Up" ]; then - echo "$(date) INFO: ${APP}-Running checks. Config-sync state is not yet up." + "Mate Active") + # Check redundancy + results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ + -q "" \ + -v "/rpc-reply/rpc/show/redundancy/redundancy-status") + redundancystatus_results=$(echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) + if [ "${redundancystatus_results}" != "Up" ]; then + echo "$(date) INFO: ${APP}-Running checks.Redundancy state is not yet up." 
+ rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1 + fi + # Check config-sync status + results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ + -q "" \ + -v "/rpc-reply/rpc/show/config-sync/status/oper-status") + confsyncstatus_results=$(echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) + if [ "${confsyncstatus_results}" != "Up" ]; then + + # Additional check to confirm config-sync + echo "$(date) INFO: ${APP}-Checking Config-sync Setup. Starting additional checks to confirm config-sync locally..." + + messagevpn_result=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ + -q "" \ + -v "count(/rpc-reply/rpc/show/config-sync/database/local/tables/table)") + messagevpn_total=$(echo ${messagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) + + # Count message_vpns in-sync and compare with total + localmessagevpn_result=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ + -q "" \ + -v "count(//table[sync-state='In-Sync'])") + local_messagevpn_total_insync=$(echo ${localmessagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) + if [ "$messagevpn_total" -ne "$local_messagevpn_total_insync" ]; then + echo "$(date) INFO: ${APP}-Config-sync state is not in-sync locally." + rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1 + fi + + echo "$(date) INFO: ${APP}-Checking Config-sync Setup. Remote config-sync state check starting..." 
+ vpnremotehamate_result=$(get_router_remote_config_state "name") + + remote_messagevpn_result=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ + -q "" \ + -v "count(//table/source-router[name='$vpnremotehamate_result'])") + remote_messagevpn_total=$(echo ${remote_messagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) + + #Count message_vpns in-sync, not stale and compare with total + remotemessagevpn_result=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ + -q "" \ + -v "count(//table/source-router[name='$vpnremotehamate_result' and sync-state='In-Sync' and stale='No'])") + remote_messagevpn_total_insync=$(echo ${remotemessagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) + if [ "$remote_messagevpn_total" -ne "$remote_messagevpn_total_insync" ]; then + echo "$(date) INFO: ${APP}-Config-sync state is not in-sync for remote." + rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1 + fi + fi + # Pass readiness check + if [ ! -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} ]; then + echo "$(date) INFO: ${APP}-Redundancy is up and node is Mate Active" + touch ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} + echo "$(date) INFO: ${APP}-Server status check complete for this broker node" + exit 1 + fi + exit 0 + ;; + *) + echo "$(date) WARN: ${APP}-Health check returned 503 and local activity state is: ${local_activity}, failing readiness check." rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1 - fi - fi - # Pass readiness check - if [ ! -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} ]; then - echo "$(date) INFO: ${APP}-Redundancy is up and node is mate Active" - touch ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} - echo "$(date) INFO: ${APP}-Server status check complete for this broker node" - exit 1 - fi - exit 0 - ;; - *) - echo "$(date) WARN: ${APP}-Health check returned 503 and local activity state is: ${local_activity}, failing readiness check." 
- rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1 - ;; + ;; esac else # nonHA config health_result=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:5550/health-check/guaranteed-active) case "${health_result}" in - "200") - if [ ! -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} ]; then - echo "$(date) INFO: ${APP}-nonHA Event Broker health check reported 200, message spool is up" - touch ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} - echo "$(date) INFO: ${APP}-Server status check complete for this broker node" - echo "$(date) INFO: ${APP}-Changing pod label to active" - exit 1 - fi - set_label "active" "true" - exit 0 - ;; - "503") - if [[ $(get_label "active") = "true" ]]; then echo "$(date) INFO: ${APP}-nonHA Event Broker health check reported 503, message spool is down"; fi - set_label "active" "false" - echo "$(date) INFO: ${APP}-Changing pod label to inactive" - # Fail readiness check - rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1 - ;; - *) - echo "$(date) WARN: ${APP}-nonHA Event Broker health check reported ${health_result}" - set_label "active" "false" - echo "$(date) INFO: ${APP}-Changing pod label to inactive" - # Fail readiness check - rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1 + "200") + if [ ! 
-f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} ]; then + echo "$(date) INFO: ${APP}-nonHA Event Broker health check reported 200, message spool is up" + touch ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} + echo "$(date) INFO: ${APP}-Server status check complete for this broker node" + echo "$(date) INFO: ${APP}-Changing pod label to active" + exit 1 + fi + set_label "active" "true" + exit 0 + ;; + "503") + if [[ $(get_label "active") = "true" ]]; then echo "$(date) INFO: ${APP}-nonHA Event Broker health check reported 503, message spool is down"; fi + set_label "active" "false" + echo "$(date) INFO: ${APP}-Changing pod label to inactive" + # Fail readiness check + rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1 + ;; + *) + echo "$(date) WARN: ${APP}-nonHA Event Broker health check reported ${health_result}" + set_label "active" "false" + echo "$(date) INFO: ${APP}-Changing pod label to inactive" + # Fail readiness check + rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1 esac fi \ No newline at end of file diff --git a/controllers/brokerscripts/semp_query.sh b/controllers/brokerscripts/semp_query.sh index 60ca37e..7b8887c 100644 --- a/controllers/brokerscripts/semp_query.sh +++ b/controllers/brokerscripts/semp_query.sh @@ -39,7 +39,7 @@ if [[ ${url} = "" || ${username} = "" || ${password} = "" ]]; then echo 'missing parameter' exit 1 fi -if [ "$(curl --write-out '%{http_code}' --silent --output /dev/null -u ${username}:${password} ${url}/SEMP)" != "200" ] ; then +if [ "$(curl --write-out '%{http_code}' --silent --output /dev/null -u ${username}:${password} ${url}/SEMP -d '')" != "200" ] ; then echo "management host is not responding" exit 1 fi diff --git a/controllers/brokerscripts/startup-broker.sh b/controllers/brokerscripts/startup-broker.sh index 891aa74..f9c5cbd 100644 --- a/controllers/brokerscripts/startup-broker.sh +++ b/controllers/brokerscripts/startup-broker.sh @@ -7,12 +7,13 @@ echo "$(date) INFO: ${APP}-PubSub+ broker node starting. 
HA flags: HA_configured echo "$(date) INFO: ${APP}-Waiting for management API to become available" password=$(cat /mnt/disks/secrets/admin/username_admin_password) INITIAL_STARTUP_FILE=/var/lib/solace/var/k8s_initial_startup_marker -loop_guard=120 +loop_guard=60 pause=10 count=0 -while [ ${count} -lt ${loop_guard} ]; do +# Wait for Solace Management API +while [ ${count} -lt ${loop_guard} ]; do if /mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 -t ; then - break + break fi run_time=$((${count} * ${pause})) ((count++)) @@ -26,158 +27,205 @@ fi if [ "${BROKER_TLS_ENABLED}" = "true" ]; then rm /dev/shm/server.cert # remove as soon as possible cert_results=$(curl --write-out '%{http_code}' --silent --output /dev/null -k -X PATCH -u admin:${password} https://localhost:1943/SEMP/v2/config/ \ - -H "content-type: application/json" \ - -d "{\"tlsServerCertContent\":\"$(cat /mnt/disks/certs/server/${BROKER_CERT_FILENAME} /mnt/disks/certs/server/${BROKER_CERTKEY_FILENAME} | awk '{printf "%s\\n", $0}')\"}") + -H "content-type: application/json" \ + -d "{\"tlsServerCertContent\":\"$(cat /mnt/disks/certs/server/${BROKER_CERT_FILENAME} /mnt/disks/certs/server/${BROKER_CERTKEY_FILENAME} | awk '{printf "%s\\n", $0}')\"}") if [ "${cert_results}" != "200" ]; then - echo "$(date) ERROR: ${APP}-Unable to set the server certificate, exiting" >&2 - exit 1 + echo "$(date) ERROR: ${APP}-Unable to set the server certificate, exiting" >&2 + exit 1 fi echo "$(date) INFO: ${APP}-Server certificate has been configured" - # Future improvement: enable CA configuration from secret ca.crt fi if [ "${BROKER_REDUNDANCY}" = "true" ]; then + # Function to get remote sync state + get_router_remote_config_state() { + # Params: $1 is property of config to return for router + routerresults=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ + -q "" \ + -v "/rpc-reply/rpc/show/config-sync/database/remote/tables/table[1]/source-router/${1}") + 
routerremotesync_result=$(echo ${routerresults} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) + echo $routerremotesync_result + } # for non-monitor nodes setup redundancy and config-sync if [ "${is_monitor}" = "0" ]; then - resync_step_required="" - role="" - count=0 - while [ ${count} -lt ${loop_guard} ]; do - role_results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ - -q "" \ - -v "/rpc-reply/rpc/show/redundancy/active-standby-role[text()]") - run_time=$((${count} * ${pause})) - case "$(echo ${role_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)" in - "Primary") - role="primary" - break - ;; - "Backup") - role="backup" - break - ;; - esac - ((count++)) - echo "$(date) INFO: ${APP}-Waited ${run_time} seconds, got ${role_results} for this node's active-standby role" - sleep ${pause} - done - if [ ${count} -eq ${loop_guard} ]; then - echo "$(date) ERROR: ${APP}-Could not determine this node's active-standby role" >&2 - exit 1 - fi - # Determine local activity - count=0 - echo "$(date) INFO: ${APP}-Management API is up, determined that this node's active-standby role is: ${role}" - while [ ${count} -lt ${loop_guard} ]; do - online_results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ - -q "" \ - -v "/rpc-reply/rpc/show/redundancy/virtual-routers/${role}/status/activity[text()]") - local_activity=$(echo ${online_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) - run_time=$((${count} * ${pause})) - case "${local_activity}" in - "Local Active") - echo "$(date) INFO: ${APP}-Node activity status is Local Active, after ${run_time} seconds" - # We should only be here on new cluster create, if not this is an indication of unexpected HA procedures - if [[ ! 
-e ${INITIAL_STARTUP_FILE} ]]; then - # Need to issue assert master to get back into sync only one time when the PubSub+ Event Broker starts the first time - echo "$(date) INFO: ${APP}-Broker initial startup detected. This node will assert config-sync configuration over its mate" - resync_step_required="true" - else - echo "$(date) WARN: ${APP}-Unexpected state: this is not an initial startup of the broker and this node reports Local Active. Normally expected nodes are Mate Active after restart" - fi + resync_step_required="" + role="" + count=0 + # Determine node's primary or backup role + while [ ${count} -lt ${loop_guard} ]; do + role_results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ + -q "" \ + -v "/rpc-reply/rpc/show/redundancy/active-standby-role[text()]") + run_time=$((${count} * ${pause})) + case "$(echo ${role_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -)" in + "Primary") + role="primary" break ;; - "Mate Active") - echo "$(date) INFO: ${APP}-Node activity status is Mate Active, after ${run_time} seconds" + "Backup") + role="backup" break ;; - esac - ((count++)) - echo "$(date) INFO: ${APP}-Waited ${run_time} seconds, Local activity state is: ${local_activity}" - sleep ${pause} - done - if [ ${count} -eq ${loop_guard} ]; then - echo "$(date) ERROR: ${APP}-Local activity state never become Local Active or Mate Active" >&2 - exit 1 - fi - # If we need to assert master, then we need to wait for mate to reconcile - if [ "${resync_step_required}" = "true" ]; then + esac + ((count++)) + echo "$(date) INFO: ${APP}-Waited ${run_time} seconds, got ${role_results} for this node's primary or backup role" + sleep ${pause} + done + if [ ${count} -eq ${loop_guard} ]; then + echo "$(date) ERROR: ${APP}-Could not determine this node's primary or backup role" >&2 + exit 1 + fi + echo "$(date) INFO: ${APP}-Management API is up, determined that this node's role is: ${role}" + # Determine activity (local or mate 
active) count=0 - echo "$(date) INFO: ${APP}-Waiting for mate activity state to be 'Standby'" while [ ${count} -lt ${loop_guard} ]; do - online_results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ - -q "" \ - -v "/rpc-reply/rpc/show/redundancy/virtual-routers/${role}/status/detail/priority-reported-by-mate/summary[text()]") - mate_activity=$(echo ${online_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) - run_time=$((${count} * ${pause})) - case "${mate_activity}" in - "Standby") - echo "$(date) INFO: ${APP}-Activity state reported by mate is Standby, after ${run_time} seconds" - break - ;; - esac - ((count++)) - echo "$(date) INFO: ${APP}-Waited ${run_time} seconds, Mate activity state is: ${mate_activity}, not yet in sync" - sleep ${pause} + online_results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ + -q "" \ + -v "/rpc-reply/rpc/show/redundancy/virtual-routers/${role}/status/activity[text()]") + local_activity=$(echo ${online_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) + run_time=$((${count} * ${pause})) + case "${local_activity}" in + "Local Active") + echo "$(date) INFO: ${APP}-Node activity status is Local Active, after ${run_time} seconds" + # We should only be here on new cluster create, if not this is an indication of unexpected HA procedures + if [[ ! -e ${INITIAL_STARTUP_FILE} ]]; then + # Need to issue assert master to get back into sync only one time when the PubSub+ Event Broker starts the first time + echo "$(date) INFO: ${APP}-Broker initial startup detected. This node will assert config-sync configuration over its mate" + resync_step_required="true" + else + echo "$(date) WARN: ${APP}-Unexpected state: this is not an initial startup of the broker and this node reports Local Active. Possibly a redeploy?" 
+ fi + break + ;; + "Mate Active") + echo "$(date) INFO: ${APP}-Node activity status is Mate Active, after ${run_time} seconds" + break + ;; + esac + ((count++)) + echo "$(date) INFO: ${APP}-Waited ${run_time} seconds, node activity state is: ${local_activity}" + sleep ${pause} done if [ ${count} -eq ${loop_guard} ]; then - echo "$(date) ERROR: ${APP}-Mate not in sync, never reached Standby" >&2 - exit 1 + echo "$(date) ERROR: ${APP}-Node activity state never become Local Active or Mate Active" >&2 + exit 1 fi - fi # if assert-master - # Ensure Config-sync connection state is Connected before proceeding - count=0 - echo "$(date) INFO: ${APP}-Waiting for config-sync connected" - while [ ${count} -lt ${loop_guard} ]; do - online_results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ - -q "" \ - -v "/rpc-reply/rpc/show/config-sync/status/client/connection-state") - connection_state=$(echo ${online_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) - run_time=$((${count} * ${pause})) - case "${connection_state}" in - "Connected") - echo "$(date) INFO: ${APP}-Config-sync connection state is Connected, after ${run_time} seconds" - break - ;; - esac - ((count++)) - echo "$(date) INFO: ${APP}-Waited ${run_time} seconds, Config-sync connection state is: ${connection_state}, not yet in Connected" - sleep ${pause} - done - if [ ${count} -eq ${loop_guard} ]; then - echo "$(date) ERROR: ${APP}-Config-sync connection state never reached Connected" >&2 - exit 1 - fi - # Now can issue assert-master command - if [ "${resync_step_required}" = "true" ]; then - echo "$(date) INFO: ${APP}-Initiating assert-master" - /mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ - -q "" - /mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ - -q "*" - fi - # Wait for config-sync results - count=0 - echo "$(date) INFO: ${APP}-Waiting for config-sync results" - while [ ${count} -lt 
${loop_guard} ]; do - online_results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ + # If we need to assert leader, then first wait for mate to report Standby state + if [ "${resync_step_required}" = "true" ]; then + # This branch is AD-active only + count=0 + echo "$(date) INFO: ${APP}-Waiting for mate activity state to be 'Standby'" + while [ ${count} -lt ${loop_guard} ]; do + online_results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ + -q "" \ + -v "/rpc-reply/rpc/show/redundancy/virtual-routers/${role}/status/detail/priority-reported-by-mate/summary[text()]") + mate_activity=$(echo ${online_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) + run_time=$((${count} * ${pause})) + case "${mate_activity}" in + "Standby") + echo "$(date) INFO: ${APP}-Activity state reported by mate is Standby, after ${run_time} seconds" + break + ;; + esac + ((count++)) + echo "$(date) INFO: ${APP}-Waited ${run_time} seconds, Mate activity state is: ${mate_activity}, not yet in sync" + sleep ${pause} + done + if [ ${count} -eq ${loop_guard} ]; then + echo "$(date) ERROR: ${APP}-Mate not in sync, never reached Standby" >&2 + exit 1 + fi + fi # if assert-leader + # Ensure Config-sync connection state is Connected for both primary and backup before proceeding + count=0 + echo "$(date) INFO: ${APP}-Waiting for config-sync connected" + while [ ${count} -lt ${loop_guard} ]; do + online_results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ -q "" \ - -v "/rpc-reply/rpc/show/config-sync/status/oper-status") - confsyncstatus_results=$(echo ${online_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) - run_time=$((${count} * ${pause})) - case "${confsyncstatus_results}" in - "Up") - echo "$(date) INFO: ${APP}-Config-sync is Up, after ${run_time} seconds" + -v "/rpc-reply/rpc/show/config-sync/status/client/connection-state") + 
connection_state=$(echo ${online_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) + run_time=$((${count} * ${pause})) + case "${connection_state}" in + "Connected") + echo "$(date) INFO: ${APP}-Config-sync connection state is Connected, after ${run_time} seconds" + break + ;; + esac + ((count++)) + echo "$(date) INFO: ${APP}-Waited ${run_time} seconds, Config-sync connection state is: ${connection_state}, not yet in Connected" + sleep ${pause} + done + if [ ${count} -eq ${loop_guard} ]; then + echo "$(date) ERROR: ${APP}-Config-sync connection state never reached Connected" >&2 + exit 1 + fi + # Now can issue assert-leader command + if [ "${resync_step_required}" = "true" ]; then + # This branch is AD-active only + echo "$(date) INFO: ${APP}-Initiating assert-leader" + /mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ + -q "" + /mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ + -q "*" + fi + # Wait for config-sync results + count=0 + echo "$(date) INFO: ${APP}-Waiting for config-sync results" + while [ ${count} -lt ${loop_guard} ]; do + online_results=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ + -q "" \ + -v "/rpc-reply/rpc/show/config-sync/status/oper-status") + confsyncstatus_results=$(echo ${online_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) + run_time=$((${count} * ${pause})) + case "${confsyncstatus_results}" in + "Up") + echo "$(date) INFO: ${APP}-Config-sync is Up, after ${run_time} seconds" + break + ;; + esac + ((count++)) + echo "$(date) INFO: ${APP}-Waited ${run_time} seconds, Config-sync is: ${confsyncstatus_results}, not yet Up" + # Additional checks to confirm config-sync (even if reported globally as not Up, it may still be up between local primary and backup in a DR setup) + echo "$(date) INFO: ${APP}-Checking Config-sync Setup. Starting additional checks to confirm config-sync locally..."
+ messagevpn_result=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ + -q "" \ + -v "count(/rpc-reply/rpc/show/config-sync/database/local/tables/table)") + messagevpn_total=$(echo ${messagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) + + # Count message_vpns in-sync and compare with total + localmessagevpn_result=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ + -q "" \ + -v "count(//table[sync-state='In-Sync'])") + local_messagevpn_total_insync=$(echo ${localmessagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) + if [ "$messagevpn_total" -ne "$local_messagevpn_total_insync" ]; then + echo "$(date) INFO: ${APP}-Config-sync state is not in-sync locally." + sleep ${pause} + continue + fi + + echo "$(date) INFO: ${APP}-Checking Config-sync Setup. Remote config-sync state check starting..." + vpnremotehamate_result=$(get_router_remote_config_state "name") + + remote_messagevpn_result=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ + -q "" \ + -v "count(//table/source-router[name='$vpnremotehamate_result'])") + remote_messagevpn_total=$(echo ${remote_messagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) + + #Count message_vpns in-sync, not stale and compare with total + remotemessagevpn_result=$(/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \ + -q "" \ + -v "count(//table/source-router[name='$vpnremotehamate_result' and sync-state='In-Sync' and stale='No'])") + remote_messagevpn_total_insync=$(echo ${remotemessagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -) + if [ "$remote_messagevpn_total" -ne "$remote_messagevpn_total_insync" ]; then + echo "$(date) INFO: ${APP}-Config-sync state is not in-sync for remote." 
+ sleep ${pause} + continue + fi break - ;; - esac - ((count++)) - echo "$(date) INFO: ${APP}-Waited ${run_time} seconds, Config-sync is: ${confsyncstatus_results}, not yet Up" - sleep ${pause} - done - if [ ${count} -eq ${loop_guard} ]; then - echo "$(date) ERROR: ${APP}-Config-sync never reached state \"Up\"" >&2 - exit 1 - fi + done + if [ ${count} -eq ${loop_guard} ]; then + echo "$(date) ERROR: ${APP}-Config-sync never reached state \"Up\"" >&2 + exit 1 + fi fi # if not monitor fi echo "$(date) INFO: ${APP}-PubSub+ Event Broker bringup is complete for this node." @@ -185,4 +233,4 @@ echo "$(date) INFO: ${APP}-PubSub+ Event Broker bringup is complete for this nod if [[ ! -e ${INITIAL_STARTUP_FILE} ]]; then echo "PubSub+ Event Broker initial startup completed on $(date)" > ${INITIAL_STARTUP_FILE} fi -exit 0 +exit 0 \ No newline at end of file diff --git a/deploy/deploy.yaml b/deploy/deploy.yaml index 35cb87a..806c2e4 100644 --- a/deploy/deploy.yaml +++ b/deploy/deploy.yaml @@ -1881,7 +1881,7 @@ spec: env: - name: WATCH_NAMESPACE value: "" - image: docker.io/solace/pubsubplus-eventbroker-operator:1.0.0 + image: docker.io/solace/pubsubplus-eventbroker-operator:1.0.1 imagePullPolicy: Always livenessProbe: httpGet: diff --git a/version.go b/version.go index 41a8082..fec03ba 100644 --- a/version.go +++ b/version.go @@ -15,4 +15,4 @@ // limitations under the License. package main -const version = "1.0.0" +const version = "1.0.1"