From 43b56cd8b822f2b84386f3930941e1cf91f050b0 Mon Sep 17 00:00:00 2001 From: Gao Hongtao Date: Fri, 16 Aug 2024 00:34:03 +0000 Subject: [PATCH] Add failover test Signed-off-by: Gao Hongtao --- banyand/queue/sub/sub.go | 5 +- docs/operation/cluster.md | 41 +++++++++++- test/failover/Makefile | 33 ++++++++++ test/failover/README.md | 110 ++++++++++++++++++++++++++++++++ test/failover/check.sh | 61 ++++++++++++++++++ test/failover/kind.yaml | 23 +++++++ test/failover/oap-pod.yaml | 77 +++++++++++++++++++++++ test/failover/segment.tpl.json | 111 +++++++++++++++++++++++++++++++++ test/failover/setup.md | 49 +++++++++++++++ 9 files changed, 508 insertions(+), 2 deletions(-) create mode 100644 test/failover/Makefile create mode 100644 test/failover/README.md create mode 100755 test/failover/check.sh create mode 100644 test/failover/kind.yaml create mode 100644 test/failover/oap-pod.yaml create mode 100644 test/failover/segment.tpl.json create mode 100644 test/failover/setup.md diff --git a/banyand/queue/sub/sub.go b/banyand/queue/sub/sub.go index bcdc6ab16..c58b99c59 100644 --- a/banyand/queue/sub/sub.go +++ b/banyand/queue/sub/sub.go @@ -68,7 +68,10 @@ func (s *server) Send(stream clusterv1.Service_SendServer) error { return nil } if err != nil { - if status.Code(err) == codes.Canceled { + // If the context is canceled or the deadline is exceeded, the stream will be closed. + // In this case, we should return nil to avoid logging the error. + // Deadline exceeded will be raised when other data nodes are not available, and the client timeout context is triggered. 
+ if status.Code(err) == codes.Canceled || status.Code(err) == codes.DeadlineExceeded { return nil } s.log.Error().Err(err).Msg("failed to receive message") diff --git a/docs/operation/cluster.md b/docs/operation/cluster.md index 86d63ffc2..3b82b9b3e 100644 --- a/docs/operation/cluster.md +++ b/docs/operation/cluster.md @@ -54,4 +54,43 @@ The new added data nodes can be automatically discovered by the existing liaison The cluster's availability is also improved by increasing the number of data nodes, as active data nodes need to handle a lower additional workload when some data nodes become unavailable. For example, if one node out of 2 nodes is unavailable, then 50% of the load is re-distributed across the remaining node, resulting in a 100% per-node workload increase. If one node out of 10 nodes is unavailable, then 10% of the load is re-distributed across the 9 remaining nodes, resulting in only an 11% per-node workload increase. -Increasing the number of etcd nodes can increase the cluster's metadata capacity and improve the cluster's metadata query performance. It can also improve the cluster's metadata availability, as the metadata is replicated across all the etcd nodes. However, the cluster size should be odd to avoid split-brain situations. \ No newline at end of file +Increasing the number of etcd nodes can increase the cluster's metadata capacity and improve the cluster's metadata query performance. It can also improve the cluster's metadata availability, as the metadata is replicated across all the etcd nodes. However, the cluster size should be odd to avoid split-brain situations. + +The steps of adding more data nodes: + +1. Boot up the new data node. They will register themselves to the etcd cluster. The liaison nodes will discover the new data node automatically. +2. If the shards are not balanced, the new data node will receive the shards from the existing data nodes. The shards are balanced automatically. +3. 
Or if the shards are too few to balance, more shards should be created by increasing `shard_num` of the `group`. See the [CRUD Groups](../interacting/bydbctl/schema/group.md) for more details.
+4. The new data node will start to ingest data and serve queries.
+
+## Availability
+
+The BanyanDB cluster remains available for data ingestion and data querying even if some of its components are temporarily unavailable.
+
+### Liaison Node Failure
+
+In the event of a liaison node failure, the cluster remains available when the gRPC load balancer can stop sending requests to the failed liaison node and start sending requests to the remaining liaison nodes. The failed liaison node is replaced by the remaining liaison nodes, and the cluster continues to ingest data and serve queries. However, if the remaining liaison nodes are overloaded, the cluster might face performance degradation.
+
+It is recommended to monitor the cluster's performance and add more liaison nodes in case of performance degradation. A workload management platform, such as Kubernetes, can be used to automatically scale the liaison nodes based on the cluster's performance metrics.
+
+### Data Node Failure
+
+If a data node fails, the cluster remains available. The failed data node is replaced by the remaining data nodes, and the cluster continues to ingest new data and serve queries. If the remaining data nodes are overloaded, the cluster might face performance degradation.
+
+The liaison nodes automatically discover the failed data node through the etcd cluster. They will perform a health check on the failed data node. If the failed data node is not healthy, the liaison nodes will stop sending requests to the failed data node and start sending requests to the remaining data nodes. Otherwise, the liaison nodes will continue sending requests to the failed data node in case of a temporary failure between the etcd cluster and the data node.
+
+Liaison nodes continue serving queries if at least one data node is available. However, the responses might lose some data points that are stored in the failed data node. The lost data points are automatically recovered when the failed data node is back online.
+
+The client might face a "grpc: the client connection is closing" error temporarily when the liaison nodes are switching the requests from the failed data node to the remaining data nodes. The client should retry the request in case of this error.
+
+A workload management platform, such as Kubernetes, can be used to automatically scale the data nodes based on the cluster's performance metrics. But the shard number of the group should be increased manually. A proper practice is to set an expected maximum shard number for the group when creating the group. The shard number should match the maximum number of data nodes that the group can have.
+
+### etcd Node Failure
+
+If an etcd node fails, the cluster can still ingest new data and serve queries of `Stream` and `Measure`. `Property` operations are not available during the etcd node failure.
+
+When the etcd node is back online, the cluster automatically recovers without any manual intervention. If the etcd cluster has lost its data, the client should rerun the metadata initialization process to recover the metadata.
+
+You might see some etcd-related errors in the logs of the liaison nodes and data nodes. These errors are automatically recovered when the failed etcd node is back online.
diff --git a/test/failover/Makefile b/test/failover/Makefile
new file mode 100644
index 000000000..f69bd63f7
--- /dev/null
+++ b/test/failover/Makefile
@@ -0,0 +1,33 @@
+# Licensed to Apache Software Foundation (ASF) under one or more contributor
+# license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright
+# ownership.
Apache Software Foundation (ASF) licenses this file to you under +# the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +QPS ?= 10 + +GROUP ?= "default" + +.PHONY: up_traffic +up_traffic: + curl -XPOST 'http://localhost:12800/mock-data/segments/tasks?qps=$(QPS)&group=$(GROUP)' -H'Content-Type: application/json' -d "@segment.tpl.json" + +.PHONY: ls_traffic +ls_traffic: + curl -XGET 'http://localhost:12800/mock-data/segments/tasks' + +.PHONY: rm_traffic +rm_traffic: + curl -XDELETE 'http://localhost:12800/mock-data/segments/tasks' \ No newline at end of file diff --git a/test/failover/README.md b/test/failover/README.md new file mode 100644 index 000000000..379df5df3 --- /dev/null +++ b/test/failover/README.md @@ -0,0 +1,110 @@ +# Failover and Resilience Test + +## Setup the Cluster + +See [Setup the Cluster](setup.md). + +## Case 1: Liaison Node Failure + +### Steps to simulate a liaison node failure + +1. Add an annotation "failover-try=1" to the Liaison pod to simulate a failure. +2. A new Liaison pod will be created, and the old Liaison pod will be in the `Terminating` state. +3. Check the status of the Liaison pods and OAP console. +4. Check write and query operations. + +### Result of the liaison node failure + +- The first Liaison pod is in the `Terminating` state. +- The second Liaison pod is in the `Running` state. +- The cluster is still available. +- The trace and metrics(5 services) write and read operations are still available. 
+
+## Case 2: Data Node Failure
+
+### Steps to simulate a data node failure
+
+1. Scale the Data pod to 3 replicas. They are `banyandb-0`, `banyandb-1`, and `banyandb-2`.
+2. Scale the Data pod to 2 replicas. `banyandb-2` pod will be terminated.
+3. Check the status of the Data pods, OAP console, and Liaison console.
+4. Check write and query operations.
+
+### Result of the data node failure
+
+- The `banyandb-2` pod is in the `Terminating` state.
+- The cluster is still available.
+- OAP might face "fail to execute the query plan for measure events_minute: broadcast errors: failed to publish message to 10.244.0.76:17912: failed to get stream for node 10.244.0.76:17912: rpc error: code = Canceled desc = grpc: the client connection is closing: invalid query message" error.
+- The trace and metrics (5 services) write and read operations are still available.
+- Partial data loss might occur as `banyandb-2` is down.
+
+```yaml
+2024-08-15 0609:
+  value: 0
+  isemptyvalue: true
+2024-08-15 0610:
+  value: 0
+  isemptyvalue: true
+2024-08-15 0611:
+  value: 0
+  isemptyvalue: true
+2024-08-15 0612:
+  value: 0
+  isemptyvalue: true
+2024-08-15 0613:
+  value: 549
+  isemptyvalue: false
+2024-08-15 0614:
+  value: 541
+  isemptyvalue: false
+2024-08-15 0615:
+  value: 566
+  isemptyvalue: false
+2024-08-15 0616:
+  value: 546
+  isemptyvalue: false
+```
+
+## Case 3: etcd Node Failure
+
+### Steps to simulate an etcd node failure
+
+1. Scale the etcd pod to 0 replicas.
+2. Check the status of the OAP, Data and Liaison console.
+3. Check write and query operations.
+
+### Result of the etcd node failure
+
+1. Liaison and Data pods are available, but will raise an error.
+ +```json +{"level":"warn","ts":1723709128.2490797,"caller":"v3@v3.5.13/retry_interceptor.go:62","msg":"retrying of unary invoker failed","target":"etcd-endpoints://0xc00049e1e0/failover-test-etcd-0.failover-test-etcd-headless.default:2379","attempt":0,"error":"rpc error: code = DeadlineExceeded desc = latest balancer error: last connection error: connection error: desc = \"transport: Error while dialing: dial tcp 10.96.126.15:2379: connect: connection refused\""} +{"level":"error","module":"ETCD","error":"context deadline exceeded","time":"2024-08-15T08:05:28Z","message":"failed to revoke lease 8287064579165108153"} +{"level":"warn","ts":1723709216.6529357,"caller":"v3@v3.5.13/retry_interceptor.go:62","msg":"retrying of unary invoker failed","target":"etcd-endpoints://0xc00049e1e0/failover-test-etcd-0.failover-test-etcd-headless.default:2379","attempt":0,"error":"rpc error: code = DeadlineExceeded desc = latest balancer error: last connection error: connection error: desc = \"transport: Error while dialing: dial tcp: lookup failover-test-etcd-0.failover-test-etcd-headless.default.svc.cluster.local on 10.96.0.10:53: no such host\""} +{"level":"info","ts":1723709216.653035,"caller":"v3@v3.5.13/client.go:210","msg":"Auto sync endpoints failed.","error":"context deadline exceeded"} +``` + +2. The trace and metrics(5 services) write and read operations are still available. +3. `swctl menu get` + +## Case 4: etcd Node recovery + +### Steps to recover the etcd node. + +1. Scale the etcd pod to 1 replica. +2. Check the status of the OAP, Data and Liaison console. +3. Check write and query operations. + +## Result of the etcd node recovery with the correct data + +1. 
Liaison and Data pods are available, and their consoles will show: + +```json + {"level":"warn","ts":1723710245.1049383,"caller":"v3@v3.5.13/retry_interceptor.go:62","msg":"retrying of unary invoker failed","target":"etcd-endpoints://0xc00049e1e0/failover-test-etcd-0.failover-test-etcd-headless.default:2379","attempt":0,"error":"rpc error: code = Unauthenticated desc = etcdserver: invalid auth token"} +``` + +The message means that the client's token is invalid. The client should re-authenticate with the correct token and reconnect. + +2. The trace and metrics(5 services) write and read operations are still available. +3. `swctl menu get` will return data as expected. +4. Add a new Data node, the liaison will automatically add the new Data node to the route table. diff --git a/test/failover/check.sh b/test/failover/check.sh new file mode 100755 index 000000000..4857f5118 --- /dev/null +++ b/test/failover/check.sh @@ -0,0 +1,61 @@ +#!/bin/bash + + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# List of service IDs +# service_0 to service_4 +service_ids=("ImRlZmF1bHQiOjpzZXJ2aWNlXzQ=.1" "ImRlZmF1bHQiOjpzZXJ2aWNlXzM=.1" "ImRlZmF1bHQiOjpzZXJ2aWNlXzI=.1" "ImRlZmF1bHQiOjpzZXJ2aWNlXzE=.1" "ImRlZmF1bHQiOjpzZXJ2aWNlXzA=.1") + +error_num=0 +# Iterate over each service ID +for service_id in "${service_ids[@]}"; do + echo "Checking service ID: $service_id" + + # Run the swctl command + swctl_result=$(swctl --display json metrics linear --name=service_resp_time --service-id="$service_id") + + # Check if swctl result is not an empty list + if [ "$swctl_result" == "[]" ]; then + echo "The swctl result is an empty list for service ID: $service_id. Skipping jq processing." + error_num=$((error_num + 1)) + else + # Filter with jq + result=$(echo "$swctl_result" | jq 'map(select(.IsEmptyValue == true))') + + # Check if the result is an empty list + if [ "$result" != "[]" ]; then + echo "The result is not an empty list. Some items have IsEmptyValue set to true for service ID: $service_id." + error_num=$((error_num + 1)) + fi + fi + + trace_result=$(swctl t ls --service-id="$service_id" | jq -e '.traces | length > 0') + + if [ "$trace_result" == "false" ]; then + echo "No traces found for service ID: $service_id." + error_num=$((error_num + 1)) + fi + + echo +done + +if [ "$error_num" -gt 0 ]; then + echo "Some service IDs failed the check." + exit 1 +fi +echo "All service IDs passed the check." diff --git a/test/failover/kind.yaml b/test/failover/kind.yaml new file mode 100644 index 000000000..cc8090f4a --- /dev/null +++ b/test/failover/kind.yaml @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +nodes: +- role: control-plane + extraPortMappings: + - containerPort: 12800 + hostPort: 12800 + protocol: TCP diff --git a/test/failover/oap-pod.yaml b/test/failover/oap-pod.yaml new file mode 100644 index 000000000..3159ae596 --- /dev/null +++ b/test/failover/oap-pod.yaml @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Pod +metadata: + labels: + component: oap + name: data-generator + namespace: default +spec: + containers: + - env: + - name: JAVA_OPTS + value: -Xmx2g -Xms2g + - name: SW_STORAGE + value: banyandb + - name: SW_STORAGE_BANYANDB_TARGETS + value: banyandb-grpc:17912 + image: ghcr.io/apache/skywalking/data-generator:9b17ff1efeab7a20c870839f59eb0e6af485cd3f + imagePullPolicy: IfNotPresent + livenessProbe: + failureThreshold: 3 + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + tcpSocket: + port: 12800 + timeoutSeconds: 1 + name: oap + ports: + - containerPort: 11800 + name: grpc + protocol: TCP + - containerPort: 12800 + name: rest + protocol: TCP + hostPort: 12800 + readinessProbe: + failureThreshold: 3 + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + tcpSocket: + port: 12800 + timeoutSeconds: 1 + resources: {} + startupProbe: + failureThreshold: 9 + periodSeconds: 10 + successThreshold: 1 + tcpSocket: + port: 12800 + timeoutSeconds: 1 + dnsPolicy: ClusterFirst + enableServiceLinks: true + initContainers: + - command: + - sh + - -c + - for i in $(seq 1 60); do curl banyandb-http:17913/api/healthz && exit 0 || sleep 5; done; exit 1 + image: curlimages/curl + imagePullPolicy: IfNotPresent + name: wait-for-banyandb + resources: {} diff --git a/test/failover/segment.tpl.json b/test/failover/segment.tpl.json new file mode 100644 index 000000000..2f4fe81da --- /dev/null +++ b/test/failover/segment.tpl.json @@ -0,0 +1,111 @@ +{ + "traceId": { + "type": "uuid", + "changingFrequency": "1" + }, + "serviceInstanceName": { + "type": "randomString", + "length": "10", + "letters": true, + "numbers": true, + "domainSize": 10 + }, + "serviceName": { + "type": "fixedString", + "value": "service_" + }, + "segments": { + "type": "randomList", + "size": 5, + "item": { + "endpointName": { + "type": "randomString", + "length": "10", + "prefix": "test_", + "letters": true, + "numbers": true, + "domainSize": 10 + }, + "error": { 
+ "type": "randomInt", + "min": 1, + "max": 1 + }, + "now": { + "type": "time", + "stepMillisecond": 1000, + "waitMillisecond": 1000 + }, + "tags": { + "type": "randomList", + "size": 5, + "item": { + "key": { + "type": "randomString", + "length": "10", + "prefix": "test_tag_", + "letters": true, + "numbers": true, + "domainSize": 5 + }, + "value": { + "type": "randomString", + "length": "10", + "prefix": "test_value_", + "letters": true, + "numbers": true, + "domainSize": 10 + } + } + }, + "spans": { + "type": "randomList", + "size": 5, + "item": { + "latency": { + "type": "randomInt", + "min": 100, + "max": 1000 + }, + "operationName": { + "type": "randomString", + "length": "10", + "prefix": "test_endpoint_", + "letters": true, + "numbers": true + }, + "componentId": { + "type": "randomInt", + "min": "0", + "max": "4" + }, + "error": { + "type": "randomBool", + "possibility": "0.2" + }, + "tags": { + "type": "randomList", + "size": 5, + "item": { + "key": { + "type": "randomString", + "length": "10", + "prefix": "test_tag_key_", + "letters": true, + "numbers": true, + "domainSize": 10 + }, + "value": { + "type": "randomString", + "length": "10", + "prefix": "test_tag_val_", + "letters": true, + "numbers": true + } + } + } + } + } + } + } +} \ No newline at end of file diff --git a/test/failover/setup.md b/test/failover/setup.md new file mode 100644 index 000000000..99fe759d0 --- /dev/null +++ b/test/failover/setup.md @@ -0,0 +1,49 @@ +# Setup the Cluster + +## Provisioning the KinD cluster + +```bash +kind create cluster --config kind.yaml + +kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml +kubectl patch -n kube-system deployment metrics-server --type=json \ + -p '[{"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--kubelet-insecure-tls"}]' +``` + +## Build BanyanDB and Load Image into KinD + +```bash +make docker.build +kind load docker-image apache/skywalking-banyandb:latest +``` + 
+## Deploy BanyanDB + +```bash +helm registry login registry-1.docker.io + +helm install "failover-test" \ + oci://ghcr.io/apache/skywalking-banyandb-helm/skywalking-banyandb-helm \ + --version "0.0.0-973f59b" \ + -n "default" \ + --set image.repository=apache/skywalking-banyandb \ + --set image.tag=latest \ + --set standalone.enabled=false \ + --set cluster.enabled=true \ + --set cluster.liaison.replicas=1 \ + --set cluster.data.replicas=1 \ + --set etcd.enabled=true \ + --set etcd.replicaCount=1 +``` + +## Deploy Data Generator + +```bash +kubectl apply -f oap-pod.yaml +``` + +## Trigger Data Generation + +```bash +make up_traffic +``` \ No newline at end of file