Skip to content

Commit 3912511

Browse files
committed
Send script kill to busy nodes
1 parent 0db4115 commit 3912511

File tree

5 files changed

+51
-0
lines changed

5 files changed

+51
-0
lines changed

internal/app/repair.go

+6
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,12 @@ func (app *App) repairLocalNode(master string) bool {
143143
app.nodeFailTime[local.FQDN()] = time.Now()
144144
}
145145
failedTime := time.Since(app.nodeFailTime[local.FQDN()])
146+
if failedTime > app.config.Valkey.BusyTimeout && strings.HasPrefix(err.Error(), "BUSY ") {
147+
err = local.ScriptKill(app.ctx)
148+
if err != nil {
149+
app.logger.Error("Local node is busy running a script. But SCRIPT KILL failed", "error", err)
150+
}
151+
}
146152
if failedTime > app.config.Valkey.RestartTimeout && !strings.HasPrefix(err.Error(), "LOADING ") {
147153
app.nodeFailTime[local.FQDN()] = time.Now()
148154
err = local.Restart(app.ctx)

internal/config/config.go

+2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ type ValkeyConfig struct {
2525
WriteTimeout time.Duration `yaml:"write_timeout"`
2626
DNSTTL time.Duration `yaml:"dns_ttl"`
2727
FailoverTimeout time.Duration `yaml:"failover_timeout"`
28+
BusyTimeout time.Duration `yaml:"busy_timeout"`
2829
Port int `yaml:"port"`
2930
RestartTimeout time.Duration `yaml:"restart_timeout"`
3031
WaitReplicationTimeout time.Duration `yaml:"wait_replication_timeout"`
@@ -99,6 +100,7 @@ func DefaultValkeyConfig() ValkeyConfig {
99100
WaitPoisonPillTimeout: 30 * time.Second,
100101
StaleReplicaLagClose: 60 * time.Second,
101102
StaleReplicaLagOpen: 10 * time.Second,
103+
BusyTimeout: 5 * time.Second,
102104
MaxParallelSyncs: 1,
103105
AllowDataLoss: false,
104106
TurnBeforeSwitchover: false,

internal/valkey/node.go

+9
Original file line numberDiff line numberDiff line change
@@ -703,3 +703,12 @@ func (n *Node) HasClusterSlots(ctx context.Context) (bool, error) {
703703
}
704704
return false, nil
705705
}
706+
707+
// ScriptKill sends the SCRIPT KILL command to the node.
//
// It first calls ensureConn and returns that error unchanged if it fails.
// Otherwise it builds the command with the connection's command builder,
// executes it with ctx and returns whatever the result's Error method
// reports. The method does not check for the BUSY state itself; deciding
// when a kill is appropriate is left to the caller.
708+
func (n *Node) ScriptKill(ctx context.Context) error {
709+
err := n.ensureConn()
710+
if err != nil {
711+
return err
712+
}
713+
return n.conn.Do(ctx, n.conn.B().ScriptKill().Build()).Error()
714+
}

tests/features/07_cluster_local_repair.feature

+17
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,22 @@
11
Feature: Cluster mode local node repair
22

3+
# Waits for a healthy clustered shard (valkey1 master, valkey2 and valkey3
# replicating, all three listed in /test/active_nodes), then starts an endless
# Lua script (`while true do end`) on valkey1 through an asynchronous
# valkey-cli EVAL and expects the host to become available within 60 seconds.
# NOTE(review): only availability is asserted; the steps do not verify that
# recovery came from SCRIPT KILL specifically - confirm this is sufficient.
Scenario: Busy cluster node gets a SCRIPT KILL
4+
Given clustered shard is up and running
5+
Then valkey host "valkey1" should be master
6+
And valkey host "valkey2" should become replica of "valkey1" within "15" seconds
7+
And replication on valkey host "valkey2" should run fine within "15" seconds
8+
And valkey host "valkey3" should become replica of "valkey1" within "15" seconds
9+
And replication on valkey host "valkey3" should run fine within "15" seconds
10+
And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds
11+
"""
12+
["valkey1","valkey2","valkey3"]
13+
"""
14+
When I run async command on host "valkey1"
15+
"""
16+
valkey-cli -a functestpassword eval 'while true do end' 0
17+
"""
18+
Then valkey host "valkey1" should become available within "60" seconds
19+
320
Scenario: Cluster mode replica is restarted after OOM
421
Given clustered shard is up and running
522
Then valkey host "valkey1" should be master

tests/features/07_sentinel_local_repair.feature

+17
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,23 @@ Feature: Sentinel mode local node repair
1717
"""
1818
Then senticache host "valkey1" should have master "valkey1" within "30" seconds
1919

20+
# Waits for a healthy sentinel-mode shard (valkey1 master, valkey2 and valkey3
# replicating, all three listed in /test/active_nodes), then starts an endless
# Lua script (`while true do end`) on valkey1 through an asynchronous
# valkey-cli EVAL and expects the host to become available within 60 seconds.
# NOTE(review): only availability is asserted; the steps do not verify that
# recovery came from SCRIPT KILL specifically - confirm this is sufficient.
Scenario: Busy sentinel mode node gets a SCRIPT KILL
21+
Given sentinel shard is up and running
22+
Then valkey host "valkey1" should be master
23+
And valkey host "valkey2" should become replica of "valkey1" within "15" seconds
24+
And replication on valkey host "valkey2" should run fine within "15" seconds
25+
And valkey host "valkey3" should become replica of "valkey1" within "15" seconds
26+
And replication on valkey host "valkey3" should run fine within "15" seconds
27+
And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds
28+
"""
29+
["valkey1","valkey2","valkey3"]
30+
"""
31+
When I run async command on host "valkey1"
32+
"""
33+
valkey-cli -a functestpassword eval 'while true do end' 0
34+
"""
35+
Then valkey host "valkey1" should become available within "60" seconds
36+
2037
Scenario: Sentinel mode replica is restarted after OOM
2138
Given sentinel shard is up and running
2239
Then valkey host "valkey1" should be master

0 commit comments

Comments
 (0)