Skip to content

Commit

Permalink
✨ pod retention. (#737)
Browse files Browse the repository at this point in the history
Support pod retention settings.

The current policy is to delete pods as soon as they complete (succeeded or
failed).
Tackle users and support staff are accustomed to troubleshooting with
`oc debug` on the task pods.
To support this, the task manager can terminate containers in pods as
needed and defer to the reaper to delete the pods. This would be
controlled by new settings. By default, the pods of succeeded tasks would
be retained for 1 minute; the pods of failed tasks for 72 hours.
In all cases, failure to terminate a running container will fall back to
deleting the pod immediately. The retention is best effort.

Running containers are terminated by `kill -p 1`. This will only work for
Linux containers.

---------

Signed-off-by: Jeff Ortel <[email protected]>
  • Loading branch information
jortel authored and dymurray committed Oct 11, 2024
1 parent e50b49a commit 69bff4b
Show file tree
Hide file tree
Showing 5 changed files with 238 additions and 38 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ require (
github.com/mailru/easyjson v0.7.7 // indirect
github.com/mattn/go-isatty v0.0.19 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
github.com/moby/spdystream v0.2.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
Expand Down
5 changes: 5 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbt
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE=
github.com/andygrunwald/go-jira v1.16.0 h1:PU7C7Fkk5L96JvPc6vDVIrd99vdPnYudHu4ju2c2ikQ=
github.com/andygrunwald/go-jira v1.16.0/go.mod h1:UQH4IBVxIYWbgagc0LF/k9FRs9xjIiQ8hIcC6HfLwFU=
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/bytedance/sonic v1.5.0/go.mod h1:ED5hyg4y6t3/9Ku1R6dU/4KyJ48DZ4jPhfY1O2AihPM=
Expand All @@ -29,6 +30,7 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE=
github.com/elazarl/goproxy v0.0.0-20180725130230-947c36da3153 h1:yUdfgN0XgIJw7foRItutHYUIhlcKzcSf5vDpdhQAKTc=
github.com/emicklei/go-restful/v3 v3.9.0 h1:XwGDlfxEnQZzuopoqxwSEllNcCOM9DhhFyhFIIGKwxE=
github.com/emicklei/go-restful/v3 v3.9.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
Expand Down Expand Up @@ -121,6 +123,7 @@ github.com/google/gofuzz v1.1.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/
github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 h1:yAJXTCF9TqKcTiHJAE8dj7HMvPfh66eeA2JYW7eFpSE=
github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/imdario/mergo v0.3.12 h1:b6R2BslTbIEToALKP7LxUvijTsNI9TAe80pLWN2g/HU=
github.com/imdario/mergo v0.3.12/go.mod h1:jmQim1M+e3UYxmgPu/WyfjB3N3VflVyUjjjwH0dnCYA=
github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI=
Expand Down Expand Up @@ -162,6 +165,8 @@ github.com/mattn/go-sqlite3 v1.14.17 h1:mCRHCLDUBXgpKAqIKsaAaAsrAlbkeomtRFKXh2L6
github.com/mattn/go-sqlite3 v1.14.17/go.mod h1:2eHXhiwb8IkHr+BDWZGa96P6+rkvnG63S2DGjv9HUNg=
github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo=
github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4=
github.com/moby/spdystream v0.2.0 h1:cjW1zVyyoiM0T7b6UoySUFqzXMoqRckQtXwGPiBhOM8=
github.com/moby/spdystream v0.2.0/go.mod h1:f7i0iNDQJ059oMTcWxx8MA/zKFIuD/lY+0GqbN2Wy8c=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
Expand Down
49 changes: 31 additions & 18 deletions reaper/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,15 @@ type TaskReaper struct {
// - Pod is deleted after the defined period.
func (r *TaskReaper) Run() {
Log.V(1).Info("Reaping tasks.")
list := []model.Task{}
list := []task.Task{}
result := r.DB.Find(
&list,
"state IN ?",
[]string{
task.Created,
task.Succeeded,
task.Failed,
task.Canceled,
})
Log.Error(result.Error, "")
if result.Error != nil {
Expand Down Expand Up @@ -108,6 +109,10 @@ func (r *TaskReaper) Run() {
r.release(m)
}
}
d := time.Duration(Settings.Hub.Task.Pod.Retention.Succeeded) * Unit
if time.Since(mark) > d {
r.podDelete(m)
}
case task.Failed:
mark := m.CreateTime
if m.Terminated != nil {
Expand All @@ -124,23 +129,17 @@ func (r *TaskReaper) Run() {
r.release(m)
}
}
d := time.Duration(Settings.Hub.Task.Pod.Retention.Failed) * Unit
if time.Since(mark) > d {
r.podDelete(m)
}
}
}
}

// release resources.
func (r *TaskReaper) release(m *model.Task) {
// release bucket and file resources.
func (r *TaskReaper) release(m *task.Task) {
nChanged := 0
if m.Pod != "" {
rt := Task{Task: m}
err := rt.Delete(r.Client)
if err == nil {
m.Pod = ""
nChanged++
} else {
Log.Error(err, "")
}
}
if m.HasBucket() {
Log.Info("Task bucket released.", "id", m.ID)
m.SetBucket(nil)
Expand All @@ -151,8 +150,7 @@ func (r *TaskReaper) release(m *model.Task) {
nChanged++
}
if nChanged > 0 {
rt := task.Task{Task: m}
rt.Event(task.Released)
m.Event(task.Released)
err := r.DB.Save(m).Error
if err != nil {
Log.Error(err, "")
Expand All @@ -161,10 +159,25 @@ func (r *TaskReaper) release(m *model.Task) {
return
}

// podDelete deletes the pod referenced by the task and saves the task.
// It does nothing when the task has no pod recorded (m.Pod is empty).
// When the delete fails, the error is logged and the task is not saved.
// A failure to save the task is logged as well; no error is returned
// to the caller in either case.
func (r *TaskReaper) podDelete(m *task.Task) {
	if m.Pod == "" {
		// No pod recorded on the task.
		return
	}
	if err := m.Delete(r.Client); err != nil {
		Log.Error(err, "")
		return
	}
	// NOTE(review): presumably Delete() updates the task (e.g. clears
	// m.Pod), which is why the task is saved afterwards - confirm.
	if err := r.DB.Save(m).Error; err != nil {
		Log.Error(err, "")
	}
}

// delete task.
func (r *TaskReaper) delete(m *model.Task) {
rt := Task{Task: m}
err := rt.Delete(r.Client)
func (r *TaskReaper) delete(m *task.Task) {
err := m.Delete(r.Client)
if err != nil {
Log.Error(err, "")
}
Expand Down
22 changes: 22 additions & 0 deletions settings/hub.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ const (
EnvTaskReapCreated = "TASK_REAP_CREATED"
EnvTaskReapSucceeded = "TASK_REAP_SUCCEEDED"
EnvTaskReapFailed = "TASK_REAP_FAILED"
EnvTaskPodRetainSucceeded = "TASK_POD_RETAIN_SUCCEEDED"
EnvTaskPodRetainFailed = "TASK_POD_RETAIN_FAILED"
EnvTaskSA = "TASK_SA"
EnvTaskRetries = "TASK_RETRIES"
EnvTaskPreemptEnabled = "TASK_PREEMPT_ENABLED"
Expand Down Expand Up @@ -84,6 +86,12 @@ type Hub struct {
Postponed time.Duration
Rate int
}
Pod struct {
Retention struct {
Succeeded int
Failed int
}
}
}
// Frequency
Frequency struct {
Expand Down Expand Up @@ -169,6 +177,20 @@ func (r *Hub) Load() (err error) {
} else {
r.Task.Reaper.Failed = 43200 // 720 hours (30 days).
}
s, found = os.LookupEnv(EnvTaskPodRetainSucceeded)
if found {
n, _ := strconv.Atoi(s)
r.Task.Pod.Retention.Succeeded = n
} else {
r.Task.Pod.Retention.Succeeded = 1
}
s, found = os.LookupEnv(EnvTaskPodRetainFailed)
if found {
n, _ := strconv.Atoi(s)
r.Task.Pod.Retention.Failed = n
} else {
r.Task.Pod.Retention.Failed = 4320 // 72 hours.
}
r.Task.SA, found = os.LookupEnv(EnvTaskSA)
if !found {
r.Task.SA = "tackle-hub"
Expand Down
Loading

0 comments on commit 69bff4b

Please sign in to comment.