Skip to content

Commit

Permalink
handle worker failures gracefully (#1038)
Browse files Browse the repository at this point in the history
* handle failures gracefully

* fix super-subtle race condition

* address feedback: panic instead of fatal log and make vars into consts

* pass the frankenphp context to worker-ready function

* reset backoff and failures on normal restart

* update docs

* add test and fix race condition

* fail sometimes but do not be pathological about it

* Use title case

Co-authored-by: Kévin Dunglas <[email protected]>

* fix code style in php

* define lifecycle metrics

* ensure we update unregister the metrics and fix tests

* update caddy tests and fix typo

* update docs

* no need for this

---------

Co-authored-by: Kévin Dunglas <[email protected]>
  • Loading branch information
withinboredom and dunglas authored Oct 3, 2024
1 parent b8e5ad1 commit aa585f7
Show file tree
Hide file tree
Showing 10 changed files with 237 additions and 8 deletions.
30 changes: 30 additions & 0 deletions caddy/caddy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,18 @@ func TestWorkerMetrics(t *testing.T) {
# HELP frankenphp_testdata_index_php_worker_request_count
# TYPE frankenphp_testdata_index_php_worker_request_count counter
frankenphp_testdata_index_php_worker_request_count 10
# HELP frankenphp_testdata_index_php_ready_workers Running workers that have successfully called frankenphp_handle_request at least once
# TYPE frankenphp_testdata_index_php_ready_workers gauge
frankenphp_testdata_index_php_ready_workers 2
# HELP frankenphp_testdata_index_php_worker_crashes Number of PHP worker crashes for this worker
# TYPE frankenphp_testdata_index_php_worker_crashes counter
frankenphp_testdata_index_php_worker_crashes 0
# HELP frankenphp_testdata_index_php_worker_restarts Number of PHP worker restarts for this worker
# TYPE frankenphp_testdata_index_php_worker_restarts counter
frankenphp_testdata_index_php_worker_restarts 0
`

require.NoError(t,
Expand All @@ -456,6 +468,9 @@ func TestWorkerMetrics(t *testing.T) {
"frankenphp_testdata_index_php_busy_workers",
"frankenphp_testdata_index_php_total_workers",
"frankenphp_testdata_index_php_worker_request_count",
"frankenphp_testdata_index_php_worker_crashes",
"frankenphp_testdata_index_php_worker_restarts",
"frankenphp_testdata_index_php_ready_workers",
))
}

Expand Down Expand Up @@ -531,6 +546,18 @@ func TestAutoWorkerConfig(t *testing.T) {
# HELP frankenphp_testdata_index_php_worker_request_count
# TYPE frankenphp_testdata_index_php_worker_request_count counter
frankenphp_testdata_index_php_worker_request_count 10
# HELP frankenphp_testdata_index_php_ready_workers Running workers that have successfully called frankenphp_handle_request at least once
# TYPE frankenphp_testdata_index_php_ready_workers gauge
frankenphp_testdata_index_php_ready_workers ` + workers + `
# HELP frankenphp_testdata_index_php_worker_crashes Number of PHP worker crashes for this worker
# TYPE frankenphp_testdata_index_php_worker_crashes counter
frankenphp_testdata_index_php_worker_crashes 0
# HELP frankenphp_testdata_index_php_worker_restarts Number of PHP worker restarts for this worker
# TYPE frankenphp_testdata_index_php_worker_restarts counter
frankenphp_testdata_index_php_worker_restarts 0
`

require.NoError(t,
Expand All @@ -542,5 +569,8 @@ func TestAutoWorkerConfig(t *testing.T) {
"frankenphp_testdata_index_php_busy_workers",
"frankenphp_testdata_index_php_total_workers",
"frankenphp_testdata_index_php_worker_request_count",
"frankenphp_testdata_index_php_worker_crashes",
"frankenphp_testdata_index_php_worker_restarts",
"frankenphp_testdata_index_php_ready_workers",
))
}
3 changes: 3 additions & 0 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ When [Caddy metrics](https://caddyserver.com/docs/metrics) are enabled, FrankenP
* `frankenphp_[worker]_busy_workers`: The number of workers currently processing a request.
* `frankenphp_[worker]_worker_request_time`: The time spent processing requests by all workers.
* `frankenphp_[worker]_worker_request_count`: The number of requests processed by all workers.
* `frankenphp_[worker]_ready_workers`: The number of workers that have called `frankenphp_handle_request` at least once.
* `frankenphp_[worker]_worker_crashes`: The number of times a worker has unexpectedly terminated.
* `frankenphp_[worker]_worker_restarts`: The number of times a worker has been deliberately restarted.
* `frankenphp_total_threads`: The total number of PHP threads.
* `frankenphp_busy_threads`: The number of PHP threads currently processing a request (running workers always consume a thread).

Expand Down
8 changes: 8 additions & 0 deletions docs/worker.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,14 @@ A workaround to using this type of code in worker mode is to restart the worker

The previous worker snippet allows configuring a maximum number of requests to handle by setting an environment variable named `MAX_REQUESTS`.

### Worker Failures

If a worker script crashes with a non-zero exit code, FrankenPHP restarts it using an exponential backoff strategy.
If the worker script stays up for longer than twice the last backoff delay,
FrankenPHP does not penalize it and simply restarts it again.
However, if the worker script keeps failing with a non-zero exit code within a short period of time
(for example, because of a typo in the script), FrankenPHP will crash with the error: `too many consecutive failures`.

## Superglobals Behavior

[PHP superglobals](https://www.php.net/manual/en/language.variables.superglobals.php) (`$_SERVER`, `$_ENV`, `$_GET`...)
Expand Down
2 changes: 1 addition & 1 deletion frankenphp.c
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,7 @@ PHP_FUNCTION(frankenphp_handle_request) {
ctx->worker_ready = true;

/* Mark the worker as ready to handle requests */
go_frankenphp_worker_ready();
go_frankenphp_worker_ready(ctx->main_request);
}

#ifdef ZEND_MAX_EXECUTION_TIMERS
Expand Down
3 changes: 3 additions & 0 deletions frankenphp.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,9 @@ type FrankenPHPContext struct {
// Whether the request is already closed by us
closed sync.Once

// whether the context is ready to receive requests
ready bool

responseWriter http.ResponseWriter
exitStatus C.int

Expand Down
12 changes: 12 additions & 0 deletions frankenphp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -609,6 +609,18 @@ func testRequestHeaders(t *testing.T, opts *testOptions) {
}, opts)
}

// TestFailingWorker sends a request through a worker running
// failing-worker.php and checks that the response body contains "ok".
func TestFailingWorker(t *testing.T) {
	runTest(t, func(handler func(http.ResponseWriter, *http.Request), _ *httptest.Server, _ int) {
		req := httptest.NewRequest("GET", "http://example.com/failing-worker.php", nil)
		w := httptest.NewRecorder()
		handler(w, req)

		resp := w.Result()
		defer resp.Body.Close()

		// Report a read failure explicitly instead of silently asserting on a
		// possibly truncated body.
		body, err := io.ReadAll(resp.Body)
		assert.NoError(t, err)
		assert.Contains(t, string(body), "ok")
	}, &testOptions{workerScript: "failing-worker.php"})
}

// TestFileUpload_module runs the shared file-upload test with zero-valued test options (no worker script set).
func TestFileUpload_module(t *testing.T) { testFileUpload(t, &testOptions{}) }
func TestFileUpload_worker(t *testing.T) {
testFileUpload(t, &testOptions{workerScript: "file-upload.php"})
Expand Down
87 changes: 84 additions & 3 deletions metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,21 @@ import (
var metricsNameRegex = regexp.MustCompile(`\W+`)
var metricsNameFixRegex = regexp.MustCompile(`^_+|_+$`)

// StopReason tells Metrics.StopWorker why a worker stopped.
type StopReason int

// The constants are typed as StopReason (rather than left as untyped ints) so
// that the compiler, fmt and type switches all see them as stop reasons.
const (
	// StopReasonCrash is reported when a worker terminated unexpectedly.
	StopReasonCrash StopReason = iota
	// StopReasonRestart is reported when a worker was deliberately restarted.
	StopReasonRestart
	// StopReasonShutdown is reported when a worker stops as part of a shutdown.
	StopReasonShutdown
)

type Metrics interface {
// StartWorker collects started workers
StartWorker(name string)
// ReadyWorker collects ready workers
ReadyWorker(name string)
// StopWorker collects stopped workers
StopWorker(name string)
StopWorker(name string, reason StopReason)
// TotalWorkers collects expected workers
TotalWorkers(name string, num int)
// TotalThreads collects total threads
Expand All @@ -36,7 +46,10 @@ type nullMetrics struct{}
// StartWorker does nothing: nullMetrics discards every measurement.
func (n nullMetrics) StartWorker(name string) {
}

func (n nullMetrics) StopWorker(name string) {
// ReadyWorker does nothing: nullMetrics discards every measurement.
func (n nullMetrics) ReadyWorker(name string) {
}

// StopWorker does nothing: nullMetrics discards every measurement, whatever the stop reason.
func (n nullMetrics) StopWorker(name string, reason StopReason) {
}

func (n nullMetrics) TotalWorkers(name string, num int) {
Expand Down Expand Up @@ -66,6 +79,9 @@ type PrometheusMetrics struct {
busyThreads prometheus.Gauge
totalWorkers map[string]prometheus.Gauge
busyWorkers map[string]prometheus.Gauge
readyWorkers map[string]prometheus.Gauge
workerCrashes map[string]prometheus.Counter
workerRestarts map[string]prometheus.Counter
workerRequestTime map[string]prometheus.Counter
workerRequestCount map[string]prometheus.Counter
mu sync.Mutex
Expand All @@ -81,14 +97,31 @@ func (m *PrometheusMetrics) StartWorker(name string) {
m.totalWorkers[name].Inc()
}

func (m *PrometheusMetrics) StopWorker(name string) {
// ReadyWorker increments the ready-workers gauge of the worker called name.
//
// Names that have no registered gauge are ignored (tests do not register
// workers before starting them).
func (m *PrometheusMetrics) ReadyWorker(name string) {
	// Guard on the gauge that is actually updated: checking totalWorkers and
	// then indexing readyWorkers would call Inc on a nil Gauge if the two maps
	// ever disagreed. This also saves a second map lookup.
	ready, ok := m.readyWorkers[name]
	if !ok {
		return
	}

	ready.Inc()
}

// StopWorker records that a worker called name has stopped for the given reason.
//
// The busy-threads gauge is always decremented. When name has a registered
// total-workers gauge, the total-workers and ready-workers gauges are
// decremented too, and the counter matching reason is updated.
func (m *PrometheusMetrics) StopWorker(name string, reason StopReason) {
	m.busyThreads.Dec()

	// tests do not register workers before starting them
	total, registered := m.totalWorkers[name]
	if !registered {
		return
	}

	total.Dec()
	// NOTE(review): this runs even if ReadyWorker was never called for the
	// stopped worker — confirm the gauge cannot go negative in that case.
	m.readyWorkers[name].Dec()

	switch reason {
	case StopReasonCrash:
		m.workerCrashes[name].Inc()
	case StopReasonRestart:
		m.workerRestarts[name].Inc()
	case StopReasonShutdown:
		// NOTE(review): second decrement of the total-workers gauge for the
		// same stop — confirm this is intended.
		total.Dec()
	}
}

func (m *PrometheusMetrics) getIdentity(name string) (string, error) {
Expand Down Expand Up @@ -122,6 +155,36 @@ func (m *PrometheusMetrics) TotalWorkers(name string, num int) {
m.registry.MustRegister(m.totalWorkers[identity])
}

if _, ok := m.workerCrashes[identity]; !ok {
m.workerCrashes[identity] = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: "frankenphp",
Subsystem: subsystem,
Name: "worker_crashes",
Help: "Number of PHP worker crashes for this worker",
})
m.registry.MustRegister(m.workerCrashes[identity])
}

if _, ok := m.workerRestarts[identity]; !ok {
m.workerRestarts[identity] = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: "frankenphp",
Subsystem: subsystem,
Name: "worker_restarts",
Help: "Number of PHP worker restarts for this worker",
})
m.registry.MustRegister(m.workerRestarts[identity])
}

if _, ok := m.readyWorkers[identity]; !ok {
m.readyWorkers[identity] = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: "frankenphp",
Subsystem: subsystem,
Name: "ready_workers",
Help: "Running workers that have successfully called frankenphp_handle_request at least once",
})
m.registry.MustRegister(m.readyWorkers[identity])
}

if _, ok := m.busyWorkers[identity]; !ok {
m.busyWorkers[identity] = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: "frankenphp",
Expand Down Expand Up @@ -200,6 +263,18 @@ func (m *PrometheusMetrics) Shutdown() {
m.registry.Unregister(c)
}

for _, c := range m.workerCrashes {
m.registry.Unregister(c)
}

for _, c := range m.workerRestarts {
m.registry.Unregister(c)
}

for _, g := range m.readyWorkers {
m.registry.Unregister(g)
}

m.totalThreads = prometheus.NewCounter(prometheus.CounterOpts{
Name: "frankenphp_total_threads",
Help: "Total number of PHP threads",
Expand All @@ -212,6 +287,9 @@ func (m *PrometheusMetrics) Shutdown() {
m.busyWorkers = map[string]prometheus.Gauge{}
m.workerRequestTime = map[string]prometheus.Counter{}
m.workerRequestCount = map[string]prometheus.Counter{}
m.workerRestarts = map[string]prometheus.Counter{}
m.workerCrashes = map[string]prometheus.Counter{}
m.readyWorkers = map[string]prometheus.Gauge{}

m.registry.MustRegister(m.totalThreads)
m.registry.MustRegister(m.busyThreads)
Expand Down Expand Up @@ -243,6 +321,9 @@ func NewPrometheusMetrics(registry prometheus.Registerer) *PrometheusMetrics {
busyWorkers: map[string]prometheus.Gauge{},
workerRequestTime: map[string]prometheus.Counter{},
workerRequestCount: map[string]prometheus.Counter{},
workerRestarts: map[string]prometheus.Counter{},
workerCrashes: map[string]prometheus.Counter{},
readyWorkers: map[string]prometheus.Gauge{},
}

m.registry.MustRegister(m.totalThreads)
Expand Down
3 changes: 3 additions & 0 deletions metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ func createPrometheusMetrics() *PrometheusMetrics {
busyWorkers: make(map[string]prometheus.Gauge),
workerRequestTime: make(map[string]prometheus.Counter),
workerRequestCount: make(map[string]prometheus.Counter),
workerCrashes: make(map[string]prometheus.Counter),
workerRestarts: make(map[string]prometheus.Counter),
readyWorkers: make(map[string]prometheus.Gauge),
mu: sync.Mutex{},
}
}
Expand Down
18 changes: 18 additions & 0 deletions testdata/failing-worker.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<?php

// Worker script that answers every request with "ok" and randomly exits with status 1.

// NOTE(review): random_int(1, 100) never returns a value below 1, so $fail is
// always false here and the start-up exit(1) below is unreachable — confirm this is intended.
$fail = random_int(1, 100) < 1;
$wait = random_int(1, 5);

// Pause for 1 to 5 seconds before entering the request loop.
sleep($wait);
if ($fail) {
exit(1);
}

// Each time frankenphp_handle_request() returns a truthy value, draw a number
// from 1 to 100 and exit with status 1 when it is below 10.
while (frankenphp_handle_request(function () {
echo "ok";
})) {
$fail = random_int(1, 100) < 10;
if ($fail) {
exit(1);
}
}
Loading

0 comments on commit aa585f7

Please sign in to comment.