Skip to content

Commit

Permalink
Add timeouts for cluster metadata initialization and for init contain…
Browse files Browse the repository at this point in the history
…ers (#218)

- Add timeouts for waiting for zk and bk to become available.
- If the waiting gets stuck for some reason, the Pulsar deployment never
  becomes starts the broker services.
  - timeouts will help failures recover eventually
  • Loading branch information
lhotari authored Jun 20, 2024
1 parent 023f902 commit 70f36ff
Show file tree
Hide file tree
Showing 9 changed files with 67 additions and 27 deletions.
2 changes: 1 addition & 1 deletion charts/pulsar/templates/_autorecovery.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ Define autorecovery init container : verify cluster id
{{- define "pulsar.autorecovery.init.verify_cluster_id" -}}
bin/apply-config-from-env.py conf/bookkeeper.conf;
{{- include "pulsar.autorecovery.zookeeper.tls.settings" . -}}
until bin/bookkeeper shell whatisinstanceid; do
until timeout 15 bin/bookkeeper shell whatisinstanceid; do
sleep 3;
done;
{{- end }}
4 changes: 2 additions & 2 deletions charts/pulsar/templates/_bookkeeper.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ Define bookie init container : verify cluster id
{{- if not (and .Values.volumes.persistence .Values.bookkeeper.volumes.persistence) }}
bin/apply-config-from-env.py conf/bookkeeper.conf;
{{- include "pulsar.bookkeeper.zookeeper.tls.settings" . -}}
until bin/bookkeeper shell whatisinstanceid; do
until timeout 15 bin/bookkeeper shell whatisinstanceid; do
sleep 3;
done;
bin/bookkeeper shell bookieformat -nonInteractive -force -deleteCookie || true
Expand All @@ -133,7 +133,7 @@ bin/bookkeeper shell bookieformat -nonInteractive -force -deleteCookie || true
set -e;
bin/apply-config-from-env.py conf/bookkeeper.conf;
{{- include "pulsar.bookkeeper.zookeeper.tls.settings" . -}}
until bin/bookkeeper shell whatisinstanceid; do
until timeout 15 bin/bookkeeper shell whatisinstanceid; do
sleep 3;
done;
{{- end }}
Expand Down
4 changes: 3 additions & 1 deletion charts/pulsar/templates/autorecovery-statefulset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -106,13 +106,14 @@ spec:
terminationGracePeriodSeconds: {{ .Values.autorecovery.gracePeriod }}
serviceAccountName: "{{ template "pulsar.fullname" . }}-{{ .Values.autorecovery.component }}"
initContainers:
{{- if and .Values.autorecovery.waitBookkeeperTimeout (not (eq (.Values.autorecovery.waitBookkeeperTimeout | toString) "0")) }}
# This initContainer will wait for bookkeeper initnewcluster to complete
# before deploying the bookies
- name: pulsar-bookkeeper-verify-clusterid
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.autorecovery "root" .) }}"
imagePullPolicy: "{{ template "pulsar.imagePullPolicy" (dict "image" .Values.images.autorecovery "root" .) }}"
resources: {{ toYaml .Values.initContainer.resources | nindent 10 }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.autorecovery.waitBookkeeperTimeout }}", "sh", "-c"]
args:
- >
{{- include "pulsar.autorecovery.init.verify_cluster_id" . | nindent 10 }}
Expand All @@ -121,6 +122,7 @@ spec:
name: "{{ template "pulsar.fullname" . }}-{{ .Values.autorecovery.component }}"
volumeMounts:
{{- include "pulsar.autorecovery.certs.volumeMounts" . | nindent 8 }}
{{- end }}
containers:
- name: "{{ template "pulsar.fullname" . }}-{{ .Values.autorecovery.component }}"
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.autorecovery "root" .) }}"
Expand Down
12 changes: 7 additions & 5 deletions charts/pulsar/templates/bookkeeper-cluster-initialize.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,23 +45,25 @@ spec:
{{ toYaml .Values.pulsar_metadata.tolerations | indent 8 }}
{{- end }}
initContainers:
{{- if and .Values.bookkeeper.metadata.waitZookeeperTimeout (not (eq (.Values.bookkeeper.metadata.waitZookeeperTimeout | toString) "0")) }}
- name: wait-zookeeper-ready
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.bookie "root" .) }}"
imagePullPolicy: "{{ template "pulsar.imagePullPolicy" (dict "image" .Values.images.bookie "root" .) }}"
resources: {{ toYaml .Values.initContainer.resources | nindent 10 }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.bookkeeper.metadata.waitZookeeperTimeout }}", "sh", "-c"]
args:
- >-
{{- if $zk:=.Values.pulsar_metadata.userProvidedZookeepers }}
export PULSAR_MEM="-Xmx128M";
until bin/pulsar zookeeper-shell -server {{ $zk }} ls {{ or .Values.metadataPrefix "/" }}; do
until timeout 15 bin/pulsar zookeeper-shell -server {{ $zk }} ls {{ or .Values.metadataPrefix "/" }}; do
echo "user provided zookeepers {{ $zk }} are unreachable... check in 3 seconds ..." && sleep 3;
done;
{{ else }}
until nslookup {{ template "pulsar.fullname" . }}-{{ .Values.zookeeper.component }}-{{ add (.Values.zookeeper.replicaCount | int) -1 }}.{{ template "pulsar.fullname" . }}-{{ .Values.zookeeper.component }}.{{ template "pulsar.namespace" . }}; do
sleep 3;
done;
{{- end}}
{{- end}}
containers:
- name: "{{ template "pulsar.fullname" . }}-{{ .Values.bookkeeper.component }}-init"
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.bookie "root" .) }}"
Expand All @@ -70,17 +72,17 @@ spec:
resources:
{{ toYaml .Values.bookkeeper.metadata.resources | indent 10 }}
{{- end }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.bookkeeper.metadata.initTimeout | default 60 }}", "sh", "-c"]
args:
- >
bin/apply-config-from-env.py conf/bookkeeper.conf;
{{- include "pulsar.toolset.zookeeper.tls.settings" . | nindent 12 }}
export BOOKIE_MEM="-Xmx128M";
if bin/bookkeeper shell whatisinstanceid; then
if timeout 15 bin/bookkeeper shell whatisinstanceid; then
echo "bookkeeper cluster already initialized";
else
{{- if not (eq .Values.metadataPrefix "") }}
bin/bookkeeper org.apache.zookeeper.ZooKeeperMain -server {{ template "pulsar.fullname" . }}-{{ .Values.zookeeper.component }} create {{ .Values.metadataPrefix }} && echo 'created for pulsar cluster "{{ template "pulsar.cluster.name" . }}"' &&
bin/pulsar zookeeper-shell -server {{ template "pulsar.fullname" . }}-{{ .Values.zookeeper.component }} create {{ .Values.metadataPrefix }} && echo 'created for pulsar cluster "{{ template "pulsar.cluster.name" . }}"' &&
{{- end }}
bin/bookkeeper shell initnewcluster;
fi
Expand Down
4 changes: 3 additions & 1 deletion charts/pulsar/templates/bookkeeper-statefulset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -106,14 +106,15 @@ spec:
securityContext:
{{ toYaml .Values.bookkeeper.securityContext | indent 8 }}
{{- end }}
{{- if and .Values.bookkeeper.waitMetadataTimeout (not (eq (.Values.bookkeeper.waitMetadataTimeout | toString) "0")) }}
initContainers:
# This initContainer will wait for bookkeeper initnewcluster to complete
# before deploying the bookies
- name: pulsar-bookkeeper-verify-clusterid
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.bookie "root" .) }}"
imagePullPolicy: "{{ template "pulsar.imagePullPolicy" (dict "image" .Values.images.bookie "root" .) }}"
resources: {{ toYaml .Values.initContainer.resources | nindent 10 }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.bookkeeper.waitMetadataTimeout }}", "sh", "-c"]
args:
# only reformat bookie if bookkeeper is running without persistence
- >
Expand All @@ -127,6 +128,7 @@ spec:
{{- end}}
volumeMounts:
{{- include "pulsar.bookkeeper.certs.volumeMounts" . | nindent 8 }}
{{- end}}
containers:
- name: "{{ template "pulsar.fullname" . }}-{{ .Values.bookkeeper.component }}"
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.bookie "root" .) }}"
Expand Down
18 changes: 11 additions & 7 deletions charts/pulsar/templates/broker-statefulset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -121,22 +121,23 @@ spec:
{{- end }}
terminationGracePeriodSeconds: {{ .Values.broker.gracePeriod }}
initContainers:
{{- if and .Values.broker.waitZookeeperTimeout (not (eq (.Values.broker.waitZookeeperTimeout | toString) "0")) }}
# This init container will wait for zookeeper to be ready before
# deploying the bookies
- name: wait-zookeeper-ready
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.broker "root" .) }}"
imagePullPolicy: "{{ template "pulsar.imagePullPolicy" (dict "image" .Values.images.broker "root" .) }}"
resources: {{ toYaml .Values.initContainer.resources | nindent 10 }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.broker.waitZookeeperTimeout }}", "sh", "-c"]
args:
- >-
{{- include "pulsar.broker.zookeeper.tls.settings" . | nindent 12 }}
export BOOKIE_MEM="-Xmx128M";
{{- if .Values.pulsar_metadata.configurationStore }}
until bin/bookkeeper org.apache.zookeeper.ZooKeeperMain -server {{ template "pulsar.configurationStore.connect" . }} get {{ .Values.configurationStoreMetadataPrefix }}/admin/clusters/{{ template "pulsar.cluster.name" . }}; do
until timeout 15 bin/pulsar zookeeper-shell -server {{ template "pulsar.configurationStore.connect" . }} get {{ .Values.configurationStoreMetadataPrefix }}/admin/clusters/{{ template "pulsar.cluster.name" . }}; do
{{- end }}
{{- if not .Values.pulsar_metadata.configurationStore }}
until bin/bookkeeper org.apache.zookeeper.ZooKeeperMain -server {{ template "pulsar.zookeeper.connect" . }} get {{ .Values.metadataPrefix }}/admin/clusters/{{ template "pulsar.cluster.name" . }}; do
until timeout 15 bin/pulsar zookeeper-shell -server {{ template "pulsar.zookeeper.connect" . }} get {{ .Values.metadataPrefix }}/admin/clusters/{{ template "pulsar.cluster.name" . }}; do
{{- end }}
echo "pulsar cluster {{ template "pulsar.cluster.name" . }} isn't initialized yet ... check in 3 seconds ..." && sleep 3;
done;
Expand All @@ -146,19 +147,21 @@ spec:
{{- end }}
volumeMounts:
{{- include "pulsar.broker.certs.volumeMounts" . | nindent 8 }}
{{- end }}
{{- if and .Values.broker.waitBookkeeperTimeout (not (eq (.Values.broker.waitBookkeeperTimeout | toString) "0")) }}
# This init container will wait for bookkeeper to be ready before
# deploying the broker
- name: wait-bookkeeper-ready
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.broker "root" .) }}"
imagePullPolicy: "{{ template "pulsar.imagePullPolicy" (dict "image" .Values.images.broker "root" .) }}"
resources: {{ toYaml .Values.initContainer.resources | nindent 10 }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.broker.waitBookkeeperTimeout }}", "sh", "-c"]
args:
- >
{{- include "pulsar.broker.zookeeper.tls.settings" . | nindent 12 }}
bin/apply-config-from-env.py conf/bookkeeper.conf;
export BOOKIE_MEM="-Xmx128M";
until bin/bookkeeper shell whatisinstanceid; do
until timeout 15 bin/bookkeeper shell whatisinstanceid; do
echo "bookkeeper cluster is not initialized yet. backoff for 3 seconds ...";
sleep 3;
done;
Expand All @@ -179,6 +182,7 @@ spec:
{{- end }}
volumeMounts:
{{- include "pulsar.broker.certs.volumeMounts" . | nindent 10 }}
{{- end }}
containers:
- name: "{{ template "pulsar.fullname" . }}-{{ .Values.broker.component }}"
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.broker "root" .) }}"
Expand Down Expand Up @@ -227,11 +231,11 @@ spec:
bin/gen-yml-from-env.py conf/functions_worker.yml;
echo "OK" > "${statusFilePath:-status}";
{{- include "pulsar.broker.zookeeper.tls.settings" . | nindent 10 }}
bin/pulsar zookeeper-shell -server {{ template "pulsar.zookeeper.connect" . }} get {{ template "pulsar.broker.znode" . }};
timeout 15 bin/pulsar zookeeper-shell -server {{ template "pulsar.zookeeper.connect" . }} get {{ template "pulsar.broker.znode" . }};
while [ $? -eq 0 ]; do
echo "broker {{ template "pulsar.broker.hostname" . }} znode still exists ... check in 10 seconds ...";
sleep 10;
bin/pulsar zookeeper-shell -server {{ template "pulsar.zookeeper.connect" . }} get {{ template "pulsar.broker.znode" . }};
timeout 15 bin/pulsar zookeeper-shell -server {{ template "pulsar.zookeeper.connect" . }} get {{ template "pulsar.broker.znode" . }};
done;
cat conf/pulsar_env.sh;
OPTS="${OPTS} -Dlog4j2.formatMsgNoLookups=true" exec bin/pulsar broker;
Expand Down
12 changes: 8 additions & 4 deletions charts/pulsar/templates/proxy-statefulset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -105,32 +105,35 @@ spec:
terminationGracePeriodSeconds: {{ .Values.proxy.gracePeriod }}
serviceAccountName: "{{ template "pulsar.fullname" . }}-{{ .Values.proxy.component }}"
initContainers:
{{- if and .Values.proxy.waitZookeeperTimeout (not (eq (.Values.proxy.waitZookeeperTimeout | toString) "0")) }}
# This init container will wait for zookeeper to be ready before
# deploying the bookies
- name: wait-zookeeper-ready
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.proxy "root" .) }}"
imagePullPolicy: "{{ template "pulsar.imagePullPolicy" (dict "image" .Values.images.proxy "root" .) }}"
resources: {{ toYaml .Values.initContainer.resources | nindent 10 }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.proxy.waitZookeeperTimeout }}", "sh", "-c"]
args:
- >-
export PULSAR_MEM="-Xmx128M";
{{- if $zk:=.Values.pulsar_metadata.userProvidedZookeepers }}
until bin/pulsar zookeeper-shell -server {{ $zk }} ls {{ or .Values.metadataPrefix "/" }}; do
until timeout 15 bin/pulsar zookeeper-shell -server {{ $zk }} ls {{ or .Values.metadataPrefix "/" }}; do
echo "user provided zookeepers {{ $zk }} are unreachable... check in 3 seconds ..." && sleep 3;
done;
{{ else }}
until bin/pulsar zookeeper-shell -server {{ template "pulsar.configurationStore.service" . }} get {{ .Values.metadataPrefix }}/admin/clusters/{{ template "pulsar.cluster.name" . }}; do
until timeout 15 bin/pulsar zookeeper-shell -server {{ template "pulsar.configurationStore.service" . }} get {{ .Values.metadataPrefix }}/admin/clusters/{{ template "pulsar.cluster.name" . }}; do
sleep 3;
done;
{{- end}}
{{- end}}
{{- if and .Values.proxy.waitBrokerTimeout (not (eq (.Values.proxy.waitBrokerTimeout | toString) "0")) }}
# This init container will wait for at least one broker to be ready before
# deploying the proxy
- name: wait-broker-ready
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.proxy "root" .) }}"
imagePullPolicy: "{{ template "pulsar.imagePullPolicy" (dict "image" .Values.images.proxy "root" .) }}"
resources: {{ toYaml .Values.initContainer.resources | nindent 10 }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.proxy.waitBrokerTimeout }}", "sh", "-c"]
args:
- >-
set -e;
Expand All @@ -140,6 +143,7 @@ spec:
sleep 10;
brokerServiceNumber="$(nslookup -timeout=10 {{ template "pulsar.fullname" . }}-{{ .Values.broker.component }} | grep Name | wc -l)";
done;
{{- end}}
containers:
- name: "{{ template "pulsar.fullname" . }}-{{ .Values.proxy.component }}"
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.proxy "root" .) }}"
Expand Down
16 changes: 10 additions & 6 deletions charts/pulsar/templates/pulsar-cluster-initialize.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,13 @@ spec:
{{ toYaml .Values.pulsar_metadata.nodeSelector | indent 8 }}
{{- end }}
initContainers:
{{- if and .Values.pulsar_metadata.waitZookeeperTimeout (not (eq (.Values.pulsar_metadata.waitZookeeperTimeout | toString) "0")) }}
{{- if .Values.pulsar_metadata.configurationStore }}
- name: wait-cs-ready
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.pulsar_metadata.image "root" .) }}"
imagePullPolicy: "{{ template "pulsar.imagePullPolicy" (dict "image" .Values.pulsar_metadata.image "root" .) }}"
resources: {{ toYaml .Values.initContainer.resources | nindent 10 }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.pulsar_metadata.waitZookeeperTimeout }}", "sh", "-c"]
args:
- >-
until nslookup {{ .Values.pulsar_metadata.configurationStore}}; do
Expand All @@ -57,41 +58,44 @@ spec:
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.pulsar_metadata.image "root" .) }}"
imagePullPolicy: "{{ template "pulsar.imagePullPolicy" (dict "image" .Values.pulsar_metadata.image "root" .) }}"
resources: {{ toYaml .Values.initContainer.resources | nindent 10 }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.pulsar_metadata.waitZookeeperTimeout }}", "sh", "-c"]
args:
- >-
{{- if $zk:=.Values.pulsar_metadata.userProvidedZookeepers }}
export PULSAR_MEM="-Xmx128M";
until bin/pulsar zookeeper-shell -server {{ $zk }} ls {{ or .Values.metadataPrefix "/" }}; do
until timeout 15 bin/pulsar zookeeper-shell -server {{ $zk }} ls {{ or .Values.metadataPrefix "/" }}; do
echo "user provided zookeepers {{ $zk }} are unreachable... check in 3 seconds ..." && sleep 3;
done;
{{ else }}
until nslookup {{ template "pulsar.fullname" . }}-{{ .Values.zookeeper.component }}-{{ add (.Values.zookeeper.replicaCount | int) -1 }}.{{ template "pulsar.fullname" . }}-{{ .Values.zookeeper.component }}.{{ template "pulsar.namespace" . }}; do
sleep 3;
done;
{{- end}}
{{- end }}
{{- if and .Values.pulsar_metadata.waitBookkeeperTimeout (not (eq (.Values.pulsar_metadata.waitBookkeeperTimeout | toString) "0")) }}
# This initContainer will wait for bookkeeper initnewcluster to complete
# before initializing pulsar metadata
- name: pulsar-bookkeeper-verify-clusterid
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.pulsar_metadata.image "root" .) }}"
imagePullPolicy: "{{ template "pulsar.imagePullPolicy" (dict "image" .Values.pulsar_metadata.image "root" .) }}"
resources: {{ toYaml .Values.initContainer.resources | nindent 10 }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.pulsar_metadata.waitBookkeeperTimeout }}", "sh", "-c"]
args:
- >
bin/apply-config-from-env.py conf/bookkeeper.conf;
echo Default BOOKIE_MEM settings are set very high, which can cause the init container to fail.;
echo Setting the memory to a lower value to avoid OOM as operations below are not memory intensive.;
export BOOKIE_MEM="-Xmx128M";
{{- include "pulsar.toolset.zookeeper.tls.settings" . | nindent 10 }}
until bin/bookkeeper shell whatisinstanceid; do
until timeout 15 bin/bookkeeper shell whatisinstanceid; do
sleep 3;
done;
envFrom:
- configMapRef:
name: "{{ template "pulsar.fullname" . }}-{{ .Values.bookkeeper.component }}"
volumeMounts:
{{- include "pulsar.toolset.certs.volumeMounts" . | nindent 8 }}
{{- end }}
containers:
- name: "{{ template "pulsar.fullname" . }}-{{ .Values.pulsar_metadata.component }}"
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.pulsar_metadata.image "root" .) }}"
Expand All @@ -100,7 +104,7 @@ spec:
resources:
{{ toYaml .Values.pulsar_metadata.resources | indent 10 }}
{{- end }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.pulsar_metadata.initTimeout | default 60 }}", "sh", "-c"]
args:
- |
{{- include "pulsar.toolset.zookeeper.tls.settings" . | nindent 12 }}
Expand Down
Loading

0 comments on commit 70f36ff

Please sign in to comment.