From ac3388b3222725c1b971a453fc0f56e286b1ea2a Mon Sep 17 00:00:00 2001 From: Daniel Lipovetsky <3445370+dlipovetsky@users.noreply.github.com> Date: Wed, 18 Oct 2023 10:03:50 -0700 Subject: [PATCH] feat: Update cloud-init customization (#11) * feat: Update cloud-init customization Changes relative to upstream: * Use shell script to clean cloud-init cache and reboot. * Fix error handling of bootstrap script. Do not interpret stderr output as an indicator of failure. Do not rely on trap and errexit, because it does not work for command lists. * Include last lines of output for error context. * Ensure we have an IPv4 address for localhost. * Remove unnecessary cloud-init configuration to preserve SSH host keys. Changes relative to our fork: * Do not remove cloud-init logs and seed on reboot * Do not truncate cloud-init-output.log on reboot * Do not report status of HTTP proxy configuration * Remove redundant commands (already executed as a result of being defined in `preKubeadmCommands`) * Do not disable VMware customization * Do not disable network configuration --- controllers/cluster_scripts/cloud_init.tmpl | 129 +++++++++++--------- 1 file changed, 68 insertions(+), 61 deletions(-) diff --git a/controllers/cluster_scripts/cloud_init.tmpl b/controllers/cluster_scripts/cloud_init.tmpl index ea5f93e19..d7823a0f9 100644 --- a/controllers/cluster_scripts/cloud_init.tmpl +++ b/controllers/cluster_scripts/cloud_init.tmpl @@ -3,80 +3,87 @@ users: - name: root lock_passwd: false write_files: -- path: /root/ {{- if .ControlPlane -}} control_plane {{- else -}} node {{- end -}} .sh +# On first boot, cloud-init writes all files defined in userdata. At the same time, +# VMware Guest Customization configures networking, and reboots the machine when it is done. +# Any files in /run are not preserved. We need cloud-init to fetch userdata and write the +# files again. We clear the cloud-init cache, and reboot. 
Cloud-init thinks it is the +# first boot, and fetches the userdata, and writes the files. +- path: /root/replace-userdata-files.sh owner: root content: | #!/usr/bin/env bash - catch() { - vmtoolsd --cmd "info-set guestinfo.post_customization_script_execution_status $?" - ERROR_MESSAGE="$(date) $(caller): $BASH_COMMAND" - echo "$ERROR_MESSAGE" &>> /var/log/capvcd/customization/error.log - if [[ -s /root/kubeadm.err ]] - then - KUBEADM_FAILURE=$(cat /root/kubeadm.err) - ERROR_MESSAGE="$ERROR_MESSAGE $KUBEADM_FAILURE" - fi - vmtoolsd --cmd "info-set guestinfo.post_customization_script_execution_failure_reason $ERROR_MESSAGE" + function _log() { + echo "$(date -u +"%Y-%m-%d %H:%M:%S") $@" >> /var/log/capvcd/replace-userdata-files.log } - mkdir -p /var/log/capvcd/customization - trap 'catch $? $LINENO' ERR EXIT - set -eEx - echo "$(date) Post Customization script execution in progress" &>> /var/log/capvcd/customization/status.log {{- if .ControlPlane }} + mkdir -p /var/log/capvcd - VCLOUD_BASIC_AUTH_PATH=/root/vcloud-basic-auth.yaml - VCLOUD_CONFIGMAP_PATH=/root/vcloud-configmap.yaml - VCLOUD_CCM_PATH=/root/cloud-director-ccm.yaml - VCLOUD_CSI_CONFIGMAP_PATH=/root/vcloud-csi-configmap.yaml - CSI_DRIVER_PATH=/root/csi-driver.yaml - CSI_CONTROLLER_PATH=/root/csi-controller.yaml - CSI_NODE_PATH=/root/csi-node.yaml {{- end }} + _log "Checking for kubeadm configuration file" + if [ -f /run/kubeadm/kubeadm.yaml ] || [ -f /run/kubeadm/kubeadm-join-config.yaml ]; then + _log "kubeadm configuration file found, exiting" + exit 0 + fi + _log "kubeadm configuration file not found, cleaning cloud-init cache and rebooting" + cloud-init clean + reboot +- path: /root/bootstrap.sh + owner: root + content: | + #!/usr/bin/env bash - vmtoolsd --cmd "info-set guestinfo.postcustomization.networkconfiguration.status in_progress" - hostname "{{ .MachineName }}" - echo "::1 ipv6-localhost ipv6-loopback" >/etc/hosts - echo "127.0.0.1 localhost" >>/etc/hosts - echo "{{ .MachineName }}" 
>/etc/hostname - echo "127.0.0.1" `hostname` >>/etc/hosts - vmtoolsd --cmd "info-set guestinfo.postcustomization.networkconfiguration.status successful" + mkdir -p /var/log/capvcd + ( + # Prefix timestamp to commands in trace output. + PS4='$(date -u +"%Y-%m-%d %H:%M:%S")\011' + set -o xtrace - vmtoolsd --cmd "info-set guestinfo.metering.status in_progress" - vmtoolsd --cmd "info-set guestinfo.metering.status successful" + # Exit on the first error. Does not apply to command lists, or pipelines. + set -o errexit - vmtoolsd --cmd "info-set guestinfo.postcustomization.proxy.setting.status in_progress" - vmtoolsd --cmd "info-set guestinfo.postcustomization.proxy.setting.status successful" + # Our images do not require any network customization, + # but CAPVCD requires a successful status to finish bootstrapping. + vmtoolsd --cmd "info-set guestinfo.postcustomization.networkconfiguration.status successful" - vmtoolsd --cmd "info-set {{ if .ControlPlane -}} guestinfo.postcustomization.kubeinit.status {{- else -}} guestinfo.postcustomization.kubeadm.node.join.status {{- end }} in_progress" - {{ .BootstrapRunCmd }} - if [[ ! -f /run/cluster-api/bootstrap-success.complete ]] - then - echo "file /run/cluster-api/bootstrap-success.complete not found" &>> /var/log/capvcd/customization/error.log - exit 1 - fi - vmtoolsd --cmd "info-set {{ if .ControlPlane -}} guestinfo.postcustomization.kubeinit.status {{- else -}} guestinfo.postcustomization.kubeadm.node.join.status {{- end }} successful" + # Our images do not ship the VCD metering service, + # but CAPVCD requires a successful status to finish bootstrapping. + vmtoolsd --cmd "info-set guestinfo.metering.status successful" + + vmtoolsd --cmd "info-set {{ if .ControlPlane -}} guestinfo.postcustomization.kubeinit.status {{- else -}} guestinfo.postcustomization.kubeadm.node.join.status {{- end }} in_progress" + + # Run the preKubeadmCommands, and then kubeadm itself. 
+ {{ .BootstrapRunCmd }} + + # Kubeadm is the first command in a bash "list of commands," and its failure + # does not cause this subshell to exit. Therefore, we check the "sentinel" also created + # in the "list of commands," and exit if it is missing. + if [[ ! -f /run/cluster-api/bootstrap-success.complete ]]; then + echo "file /run/cluster-api/bootstrap-success.complete not found" + exit 1 + fi + + vmtoolsd --cmd "info-set {{ if .ControlPlane -}} guestinfo.postcustomization.kubeinit.status {{- else -}} guestinfo.postcustomization.kubeadm.node.join.status {{- end }} successful" + + exit 0 + ) &>> /var/log/capvcd/bootstrap.log + bootstrap_exit_code=$? + + # Write the exit code to the VM metadata. + vmtoolsd --cmd "info-set guestinfo.post_customization_script_execution_status $bootstrap_exit_code" + + # Use the last lines of the bootstrap log to give context about any failure. + TAIL_LOG="$(tail --lines=10 /var/log/capvcd/bootstrap.log)" + vmtoolsd --cmd "info-set guestinfo.post_customization_script_execution_failure_reason $TAIL_LOG" - echo "$(date) post customization script execution completed" &>> /var/log/capvcd/customization/status.log - exit 0 + # Write cloud-init output for additional context. + vmtoolsd --cmd "info-set guestinfo.post_customization_cloud_init_output $( /var/log/cloud-init-output.log' -{{ if .ControlPlane }} -- '[ ! -f /run/kubeadm/konvoy-set-kube-proxy-configuration.sh] && sudo reboot' -- '[ ! -f /run/konvoy/containerd-apply-patches.sh] && sudo reboot' -- '[ ! -f /run/konvoy/restart-containerd-and-wait.sh] && sudo reboot' -- '[ ! -f /root/control_plane.sh ] && sudo reboot' -- '[ ! -f /run/kubeadm/kubeadm.yaml ] && sudo reboot' -- bash /root/control_plane.sh -{{ else }} -- '[ ! -f /root/node.sh ] && sudo reboot' -- '[ ! 
-f /run/kubeadm/kubeadm-join-config.yaml ] && sudo reboot' -- bash /root/node.sh -{{ end }} +- bash /root/replace-userdata-files.sh +- bash /root/bootstrap.sh timezone: UTC disable_root: false -disable_vmware_customization: true -network: - config: disabled +# Ensure we have an IPv4 address for localhost +manage_etc_hosts: localhost +# Ensure that cloud-init can override the hostname. preserve_hostname: false hostname: "{{ .MachineName }}" -final_message: "The system is ready after $UPTIME seconds" \ No newline at end of file +final_message: "The system is ready after $UPTIME seconds"