feat: Update cloud-init customization #11

Merged 2 commits on Oct 18, 2023
129 changes: 68 additions & 61 deletions controllers/cluster_scripts/cloud_init.tmpl
@@ -3,80 +3,87 @@ users:
- name: root
lock_passwd: false
write_files:
- path: /root/ {{- if .ControlPlane -}} control_plane {{- else -}} node {{- end -}} .sh
# On first boot, cloud-init writes all files defined in userdata. At the same time,
# VMware Guest Customization configures networking, and reboots the machine when it is done.
# Any files in /run are not preserved. We need cloud-init to fetch userdata and write the
# files again. We clear the cloud-init cache, and reboot. Cloud-init thinks it is the
# first boot, and fetches the userdata, and writes the files.
- path: /root/replace-userdata-files.sh
owner: root
content: |
#!/usr/bin/env bash
catch() {
vmtoolsd --cmd "info-set guestinfo.post_customization_script_execution_status $?"
ERROR_MESSAGE="$(date) $(caller): $BASH_COMMAND"
echo "$ERROR_MESSAGE" &>> /var/log/capvcd/customization/error.log
if [[ -s /root/kubeadm.err ]]
then
KUBEADM_FAILURE=$(cat /root/kubeadm.err)
ERROR_MESSAGE="$ERROR_MESSAGE $KUBEADM_FAILURE"
fi
vmtoolsd --cmd "info-set guestinfo.post_customization_script_execution_failure_reason $ERROR_MESSAGE"
function _log() {
echo "$(date -u +"%Y-%m-%d %H:%M:%S") $@" >> /var/log/capvcd/replace-userdata-files.log
}
mkdir -p /var/log/capvcd/customization
trap 'catch $? $LINENO' ERR EXIT
set -eEx

echo "$(date) Post Customization script execution in progress" &>> /var/log/capvcd/customization/status.log {{- if .ControlPlane }}
mkdir -p /var/log/capvcd

VCLOUD_BASIC_AUTH_PATH=/root/vcloud-basic-auth.yaml
VCLOUD_CONFIGMAP_PATH=/root/vcloud-configmap.yaml
VCLOUD_CCM_PATH=/root/cloud-director-ccm.yaml
VCLOUD_CSI_CONFIGMAP_PATH=/root/vcloud-csi-configmap.yaml
CSI_DRIVER_PATH=/root/csi-driver.yaml
CSI_CONTROLLER_PATH=/root/csi-controller.yaml
CSI_NODE_PATH=/root/csi-node.yaml {{- end }}
_log "Checking for kubeadm configuration file"
if [ -f /run/kubeadm/kubeadm.yaml ] || [ -f /run/kubeadm/kubeadm-join-config.yaml ]; then
_log "kubeadm configuration file found, exiting"
exit 0
fi
_log "kubeadm configuration file not found, cleaning cloud-init cache and rebooting"
cloud-init clean
reboot
- path: /root/bootstrap.sh
owner: root
content: |
#!/usr/bin/env bash

vmtoolsd --cmd "info-set guestinfo.postcustomization.networkconfiguration.status in_progress"
hostname "{{ .MachineName }}"
echo "::1 ipv6-localhost ipv6-loopback" >/etc/hosts
echo "127.0.0.1 localhost" >>/etc/hosts
echo "{{ .MachineName }}" >/etc/hostname
echo "127.0.0.1" `hostname` >>/etc/hosts
vmtoolsd --cmd "info-set guestinfo.postcustomization.networkconfiguration.status successful"
mkdir -p /var/log/capvcd
(
# Prefix timestamp to commands in trace output.
PS4='$(date -u +"%Y-%m-%d %H:%M:%S")\011'
set -o xtrace

vmtoolsd --cmd "info-set guestinfo.metering.status in_progress"
vmtoolsd --cmd "info-set guestinfo.metering.status successful"
# Exit on the first error. Does not apply to command lists, or pipelines.
set -o errexit

vmtoolsd --cmd "info-set guestinfo.postcustomization.proxy.setting.status in_progress"
vmtoolsd --cmd "info-set guestinfo.postcustomization.proxy.setting.status successful"
# Our images do not require any network customization,
# but CAPVCD requires a successful status to finish bootstrapping.
vmtoolsd --cmd "info-set guestinfo.postcustomization.networkconfiguration.status successful"

vmtoolsd --cmd "info-set {{ if .ControlPlane -}} guestinfo.postcustomization.kubeinit.status {{- else -}} guestinfo.postcustomization.kubeadm.node.join.status {{- end }} in_progress"
{{ .BootstrapRunCmd }}
if [[ ! -f /run/cluster-api/bootstrap-success.complete ]]
then
echo "file /run/cluster-api/bootstrap-success.complete not found" &>> /var/log/capvcd/customization/error.log
exit 1
fi
vmtoolsd --cmd "info-set {{ if .ControlPlane -}} guestinfo.postcustomization.kubeinit.status {{- else -}} guestinfo.postcustomization.kubeadm.node.join.status {{- end }} successful"
# Our images do not ship the VCD metering service,
# but CAPVCD requires a successful status to finish bootstrapping.
vmtoolsd --cmd "info-set guestinfo.metering.status successful"

vmtoolsd --cmd "info-set {{ if .ControlPlane -}} guestinfo.postcustomization.kubeinit.status {{- else -}} guestinfo.postcustomization.kubeadm.node.join.status {{- end }} in_progress"

# Run the preKubeadmCommands, and then kubeadm itself.
{{ .BootstrapRunCmd }}

# Kubeadm is the first command in a bash "list of commands," and its failure
# does not cause this subshell to exit. Therefore, we check the "sentinel" also created
# in the "list of commands," and exit if it is missing.
if [[ ! -f /run/cluster-api/bootstrap-success.complete ]]; then
echo "file /run/cluster-api/bootstrap-success.complete not found"
exit 1
fi

vmtoolsd --cmd "info-set {{ if .ControlPlane -}} guestinfo.postcustomization.kubeinit.status {{- else -}} guestinfo.postcustomization.kubeadm.node.join.status {{- end }} successful"

exit 0
) &>> /var/log/capvcd/bootstrap.log
bootstrap_exit_code=$?

# Write the exit code to the VM metadata.
vmtoolsd --cmd "info-set guestinfo.post_customization_script_execution_status $bootstrap_exit_code"

# Use the last lines of the bootstrap log to give context about any failure.
TAIL_LOG="$(tail --lines=10 /var/log/capvcd/bootstrap.log)"
vmtoolsd --cmd "info-set guestinfo.post_customization_script_execution_failure_reason $TAIL_LOG"

echo "$(date) post customization script execution completed" &>> /var/log/capvcd/customization/status.log
exit 0
# Write cloud-init output for additional context.
vmtoolsd --cmd "info-set guestinfo.post_customization_cloud_init_output $(</var/log/cloud-init-output.log)"
runcmd:
- 'sudo cloud-init clean --seed --logs'
- 'sudo cat /dev/null > /var/log/cloud-init-output.log'
{{ if .ControlPlane }}
- '[ ! -f /run/kubeadm/konvoy-set-kube-proxy-configuration.sh] && sudo reboot'
Collaborator:
Were you able to boot the VM and create a cluster after removing this file? I remember that pre-kubeadm commands were failing if I removed them. I will test it out later to confirm.

Collaborator (Author):
Good question!

You're right that any preKubeadmCommand that requires an ordinary file in /run will fail after a reboot, because ordinary files in /run do not persist across reboots. We recently (in https://github.com/mesosphere/konvoy2/pull/2337) moved all patch scripts from /run to /etc for this reason.
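As an aside, the reason files in /run do not survive a reboot is that /run is mounted as tmpfs, a RAM-backed filesystem. A quick way to check where a path really lives, sketched with a hypothetical `fs_type` helper (GNU stat; not part of the template):

```shell
# fs_type is a hypothetical helper; stat -f reports filesystem (not file)
# status, and %T prints the filesystem type name, e.g. "tmpfs" for /run
# on most Linux distributions.
fs_type() {
  stat -f -c %T "$1" 2>/dev/null || echo unknown
}

fs_type /run   # usually "tmpfs": contents are lost on reboot
fs_type /etc   # usually a disk-backed filesystem: contents persist
```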

Your question made me wonder about the two reboot calls left in this template:

{{ if .ControlPlane }}
- '[ ! -f /root/control_plane.sh ] && sudo reboot'
- '[ ! -f /run/kubeadm/kubeadm.yaml ] && sudo reboot'
- bash /root/control_plane.sh
{{ else }}
- '[ ! -f /root/node.sh ] && sudo reboot'
- '[ ! -f /run/kubeadm/kubeadm-join-config.yaml ] && sudo reboot'
- bash /root/node.sh
{{ end }}

It seems harmless to reboot if the kubeadm config (/run/kubeadm/kubeadm.yaml or /run/kubeadm/kubeadm-join-config.yaml) is not there (yet?).

But if we reboot because the bootstrap script (/root/control_plane.sh or /root/node.sh) is missing, and the kubeadm config happens to already be present, we will lose the kubeadm config after the reboot, leading to further reboots, without end.
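To make the ordering hazard concrete, here is a sketch of the problematic state (paths as in the template above; purely illustrative, not part of the change):

```shell
# Illustrative only: if the bootstrap script is missing while the kubeadm
# config is present, a reboot discards the config (it lives on /run, a
# tmpfs), so after the reboot the config check also fails and triggers
# another reboot, and so on without end.
bootstrap_script=/root/control_plane.sh    # /root/node.sh on workers
kubeadm_config=/run/kubeadm/kubeadm.yaml   # kubeadm-join-config.yaml on workers

if [ ! -f "$bootstrap_script" ] && [ -f "$kubeadm_config" ]; then
  echo "a reboot here would lose $kubeadm_config and start an endless loop"
fi
```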

At this time (66ecf82), I can successfully reboot either a control plane or a worker machine.

I think it may be better to remove the remaining reboot calls. I will experiment.

Collaborator (Author):

I've added a comment that explains why the reboot call is necessary. I've also moved these checks out to their own script, and use a separate log file to keep track.

This is what the log looks like:

# cat /var/log/capvcd/replace-userdata-files.log
2023-10-17 22:07:37 Checking for kubeadm configuration file
2023-10-17 22:07:37 kubeadm configuration file not found, cleaning cloud-init cache and rebooting
2023-10-17 22:08:12 Checking for kubeadm configuration file
2023-10-17 22:08:12 kubeadm configuration file found, exiting

Collaborator (@supershal), Oct 17, 2023:

Pretty clever. Just reiterating the logic: until the user-data is available, the VM will reboot, and cloud-init will run as if it's the first boot. Once the user-data is available, bootstrap.sh will run kubeadm init/join.

Collaborator (Author):

> until the user-data is available, the VM will reboot and cloud-init will run as if it's the first boot. Once the user-data is available, bootstrap.sh will run kubeadm init/join.

Correct. The upstream cloud-init template also did this, but used inline commands instead of a script, and the reasoning behind it was not documented.

- '[ ! -f /run/konvoy/containerd-apply-patches.sh] && sudo reboot'
- '[ ! -f /run/konvoy/restart-containerd-and-wait.sh] && sudo reboot'
- '[ ! -f /root/control_plane.sh ] && sudo reboot'
- '[ ! -f /run/kubeadm/kubeadm.yaml ] && sudo reboot'
- bash /root/control_plane.sh
{{ else }}
- '[ ! -f /root/node.sh ] && sudo reboot'
- '[ ! -f /run/kubeadm/kubeadm-join-config.yaml ] && sudo reboot'
- bash /root/node.sh
{{ end }}
- bash /root/replace-userdata-files.sh
- bash /root/bootstrap.sh
timezone: UTC
disable_root: false
disable_vmware_customization: true
network:
config: disabled
# Ensure we have an IPv4 address for localhost
manage_etc_hosts: localhost
# Ensure that cloud-init can override the hostname.
preserve_hostname: false
hostname: "{{ .MachineName }}"
final_message: "The system is ready after $UPTIME seconds"