diff --git a/config.yaml b/config.yaml index 9e6de507..8785db1d 100644 --- a/config.yaml +++ b/config.yaml @@ -152,6 +152,19 @@ options: description: | Space-separated list of extra SAN entries to add to the x509 certificate created for the control plane nodes. + ha-cluster-vip: + type: string + description: | + Virtual IP for the charm to use with the HA Cluster subordinate charm. + Mutually exclusive with ha-cluster-dns. Multiple virtual IPs are + separated by spaces. + default: "" + ha-cluster-dns: + type: string + description: | + DNS entry to use with the HA Cluster subordinate charm. + Mutually exclusive with ha-cluster-vip. + default: "" image-registry: type: string default: "rocks.canonical.com:443/cdk" diff --git a/metadata.yaml b/metadata.yaml index ac5129a5..358b58e2 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -45,6 +45,8 @@ requires: interface: kube-dns etcd: interface: etcd + ha: + interface: hacluster loadbalancer-external: # Indicates that the LB should be public facing. Intended for clients which # must reach the API server via external networks. 
diff --git a/requirements.txt b/requirements.txt index 438f8f06..66c6068d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ charm-lib-node-base @ git+https://github.com/charmed-kubernetes/layer-kubernetes charm-lib-reconciler @ git+https://github.com/charmed-kubernetes/charm-lib-reconciler cosl == 0.0.7 gunicorn >= 20.0.0,<21.0.0 +interface_hacluster @ git+https://github.com/openstack/charm-interface-hacluster jinja2 loadbalancer_interface ops >= 2.2.0 diff --git a/src/charm.py b/src/charm.py index bd2a1c70..ab4dfd4f 100755 --- a/src/charm.py +++ b/src/charm.py @@ -29,10 +29,11 @@ from charms.node_base import LabelMaker from charms.reconciler import Reconciler from cos_integration import COSIntegration +from hacluster import HACluster from k8s_api_endpoints import K8sApiEndpoints from kubectl import kubectl from loadbalancer_interface import LBProvider -from ops import BlockedStatus, ModelError, WaitingStatus +from ops import BlockedStatus, MaintenanceStatus, ModelError, WaitingStatus from ops.interface_kube_control import KubeControlProvides from ops.interface_tls_certificates import CertificatesRequires @@ -64,6 +65,7 @@ def __init__(self, *args): ) self.etcd = EtcdReactiveRequires(self) self.node_base = LabelMaker(self, kubeconfig_path="/root/.kube/config") + self.hacluster = HACluster(self, self.config) self.k8s_api_endpoints = K8sApiEndpoints(self) self.kube_control = KubeControlProvides(self, endpoint="kube-control") self.kube_dns = KubeDnsRequires(self, endpoint="dns-provider") @@ -72,6 +74,7 @@ def __init__(self, *args): self.external_cloud_provider = ExternalCloudProvider(self, "external-cloud-provider") self.reconciler = Reconciler(self, self.reconcile) self.tokens = TokensProvider(self, endpoint="tokens") + self.framework.observe(self.on.update_status, self.update_status) def api_dependencies_ready(self): common_name = kubernetes_snaps.get_public_address() @@ -143,6 +146,17 @@ def configure_controller_manager(self): 
def configure_hacluster(self):
    """Push the VIP/DNS configuration to HACluster once the relation is ready.

    Does nothing while ``self.hacluster.is_ready`` is false. Otherwise it
    reports a maintenance status, asks the HACluster helper to drop VIPs
    that are no longer configured, and then (re)applies the configuration.

    ``HACluster.configure_hacluster()`` raises
    ``HAClusterConfigMismatchError`` when both ``ha-cluster-vip`` and
    ``ha-cluster-dns`` are set. That is an operator mistake, so it is
    reported as a Blocked status rather than left to propagate.
    """
    if not self.hacluster.is_ready:
        return

    # Imported locally: only HACluster itself is imported at module level.
    from hacluster import HAClusterConfigMismatchError

    status.add(MaintenanceStatus("Configuring HACluster"))
    self.hacluster.update_vips()
    try:
        self.hacluster.configure_hacluster()
    except HAClusterConfigMismatchError as err:
        # NOTE(review): assumes status.add() accepts a BlockedStatus the same
        # way it accepts the MaintenanceStatus above -- confirm.
        status.add(BlockedStatus(err.message))
    # Note that we do not register any systemd services with HACluster.
    # We used to register the Kubernetes control plane services, but
    # that meant Pacemaker would take over managing the services, and
    # often would not start them when it should. Long history of bugs
    # there.
"""HACluster integration module."""

import logging
import subprocess
from typing import List, Optional

import ops
from cached_property import cached_property
from interface_hacluster.ops_ha_interface import HAServiceRequires
from ops.framework import Object, StoredState
from ops.model import Relation

log = logging.getLogger(__name__)


class HAClusterConfigMismatchError(Exception):
    """Raised when the HA cluster configuration options conflict.

    The text is available both through ``str(exc)`` and ``exc.message``.
    """

    def __init__(self, message):
        super().__init__(message)
        self.message = message


class HACluster(Object):
    """Integrate the charm with the HACluster subordinate over a relation.

    Wraps an ``HAServiceRequires`` interface object and keeps bookkeeping in
    stored state:

    * ``current_services`` -- systemd services already handed to the interface
    * ``desired_services`` -- services queued to be added
    * ``deleted_services`` -- services queued to be removed
    * ``vips`` / ``dns``   -- the VIPs / DNS records last configured
    """

    state = StoredState()

    def __init__(self, charm: ops.CharmBase, config, endpoint="ha"):
        """Create the helper.

        Args:
            charm: The charm that owns this object.
            config: Mapping holding the ``ha-cluster-vip`` and
                ``ha-cluster-dns`` options.
            endpoint: Name of the hacluster relation endpoint.
        """
        super().__init__(charm, f"relation-{endpoint}")
        self.charm = charm
        self.endpoint = endpoint
        self.config = config
        self.interface = HAServiceRequires(self.charm, endpoint)

        self.state.set_default(
            current_services={},
            desired_services={},
            deleted_services={},
            vips=set(),
            dns=set(),
        )

    def _config_words(self, key: str) -> List[str]:
        """Return the whitespace-separated entries of config option *key*.

        A missing or ``None`` value is treated as an empty string, so the
        result is an empty list rather than an ``AttributeError``.
        """
        return (self.config.get(key) or "").split()

    def _configure_dns(self, dns_records: List[str]):
        """Register every DNS record with the interface and remember them.

        Each record is registered against the ingress address of this
        endpoint's network binding, with the endpoint type ``"public"``.
        """
        binding = self.charm.model.get_binding(self.endpoint)
        address = binding.network.ingress_address
        for dns_record in dns_records:
            self.interface.add_dnsha(self._unit_name, address, dns_record, "public")

        self.state.dns = set(dns_records)

    def _configure_vips(self, vips: List[str]):
        """Register every VIP with the interface and remember them."""
        for vip in vips:
            self.interface.add_vip(self._unit_name, vip)
        self.state.vips = set(vips)

    @cached_property
    def _unit_name(self):
        """Return the part of the unit name before the ``/`` separator.

        For a unit called ``foo/0`` this is ``foo``.
        """
        return self.charm.unit.name.split("/")[0]

    def _update_services(self):
        """Flush queued systemd service removals and additions to the interface."""
        current_services = self.state.current_services
        deleted_services = self.state.deleted_services
        desired_services = self.state.desired_services

        for name, service in deleted_services.items():
            self.interface.remove_systemd_service(name, service)
            # Forget the service as well; otherwise add_service() would treat
            # it as still registered and refuse to queue it again.
            if name in current_services:
                del current_services[name]

        for name, service in desired_services.items():
            self.interface.add_systemd_service(name, service)
            current_services[name] = service

        deleted_services.clear()
        desired_services.clear()

    def add_service(self, name, service_name):
        """Queue a service to be added to the HA cluster.

        The request is ignored when *name* is already a current service.

        Args:
            name (str): The key name of the service.
            service_name (str): The name of the service to be added.
        """
        if name not in self.state.current_services:
            self.state.desired_services[name] = service_name

    def configure_hacluster(self):
        """Configure the HACluster relation with VIPs or DNS records.

        Applies ``ha-cluster-vip`` if set, otherwise ``ha-cluster-dns`` if
        set, then flushes queued service changes and binds the resources.

        Raises:
            HAClusterConfigMismatchError: If both options are set.
        """
        vips = self._config_words("ha-cluster-vip")
        dns_records = self._config_words("ha-cluster-dns")
        if vips and dns_records:
            msg = "Unsupported config. ha-cluster-vip and ha-cluster-dns cannot both be set."
            log.warning(msg)
            raise HAClusterConfigMismatchError(msg)
        if vips:
            self._configure_vips(vips)
        elif dns_records:
            self._configure_dns(dns_records)

        self._update_services()

        self.interface.bind_resources()

    @property
    def is_ready(self):
        """Check if the HACluster integration is ready.

        Returns:
            bool: True if the relation exists and has at least one remote
            unit, False otherwise.
        """
        relation = self.relation
        return bool(relation and relation.units)

    @property
    def relation(self) -> Optional[Relation]:
        """Get the HACluster relation, or None when it does not exist."""
        return self.model.get_relation(self.endpoint)

    def remove_service(self, name, service_name):
        """Queue a service for removal from the HA cluster.

        A current service is queued for deletion; a service that was only
        queued to be added is dropped from that queue.

        Args:
            name (str): The key name of the service.
            service_name (str): The name of the service to be removed.
        """
        current_services = self.state.current_services
        deleted_services = self.state.deleted_services
        desired_services = self.state.desired_services

        if name in current_services:
            deleted_services[name] = service_name

        if name in desired_services:
            del desired_services[name]

    def set_node_online(self):
        """Set pacemaker node to online.

        Raises:
            subprocess.CalledProcessError: If the ``crm`` command fails.
        """
        log.info("Setting pacemaker node status to online")
        subprocess.check_call(["crm", "-w", "-F", "node", "online"])

    def set_node_standby(self):
        """Set pacemaker node to standby, forcing VIPs to failover to other nodes.

        Raises:
            subprocess.CalledProcessError: If the ``crm`` command fails.
        """
        log.warning("Setting pacemaker node status to standby")
        subprocess.check_call(["crm", "-w", "-F", "node", "standby"])

    def update_vips(self):
        """Remove from the interface any stored VIP that is no longer configured.

        Compares the VIPs remembered in stored state with the current
        ``ha-cluster-vip`` option; new VIPs are added by
        ``configure_hacluster()``, not here.
        """
        configured_vips = set(self._config_words("ha-cluster-vip"))
        stale_vips = self.state.vips - configured_vips

        for vip in stale_vips:
            self.interface.remove_vip(self._unit_name, vip)
""" - addresses = self.charm.model.config["loadbalancer-ips"].split() + addresses = self.charm.config["loadbalancer-ips"].split() if addresses: - return build_url(addresses[0]) + return build_url(addresses[0], 6443) + + if self.charm.hacluster.is_ready: + for key in ["ha-cluster-vip", "ha-cluster-dns"]: + addresses = self.charm.config[key].split() + if addresses: + return build_url(addresses[0], 6443) def from_lb_external(self) -> Optional[str]: """Endpoint URL from the loadbalancer-external relation."""