Skip to content

Commit

Permalink
Merge pull request kubernetes#42555 from nicksardo/upgrade-ingress-flake
Browse files Browse the repository at this point in the history
Automatic merge from submit-queue (batch tested with PRs 42080, 41653, 42598, 42555)

Fix resource cleanup in ingress_utils.go within e2e/framework

**What this PR does / why we need it**:
The GLBC is failing to delete resources during the etcd rollback tests and the e2e cleanup is leaking them. After a short while, tests are failing to create new resources. 
This PR addresses the e2e/framework's ability to delete GLBC-created resources and adds more logging.

**Which issue this PR fixes**:
Helps kubernetes#38569 but does not completely close this flake

**Special notes for your reviewer**:
Resources were not being deleted because resource names were being truncated and then their ability to be deleted was determined by the entire cluster id existing in the name. Truncated names also have an extra '0' append to the end of their name (unknown origin). This PR tries to match on a common prefix. 

Minor changes were made to improve log readability. 

**Testing this PR**:
This was tested by running a master upgrade test and by adding a second forwarding-rule mid-run.  This forwarding rule referenced the same url-map used by the first forwarding-rule created by the GLBC. Therefore, the GLBC will be able to delete the forwarding-rule but not anymore L7 resources. This second forwarding rule's name was nearly identical to the first forwarding rule so that the cleanup code will find it. 

As you can see from the test run below, the cleanup code deleted all the resources that the GLBC could not.

```log
...
Mar  5 18:35:53.112: INFO: Monitoring glbc's cleanup of gce resources:
k8s-fws-e2e-tests-ingress-upgrsde-0px85-static-ip--5f38ac0e2420 (forwarding rule)
k8s-tps-e2e-tests-ingress-upgrade-0px85-static-ip--5f38ac0e2420 (target-https-proxy)
k8s-um-e2e-tests-ingress-upgrade-0px85-static-ip--5f38ac0e24260 (url-map)
k8s-be-32331--5f38ac0e2426f796 (backend-service)
k8s-be-32613--5f38ac0e2426f796 (backend-service)
k8s-be-32331--5f38ac0e2426f796 (http-health-check)
k8s-be-32613--5f38ac0e2426f796 (http-health-check)
k8s-ig--5f38ac0e2426f796 (instance-group)
k8s-ssl-e2e-tests-ingress-upgrade-0px85-static-ip--5f38ac0e2420 (ssl-certificate)

STEP: Performing final delete of any remaining resources
Mar  5 18:35:54.055: INFO: Deleting forwarding-rules: k8s-fws-e2e-tests-ingress-upgrsde-0px85-static-ip--5f38ac0e2420
Mar  5 18:36:06.945: INFO: Deleting target-https-proxies: k8s-tps-e2e-tests-ingress-upgrade-0px85-static-ip--5f38ac0e2420
Mar  5 18:36:14.301: INFO: Deleting url-map: k8s-um-e2e-tests-ingress-upgrade-0px85-static-ip--5f38ac0e24260
Mar  5 18:36:18.309: INFO: Deleting backed-service: k8s-be-32331--5f38ac0e2426f796
Mar  5 18:36:22.112: INFO: Deleting backed-service: k8s-be-32613--5f38ac0e2426f796
Mar  5 18:36:26.192: INFO: Deleting http-health-check: k8s-be-32331--5f38ac0e2426f796
Mar  5 18:36:29.846: INFO: Deleting http-health-check: k8s-be-32613--5f38ac0e2426f796
Mar  5 18:36:33.722: INFO: Deleting instance-group: k8s-ig--5f38ac0e2426f796
Mar  5 18:36:37.762: INFO: Deleting ssl-certificate: k8s-ssl-e2e-tests-ingress-upgrade-0px85-static-ip--5f38ac0e2420
STEP: No resources leaked.
Mar  5 18:36:46.441: INFO: Deleting addresses: e2e-tests-ingress-upgrade-0px85-static-ip
Mar  5 18:36:53.902: INFO: L7 controller failed to delete all cloud resources on time. timed out waiting for the condition
...
```
  • Loading branch information
Kubernetes Submit Queue authored Mar 7, 2017
2 parents 8e52bec + 6bba665 commit 20c13c1
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 35 deletions.
81 changes: 53 additions & 28 deletions test/e2e/framework/ingress_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ const (
// Cloud resources created by the ingress controller older than this
// are automatically purged to prevent running out of quota.
// TODO(37335): write soak tests and bump this up to a week.
maxAge = -48 * time.Hour
maxAge = 48 * time.Hour

// IngressManifestPath is the parent path to yaml test manifests.
IngressManifestPath = "test/e2e/testing-manifests/ingress"
Expand Down Expand Up @@ -335,17 +335,26 @@ func createIngressTLSSecret(kubeClient clientset.Interface, ing *extensions.Ingr
func CleanupGCEIngressController(gceController *GCEIngressController) {
pollErr := wait.Poll(5*time.Second, LoadBalancerCleanupTimeout, func() (bool, error) {
if err := gceController.Cleanup(false); err != nil {
Logf("Still waiting for glbc to cleanup:\n%v", err)
Logf("Monitoring glbc's cleanup of gce resources:\n%v", err)
return false, nil
}
return true, nil
})

// Always try to cleanup even if pollErr == nil, because the cleanup
// routine also purges old leaked resources based on creation timestamp.
By("Performing final delete of any remaining resources")
if cleanupErr := gceController.Cleanup(true); cleanupErr != nil {
By(fmt.Sprintf("WARNING: possibly leaked resources: %v\n", cleanupErr))
} else {
By("No resources leaked.")
}

// Static-IP allocated on behalf of the test, never deleted by the
// controller. Delete this IP only after the controller has had a chance
// to cleanup or it might interfere with the controller, causing it to
// throw out confusing events.
if ipErr := wait.Poll(5*time.Second, LoadBalancerCleanupTimeout, func() (bool, error) {
if ipErr := wait.Poll(5*time.Second, 1*time.Minute, func() (bool, error) {
if err := gceController.deleteStaticIPs(); err != nil {
Logf("Failed to delete static-ip: %v\n", err)
return false, nil
Expand All @@ -357,14 +366,6 @@ func CleanupGCEIngressController(gceController *GCEIngressController) {
By(fmt.Sprintf("WARNING: possibly leaked static IP: %v\n", ipErr))
}

// Always try to cleanup even if pollErr == nil, because the cleanup
// routine also purges old leaked resources based on creation timestamp.
if cleanupErr := gceController.Cleanup(true); cleanupErr != nil {
By(fmt.Sprintf("WARNING: possibly leaked resources: %v\n", cleanupErr))
} else {
By("No resources leaked.")
}

// Fail if the controller didn't cleanup
if pollErr != nil {
Failf("L7 controller failed to delete all cloud resources on time. %v", pollErr)
Expand All @@ -383,9 +384,10 @@ func (cont *GCEIngressController) deleteForwardingRule(del bool) string {
if !cont.canDelete(f.Name, f.CreationTimestamp, del) {
continue
}
msg += fmt.Sprintf("%v (forwarding rule)\n", f.Name)
if del {
GcloudComputeResourceDelete("forwarding-rules", f.Name, cont.Cloud.ProjectID, "--global")
} else {
msg += fmt.Sprintf("%v (forwarding rule)\n", f.Name)
}
}
}
Expand All @@ -402,9 +404,10 @@ func (cont *GCEIngressController) deleteAddresses(del bool) string {
if !cont.canDelete(ip.Name, ip.CreationTimestamp, del) {
continue
}
msg += fmt.Sprintf("%v (static-ip)\n", ip.Name)
if del {
GcloudComputeResourceDelete("addresses", ip.Name, cont.Cloud.ProjectID, "--global")
} else {
msg += fmt.Sprintf("%v (static-ip)\n", ip.Name)
}
}
}
Expand All @@ -421,9 +424,10 @@ func (cont *GCEIngressController) deleteTargetProxy(del bool) string {
if !cont.canDelete(t.Name, t.CreationTimestamp, del) {
continue
}
msg += fmt.Sprintf("%v (target-http-proxy)\n", t.Name)
if del {
GcloudComputeResourceDelete("target-http-proxies", t.Name, cont.Cloud.ProjectID)
} else {
msg += fmt.Sprintf("%v (target-http-proxy)\n", t.Name)
}
}
}
Expand All @@ -435,9 +439,10 @@ func (cont *GCEIngressController) deleteTargetProxy(del bool) string {
if !cont.canDelete(t.Name, t.CreationTimestamp, del) {
continue
}
msg += fmt.Sprintf("%v (target-https-proxy)\n", t.Name)
if del {
GcloudComputeResourceDelete("target-https-proxies", t.Name, cont.Cloud.ProjectID)
} else {
msg += fmt.Sprintf("%v (target-https-proxy)\n", t.Name)
}
}
}
Expand All @@ -460,12 +465,14 @@ func (cont *GCEIngressController) deleteURLMap(del bool) (msg string) {
if !cont.canDelete(um.Name, um.CreationTimestamp, del) {
continue
}
msg += fmt.Sprintf("%v (url-map)\n", um.Name)
if del {
Logf("Deleting url-map: %s", um.Name)
if err := gceCloud.DeleteUrlMap(um.Name); err != nil &&
!cont.isHTTPErrorCode(err, http.StatusNotFound) {
msg += fmt.Sprintf("Failed to delete url map %v\n", um.Name)
}
} else {
msg += fmt.Sprintf("%v (url-map)\n", um.Name)
}
}
return msg
Expand All @@ -488,12 +495,14 @@ func (cont *GCEIngressController) deleteBackendService(del bool) (msg string) {
if !cont.canDelete(be.Name, be.CreationTimestamp, del) {
continue
}
msg += fmt.Sprintf("%v (backend-service)\n", be.Name)
if del {
Logf("Deleting backed-service: %s", be.Name)
if err := gceCloud.DeleteBackendService(be.Name); err != nil &&
!cont.isHTTPErrorCode(err, http.StatusNotFound) {
msg += fmt.Sprintf("Failed to delete backend service %v\n", be.Name)
msg += fmt.Sprintf("Failed to delete backend service %v: %v\n", be.Name, err)
}
} else {
msg += fmt.Sprintf("%v (backend-service)\n", be.Name)
}
}
return msg
Expand All @@ -515,12 +524,14 @@ func (cont *GCEIngressController) deleteHTTPHealthCheck(del bool) (msg string) {
if !cont.canDelete(hc.Name, hc.CreationTimestamp, del) {
continue
}
msg += fmt.Sprintf("%v (http-health-check)\n", hc.Name)
if del {
Logf("Deleting http-health-check: %s", hc.Name)
if err := gceCloud.DeleteHttpHealthCheck(hc.Name); err != nil &&
!cont.isHTTPErrorCode(err, http.StatusNotFound) {
msg += fmt.Sprintf("Failed to delete HTTP health check %v\n", hc.Name)
}
} else {
msg += fmt.Sprintf("%v (http-health-check)\n", hc.Name)
}
}
return msg
Expand All @@ -540,12 +551,14 @@ func (cont *GCEIngressController) deleteSSLCertificate(del bool) (msg string) {
if !cont.canDelete(s.Name, s.CreationTimestamp, del) {
continue
}
msg += fmt.Sprintf("%v (ssl-certificate)\n", s.Name)
if del {
Logf("Deleting ssl-certificate: %s", s.Name)
if err := gceCloud.DeleteSslCertificate(s.Name); err != nil &&
!cont.isHTTPErrorCode(err, http.StatusNotFound) {
msg += fmt.Sprintf("Failed to delete ssl certificates: %v\n", s.Name)
}
} else {
msg += fmt.Sprintf("%v (ssl-certificate)\n", s.Name)
}
}
}
Expand All @@ -570,12 +583,14 @@ func (cont *GCEIngressController) deleteInstanceGroup(del bool) (msg string) {
if !cont.canDelete(ig.Name, ig.CreationTimestamp, del) {
continue
}
msg += fmt.Sprintf("%v (instance-group)\n", ig.Name)
if del {
Logf("Deleting instance-group: %s", ig.Name)
if err := gceCloud.DeleteInstanceGroup(ig.Name, cont.Cloud.Zone); err != nil &&
!cont.isHTTPErrorCode(err, http.StatusNotFound) {
msg += fmt.Sprintf("Failed to delete instance group %v\n", ig.Name)
}
} else {
msg += fmt.Sprintf("%v (instance-group)\n", ig.Name)
}
}
return msg
Expand All @@ -587,11 +602,21 @@ func (cont *GCEIngressController) deleteInstanceGroup(del bool) (msg string) {
// Ingress cloud resources.
func (cont *GCEIngressController) canDelete(resourceName, creationTimestamp string, delOldResources bool) bool {
// ignore everything not created by an ingress controller.
if !strings.HasPrefix(resourceName, k8sPrefix) || len(strings.Split(resourceName, clusterDelimiter)) != 2 {
splitName := strings.Split(resourceName, clusterDelimiter)
if !strings.HasPrefix(resourceName, k8sPrefix) || len(splitName) != 2 {
return false
}

// Resources created by the GLBC have a "0"" appended to the end if truncation
// occurred. Removing the zero allows the following match.
truncatedClusterUID := splitName[1]
if len(truncatedClusterUID) >= 1 && strings.HasSuffix(truncatedClusterUID, "0") {
truncatedClusterUID = truncatedClusterUID[:len(truncatedClusterUID)-1]
}

// always delete things that are created by the current ingress controller.
if strings.HasSuffix(resourceName, cont.UID) {
// Because of resource name truncation, this looks for a common prefix
if strings.HasPrefix(cont.UID, truncatedClusterUID) {
return true
}
if !delOldResources {
Expand All @@ -602,7 +627,7 @@ func (cont *GCEIngressController) canDelete(resourceName, creationTimestamp stri
Logf("WARNING: Failed to parse creation timestamp %v for %v: %v", creationTimestamp, resourceName, err)
return false
}
if createdTime.Before(time.Now().Add(maxAge)) {
if time.Since(createdTime) > maxAge {
Logf("%v created on %v IS too old", resourceName, creationTimestamp)
return true
}
Expand Down Expand Up @@ -632,9 +657,10 @@ func (cont *GCEIngressController) deleteFirewallRule(del bool) (msg string) {
if !cont.canDelete(f.Name, f.CreationTimestamp, del) {
continue
}
msg += fmt.Sprintf("%v (firewall rule)\n", f.Name)
if del {
GcloudComputeResourceDelete("firewall-rules", f.Name, cont.Cloud.ProjectID)
} else {
msg += fmt.Sprintf("%v (firewall rule)\n", f.Name)
}
}
}
Expand All @@ -648,8 +674,7 @@ func (cont *GCEIngressController) isHTTPErrorCode(err error, code int) bool {

// Cleanup cleans up cloud resources.
// If del is false, it simply reports existing resources without deleting them.
// It always deletes resources created through it's methods, like staticIP, even
// if del is false.
// If dle is true, it deletes resources it finds acceptable (see canDelete func).
func (cont *GCEIngressController) Cleanup(del bool) error {
// Ordering is important here because we cannot delete resources that other
// resources hold references to.
Expand Down Expand Up @@ -737,7 +762,7 @@ func gcloudComputeResourceList(resource, regex, project string, out interface{})
// so we only look at stdout.
command := []string{
"compute", resource, "list",
fmt.Sprintf("--regexp=%v", regex),
fmt.Sprintf("--regexp=%q", regex),
fmt.Sprintf("--project=%v", project),
"-q", "--format=json",
}
Expand Down
16 changes: 9 additions & 7 deletions test/e2e/upgrades/ingress.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ type IngressUpgradeTest struct {
jig *framework.IngressTestJig
httpClient *http.Client
ip string
ipName string
}

// Setup creates a GLBC, allocates an ip, and an ingress resource,
Expand All @@ -59,12 +60,13 @@ func (t *IngressUpgradeTest) Setup(f *framework.Framework) {
t.httpClient = framework.BuildInsecureClient(framework.IngressReqTimeout)

// Allocate a static-ip for the Ingress, this IP is cleaned up via CleanupGCEIngressController
t.ip = t.gceController.CreateStaticIP(ns.Name)
t.ipName = fmt.Sprintf("%s-static-ip", ns.Name)
t.ip = t.gceController.CreateStaticIP(t.ipName)

// Create a working basic Ingress
By(fmt.Sprintf("allocated static ip %v: %v through the GCE cloud provider", ns.Name, t.ip))
By(fmt.Sprintf("allocated static ip %v: %v through the GCE cloud provider", t.ipName, t.ip))
jig.CreateIngress(filepath.Join(framework.IngressManifestPath, "static-ip"), ns.Name, map[string]string{
"kubernetes.io/ingress.global-static-ip-name": ns.Name,
"kubernetes.io/ingress.global-static-ip-name": t.ipName,
"kubernetes.io/ingress.allow-http": "false",
})

Expand Down Expand Up @@ -96,12 +98,12 @@ func (t *IngressUpgradeTest) Teardown(f *framework.Framework) {
if CurrentGinkgoTestDescription().Failed {
framework.DescribeIng(t.gceController.Ns)
}
if t.jig.Ingress == nil {
if t.jig.Ingress != nil {
By("Deleting ingress")
t.jig.DeleteIngress()
} else {
By("No ingress created, no cleanup necessary")
return
}
By("Deleting ingress")
t.jig.DeleteIngress()

By("Cleaning up cloud resources")
framework.CleanupGCEIngressController(t.gceController)
Expand Down

0 comments on commit 20c13c1

Please sign in to comment.