Strip containerID from events to reduce spam

pull/6/head
Derek Carr 2017-06-13 18:04:19 -04:00
parent 0a1b7d94b4
commit a02f10fa3a
2 changed files with 41 additions and 21 deletions

View File

@@ -24,6 +24,7 @@ import (
"os"
"path/filepath"
"sort"
"strings"
"sync"
"time"
@@ -45,6 +46,30 @@ import (
"k8s.io/kubernetes/pkg/util/tail"
)
+ // recordContainerEvent should be used by the runtime manager for all container related events.
+ // it has sanity checks to ensure that we do not write events that can abuse our masters.
+ // in particular, it ensures that a containerID never appears in an event message as that
+ // is prone to causing a lot of distinct events that do not count well.
+ // it replaces any reference to a containerID with the containerName which is stable, and is what users know.
+ func (m *kubeGenericRuntimeManager) recordContainerEvent(pod *v1.Pod, container *v1.Container, containerID, eventType, reason, message string, args ...interface{}) {
+ ref, err := kubecontainer.GenerateContainerRef(pod, container)
+ if err != nil {
+ glog.Errorf("Can't make a ref to pod %q, container %v: %v", format.Pod(pod), container.Name, err)
+ return
+ }
+ eventMessage := message
+ if len(args) > 0 {
+ eventMessage = fmt.Sprintf(message, args...)
+ }
+ // this is a hack, but often the error from the runtime includes the containerID
+ // which kills our ability to deduplicate events. this protection makes a huge
+ // difference in the number of unique events
+ if containerID != "" {
+ eventMessage = strings.Replace(eventMessage, containerID, container.Name, -1)
+ }
+ m.recorder.Event(events.ToObjectReference(ref), eventType, reason, eventMessage)
+ }
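
The effect of the substitution above is easiest to see in isolation: every container instance gets a fresh ID, so an error string that embeds the ID is unique per restart and defeats event deduplication, while the container name stays stable across restarts. A minimal standalone sketch of the same idea (sanitizeEventMessage, the sample IDs, and the container name are illustrative only, not part of this commit):

package main

import (
	"fmt"
	"strings"
)

// sanitizeEventMessage mirrors the replacement done in recordContainerEvent:
// any occurrence of the per-instance containerID in the message is rewritten
// to the stable container name.
func sanitizeEventMessage(message, containerID, containerName string) string {
	if containerID == "" {
		return message
	}
	return strings.Replace(message, containerID, containerName, -1)
}

func main() {
	// Two restarts of the same container; the runtime error embeds a fresh ID each time.
	failures := map[string]string{
		"3f2a9c0d71e4": `failed to start container "3f2a9c0d71e4": oci runtime error`,
		"9d41be52c07a": `failed to start container "9d41be52c07a": oci runtime error`,
	}
	for id, msg := range failures {
		fmt.Println(sanitizeEventMessage(msg, id, "nginx"))
	}
	// Both printed lines are now identical, so the event recorder can aggregate
	// them into a single event with an incremented count instead of emitting
	// two distinct events.
}
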
// startContainer starts a container and returns a message indicates why it is failed on error.
// It starts the container through the following steps:
// * pull the image
@@ -74,15 +99,16 @@ func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandb
containerConfig, err := m.generateContainerConfig(container, pod, restartCount, podIP, imageRef)
if err != nil {
- m.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedToCreateContainer, "Failed to create container with error: %v", err)
+ m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", err)
return "Generate Container Config Failed", err
}
containerID, err := m.runtimeService.CreateContainer(podSandboxID, containerConfig, podSandboxConfig)
if err != nil {
- m.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedToCreateContainer, "Failed to create container with error: %v", err)
+ m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", err)
return "Create Container Failed", err
}
- m.recorder.Eventf(ref, v1.EventTypeNormal, events.CreatedContainer, "Created container with id %v", containerID)
+ m.recordContainerEvent(pod, container, containerID, v1.EventTypeNormal, events.CreatedContainer, "Created container")
if ref != nil {
m.containerRefManager.SetRef(kubecontainer.ContainerID{
Type: m.runtimeName,
@@ -93,11 +119,10 @@ func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandb
// Step 3: start the container.
err = m.runtimeService.StartContainer(containerID)
if err != nil {
- m.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedToStartContainer,
- "Failed to start container with id %v with error: %v", containerID, err)
+ m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToStartContainer, "Error: %v", err)
return "Start Container Failed", err
}
- m.recorder.Eventf(ref, v1.EventTypeNormal, events.StartedContainer, "Started container with id %v", containerID)
+ m.recordContainerEvent(pod, container, containerID, v1.EventTypeNormal, events.StartedContainer, "Started container")
// Symlink container logs to the legacy container log location for cluster logging
// support.
@@ -120,8 +145,7 @@ func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandb
}
msg, handlerErr := m.runner.Run(kubeContainerID, pod, container, container.Lifecycle.PostStart)
if handlerErr != nil {
err := fmt.Errorf("PostStart handler: %v", handlerErr)
- m.generateContainerEvent(kubeContainerID, v1.EventTypeWarning, events.FailedPostStartHook, msg)
+ m.recordContainerEvent(pod, container, kubeContainerID.ID, v1.EventTypeWarning, events.FailedPostStartHook, msg)
m.killContainer(pod, kubeContainerID, container.Name, "FailedPostStartHook", nil)
return "PostStart Hook Failed", err
}
@@ -439,16 +463,6 @@ func toKubeContainerStatus(status *runtimeapi.ContainerStatus, runtimeName strin
return cStatus
}
- // generateContainerEvent generates an event for the container.
- func (m *kubeGenericRuntimeManager) generateContainerEvent(containerID kubecontainer.ContainerID, eventType, reason, message string) {
- ref, ok := m.containerRefManager.GetRef(containerID)
- if !ok {
- glog.Warningf("No ref for container %q", containerID)
- return
- }
- m.recorder.Event(events.ToObjectReference(ref), eventType, reason, message)
- }
// executePreStopHook runs the pre-stop lifecycle hooks if applicable and returns the duration it takes.
func (m *kubeGenericRuntimeManager) executePreStopHook(pod *v1.Pod, containerID kubecontainer.ContainerID, containerSpec *v1.Container, gracePeriod int64) int64 {
glog.V(3).Infof("Running preStop hook for container %q", containerID.String())
@@ -460,7 +474,7 @@ func (m *kubeGenericRuntimeManager) executePreStopHook(pod *v1.Pod, containerID
defer utilruntime.HandleCrash()
if msg, err := m.runner.Run(containerID, pod, containerSpec, containerSpec.Lifecycle.PreStop); err != nil {
glog.Errorf("preStop hook for container %q failed: %v", containerSpec.Name, err)
- m.generateContainerEvent(containerID, v1.EventTypeWarning, events.FailedPreStopHook, msg)
+ m.recordContainerEvent(pod, containerSpec, containerID.ID, v1.EventTypeWarning, events.FailedPreStopHook, msg)
}
}()
@@ -568,7 +582,7 @@ func (m *kubeGenericRuntimeManager) killContainer(pod *v1.Pod, containerID kubec
if reason != "" {
message = fmt.Sprint(message, ":", reason)
}
- m.generateContainerEvent(containerID, v1.EventTypeNormal, events.KillingContainer, message)
+ m.recordContainerEvent(pod, containerSpec, containerID.ID, v1.EventTypeNormal, events.KillingContainer, message)
m.containerRefManager.ClearRef(containerID)
return err

View File

@@ -180,7 +180,13 @@ func (p *podWorkers) managePodLoop(podUpdates <-chan UpdatePodOptions) {
}
if err != nil {
glog.Errorf("Error syncing pod %s (%q), skipping: %v", update.Pod.UID, format.Pod(update.Pod), err)
p.recorder.Eventf(update.Pod, v1.EventTypeWarning, events.FailedSync, "Error syncing pod, skipping: %v", err)
// if we failed sync, we throw more specific events for why it happened.
// as a result, i question the value of this event.
// TODO: determine if we can remove this in a future release.
// do not include descriptive text that can vary on why it failed so in a pathological
// scenario, kubelet does not create enough discrete events that miss default aggregation
// window.
p.recorder.Eventf(update.Pod, v1.EventTypeWarning, events.FailedSync, "Error syncing pod")
}
p.wrapUp(update.Pod.UID, err)
}
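
The FailedSync change follows the same reasoning as the comment above: event recorders aggregate events whose fields, including the message, match, so interpolating the sync error produces one distinct event per distinct error string, while a fixed message lets repeated failures collapse into a single counted event. A rough sketch of that effect, under the simplifying assumption that events are grouped purely by message (the recorder's real aggregation logic is more involved):

package main

import "fmt"

func main() {
	// Hypothetical sync failures for one pod; each produces a different error string.
	syncErrs := []string{
		"context deadline exceeded",
		`failed to pull image "nginx:1.13"`,
		"pod sandbox is not ready",
	}

	interpolated := map[string]int{} // old message: "Error syncing pod, skipping: <err>"
	fixed := map[string]int{}        // new message: "Error syncing pod"
	for _, e := range syncErrs {
		interpolated[fmt.Sprintf("Error syncing pod, skipping: %v", e)]++
		fixed["Error syncing pod"]++
	}

	// Grouping by message, the old form yields one event per distinct error,
	// while the new form collapses everything into a single event with a count.
	fmt.Printf("distinct FailedSync messages: %d interpolated vs %d fixed\n",
		len(interpolated), len(fixed))
}
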