add diagnostic for data mover exposer

Signed-off-by: Lyndon-Li <lyonghui@vmware.com>
pull/8511/head
Lyndon-Li 2024-12-04 10:28:50 +08:00
parent bda3ec1bc4
commit 8087c7f13a
13 changed files with 235 additions and 2 deletions


@@ -689,6 +689,8 @@ func (r *DataDownloadReconciler) onPrepareTimeout(ctx context.Context, dd *veler
		return
	}

	log.Warn(r.restoreExposer.DiagnoseExpose(ctx, getDataDownloadOwnerObject(dd)))

	r.restoreExposer.CleanUp(ctx, getDataDownloadOwnerObject(dd))

	log.Info("Datadownload has been cleaned up")


@@ -971,6 +971,10 @@ func (dt *ddResumeTestHelper) PeekExposed(context.Context, corev1.ObjectReferenc
	return nil
}

func (dt *ddResumeTestHelper) DiagnoseExpose(context.Context, corev1.ObjectReference) string {
	return ""
}

func (dt *ddResumeTestHelper) RebindVolume(context.Context, corev1.ObjectReference, string, string, time.Duration) error {
	return nil
}


@@ -758,6 +758,8 @@ func (r *DataUploadReconciler) onPrepareTimeout(ctx context.Context, du *velerov
		volumeSnapshotName = du.Spec.CSISnapshot.VolumeSnapshot
	}

	log.Warn(ep.DiagnoseExpose(ctx, getOwnerObject(du)))

	ep.CleanUp(ctx, getOwnerObject(du), volumeSnapshotName, du.Spec.SourceNamespace)

	log.Info("Dataupload has been cleaned up")


@@ -300,6 +300,10 @@ func (f *fakeSnapshotExposer) PeekExposed(ctx context.Context, ownerObject corev
	return f.peekErr
}

func (f *fakeSnapshotExposer) DiagnoseExpose(context.Context, corev1.ObjectReference) string {
	return ""
}

func (f *fakeSnapshotExposer) CleanUp(context.Context, corev1.ObjectReference, string, string) {
}
@@ -1043,6 +1047,10 @@ func (dt *duResumeTestHelper) PeekExposed(context.Context, corev1.ObjectReferenc
	return nil
}

func (dt *duResumeTestHelper) DiagnoseExpose(context.Context, corev1.ObjectReference) string {
	return ""
}

func (dt *duResumeTestHelper) CleanUp(context.Context, corev1.ObjectReference, string, string) {}

func (dt *duResumeTestHelper) newMicroServiceBRWatcher(kbclient.Client, kubernetes.Interface, manager.Manager, string, string, string, string, string, string,


@@ -308,6 +308,67 @@ func (e *csiSnapshotExposer) PeekExposed(ctx context.Context, ownerObject corev1
	return nil
}

func (e *csiSnapshotExposer) DiagnoseExpose(ctx context.Context, ownerObject corev1.ObjectReference) string {
	backupPodName := ownerObject.Name
	backupPVCName := ownerObject.Name
	backupVSName := ownerObject.Name

	diag := fmt.Sprintf("***************************begin diagnose CSI exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name)

	pod, err := e.kubeClient.CoreV1().Pods(ownerObject.Namespace).Get(ctx, backupPodName, metav1.GetOptions{})
	if err != nil {
		pod = nil
		diag += fmt.Sprintf("error getting backup pod %s, err: %v\n", backupPodName, err)
	}

	pvc, err := e.kubeClient.CoreV1().PersistentVolumeClaims(ownerObject.Namespace).Get(ctx, backupPVCName, metav1.GetOptions{})
	if err != nil {
		pvc = nil
		diag += fmt.Sprintf("error getting backup pvc %s, err: %v\n", backupPVCName, err)
	}

	vs, err := e.csiSnapshotClient.VolumeSnapshots(ownerObject.Namespace).Get(ctx, backupVSName, metav1.GetOptions{})
	if err != nil {
		vs = nil
		diag += fmt.Sprintf("error getting backup vs %s, err: %v\n", backupVSName, err)
	}

	if pod != nil {
		diag += kube.DiagnosePod(pod)

		if pod.Spec.NodeName != "" {
			if err := nodeagent.KbClientIsRunningInNode(ctx, ownerObject.Namespace, pod.Spec.NodeName, e.kubeClient); err != nil {
				diag += fmt.Sprintf("node-agent is not running in node %s, err: %v\n", pod.Spec.NodeName, err)
			}
		}
	}

	if pvc != nil {
		diag += kube.DiagnosePVC(pvc)

		if pvc.Spec.VolumeName != "" {
			if pv, err := e.kubeClient.CoreV1().PersistentVolumes().Get(ctx, pvc.Spec.VolumeName, metav1.GetOptions{}); err != nil {
				diag += fmt.Sprintf("error getting backup pv %s, err: %v\n", pvc.Spec.VolumeName, err)
			} else {
				diag += kube.DiagnosePV(pv)
			}
		}
	}

	if vs != nil {
		diag += csi.DiagnoseVS(vs)

		if vs.Status != nil && vs.Status.BoundVolumeSnapshotContentName != nil && *vs.Status.BoundVolumeSnapshotContentName != "" {
			if vsc, err := e.csiSnapshotClient.VolumeSnapshotContents().Get(ctx, *vs.Status.BoundVolumeSnapshotContentName, metav1.GetOptions{}); err != nil {
				diag += fmt.Sprintf("error getting backup vsc %s, err: %v\n", *vs.Status.BoundVolumeSnapshotContentName, err)
			} else {
				diag += csi.DiagnoseVSC(vsc)
			}
		}
	}

	diag += fmt.Sprintf("***************************end diagnose CSI exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name)

	return diag
}
const cleanUpTimeout = time.Minute
func (e *csiSnapshotExposer) CleanUp(ctx context.Context, ownerObject corev1.ObjectReference, vsName string, sourceNamespace string) {
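Since the method is best-effort (every lookup failure is appended to the report instead of aborting), it is easy to exercise with fake clients. A minimal in-package test sketch, not part of this diff; the field names `kubeClient` and `csiSnapshotClient` come from the method above, while the test name, object names, and the external-snapshotter fake import path/version are assumptions:

	import (
		"context"
		"testing"

		snapshotfake "github.com/kubernetes-csi/external-snapshotter/client/v7/clientset/versioned/fake"
		corev1 "k8s.io/api/core/v1"
		metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
		"k8s.io/client-go/kubernetes/fake"
	)

	func TestDiagnoseExposeSketch(t *testing.T) {
		// only the backup pod exists; the PVC and VS lookups fail and must
		// show up as "error getting ..." lines rather than panics
		pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Namespace: "velero", Name: "fake-backup"}}
		e := &csiSnapshotExposer{
			kubeClient:        fake.NewSimpleClientset(pod),
			csiSnapshotClient: snapshotfake.NewSimpleClientset().SnapshotV1(),
		}

		diag := e.DiagnoseExpose(context.Background(), corev1.ObjectReference{Namespace: "velero", Name: "fake-backup"})
		t.Log(diag)
	}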


@@ -30,6 +30,7 @@ import (
"k8s.io/client-go/kubernetes"
"sigs.k8s.io/controller-runtime/pkg/client"
"github.com/vmware-tanzu/velero/pkg/nodeagent"
"github.com/vmware-tanzu/velero/pkg/util/boolptr"
"github.com/vmware-tanzu/velero/pkg/util/kube"
)
@@ -49,6 +50,10 @@ type GenericRestoreExposer interface {
	// Otherwise, it returns nil immediately.
	PeekExposed(context.Context, corev1.ObjectReference) error

	// DiagnoseExpose generates diagnostic info when the expose has not completed for a long time.
	// If it finds any problem, it returns a string describing the problem.
	DiagnoseExpose(context.Context, corev1.ObjectReference) string

	// RebindVolume unexposes the restored PV and rebinds it to the target PVC
	RebindVolume(context.Context, corev1.ObjectReference, string, string, time.Duration) error
@@ -195,6 +200,49 @@ func (e *genericRestoreExposer) PeekExposed(ctx context.Context, ownerObject cor
	return nil
}

func (e *genericRestoreExposer) DiagnoseExpose(ctx context.Context, ownerObject corev1.ObjectReference) string {
	restorePodName := ownerObject.Name
	restorePVCName := ownerObject.Name

	diag := fmt.Sprintf("***************************begin diagnose restore exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name)

	pod, err := e.kubeClient.CoreV1().Pods(ownerObject.Namespace).Get(ctx, restorePodName, metav1.GetOptions{})
	if err != nil {
		pod = nil
		diag += fmt.Sprintf("error getting restore pod %s, err: %v\n", restorePodName, err)
	}

	pvc, err := e.kubeClient.CoreV1().PersistentVolumeClaims(ownerObject.Namespace).Get(ctx, restorePVCName, metav1.GetOptions{})
	if err != nil {
		pvc = nil
		diag += fmt.Sprintf("error getting restore pvc %s, err: %v\n", restorePVCName, err)
	}

	if pod != nil {
		diag += kube.DiagnosePod(pod)

		if pod.Spec.NodeName != "" {
			if err := nodeagent.KbClientIsRunningInNode(ctx, ownerObject.Namespace, pod.Spec.NodeName, e.kubeClient); err != nil {
				diag += fmt.Sprintf("node-agent is not running in node %s, err: %v\n", pod.Spec.NodeName, err)
			}
		}
	}

	if pvc != nil {
		diag += kube.DiagnosePVC(pvc)

		if pvc.Spec.VolumeName != "" {
			if pv, err := e.kubeClient.CoreV1().PersistentVolumes().Get(ctx, pvc.Spec.VolumeName, metav1.GetOptions{}); err != nil {
				diag += fmt.Sprintf("error getting restore pv %s, err: %v\n", pvc.Spec.VolumeName, err)
			} else {
				diag += kube.DiagnosePV(pv)
			}
		}
	}

	diag += fmt.Sprintf("***************************end diagnose restore exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name)

	return diag
}
func (e *genericRestoreExposer) CleanUp(ctx context.Context, ownerObject corev1.ObjectReference) {
	restorePodName := ownerObject.Name
	restorePVCName := ownerObject.Name
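For reference, a healthy restore expose yields a report shaped roughly like this, assembled from the Sprintf formats above (values illustrative; empty optional fields print as blanks):

	***************************begin diagnose restore exposer[velero/fake-restore]***************************
	Pod velero/fake-restore, phase Running, node name fake-node
	Pod condition Ready, reason , message 
	PVC velero/fake-restore, phase Bound
	PVC is bound to fake-pv
	PV fake-pv, phase Bound, reason , message 
	***************************end diagnose restore exposer[velero/fake-restore]***************************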


@@ -26,6 +26,24 @@ func (_m *GenericRestoreExposer) CleanUp(_a0 context.Context, _a1 v1.ObjectRefer
	_m.Called(_a0, _a1)
}

// DiagnoseExpose provides a mock function with given fields: _a0, _a1
func (_m *GenericRestoreExposer) DiagnoseExpose(_a0 context.Context, _a1 v1.ObjectReference) string {
	ret := _m.Called(_a0, _a1)

	if len(ret) == 0 {
		panic("no return value specified for DiagnoseExpose")
	}

	var r0 string
	if rf, ok := ret.Get(0).(func(context.Context, v1.ObjectReference) string); ok {
		r0 = rf(_a0, _a1)
	} else {
		r0 = ret.Get(0).(string)
	}

	return r0
}

// Expose provides a mock function with given fields: _a0, _a1, _a2, _a3, _a4, _a5, _a6
func (_m *GenericRestoreExposer) Expose(_a0 context.Context, _a1 v1.ObjectReference, _a2 string, _a3 string, _a4 map[string]string, _a5 v1.ResourceRequirements, _a6 time.Duration) error {
	ret := _m.Called(_a0, _a1, _a2, _a3, _a4, _a5, _a6)
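For tests consuming the generated mock above, programming a canned diagnostic is one line with testify (a sketch; assumes the mockery-generated type embeds testify's mock.Mock, and "fake-diagnostic" is an illustrative value):

	// in a test body (t *testing.T); mock.Anything is testify's wildcard matcher
	exposer := new(GenericRestoreExposer)
	exposer.On("DiagnoseExpose", mock.Anything, mock.Anything).Return("fake-diagnostic")

	// exercise the code under test, then verify it actually asked for diagnostics
	exposer.AssertCalled(t, "DiagnoseExpose", mock.Anything, mock.Anything)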


@@ -37,6 +37,10 @@ type SnapshotExposer interface {
	// Otherwise, it returns nil immediately.
	PeekExposed(context.Context, corev1.ObjectReference) error

	// DiagnoseExpose generates diagnostic info when the expose has not completed for a long time.
	// If it finds any problem, it returns a string describing the problem.
	DiagnoseExpose(context.Context, corev1.ObjectReference) string

	// CleanUp cleans up any objects generated during the snapshot expose
	CleanUp(context.Context, corev1.ObjectReference, string, string)
}
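Because DiagnoseExpose is added to the exported SnapshotExposer and GenericRestoreExposer interfaces, every implementation (including out-of-tree ones) must now provide it; the fakes in this change simply return an empty string. A minimal compliant stub, with a hypothetical type name:

	type noopExposer struct{}

	// DiagnoseExpose satisfies the new interface method with an empty report.
	func (noopExposer) DiagnoseExpose(context.Context, corev1.ObjectReference) string {
		return ""
	}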


@@ -100,8 +100,17 @@ func IsRunning(ctx context.Context, kubeClient kubernetes.Interface, namespace s
	}
}
// KbClientIsRunningInNode checks, through a kube client, whether the node-agent pod is running properly in the specified node. If not, it returns the error found.
func KbClientIsRunningInNode(ctx context.Context, namespace string, nodeName string, kubeClient kubernetes.Interface) error {
	return isRunningInNode(ctx, namespace, nodeName, nil, kubeClient)
}

// IsRunningInNode checks, through a controller-runtime client, whether the node-agent pod is running properly in the specified node. If not, it returns the error found.
func IsRunningInNode(ctx context.Context, namespace string, nodeName string, crClient ctrlclient.Client) error {
	return isRunningInNode(ctx, namespace, nodeName, crClient, nil)
}

func isRunningInNode(ctx context.Context, namespace string, nodeName string, crClient ctrlclient.Client, kubeClient kubernetes.Interface) error {
	if nodeName == "" {
		return errors.New("node name is empty")
	}
@@ -112,7 +121,12 @@ func IsRunningInNode(ctx context.Context, namespace string, nodeName string, crC
		return errors.Wrap(err, "fail to parse selector")
	}

	if crClient != nil {
		err = crClient.List(ctx, pods, &ctrlclient.ListOptions{LabelSelector: parsedSelector})
	} else {
		pods, err = kubeClient.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: parsedSelector.String()})
	}

	if err != nil {
		return errors.Wrap(err, "failed to list daemonset pods")
	}
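Callers pick the wrapper matching the client flavor they already hold; both funnel into the same check. A sketch, where ctx, nodeName, crClient, kubeClient, and log are hypothetical caller-side variables and "velero" stands for the node-agent namespace:

	// with a controller-runtime client (pre-existing callers, e.g. reconcilers):
	if err := nodeagent.IsRunningInNode(ctx, "velero", nodeName, crClient); err != nil {
		log.WithError(err).Warn("node-agent not running on node")
	}

	// with a plain client-go clientset (new; what the exposers' DiagnoseExpose uses):
	if err := nodeagent.KbClientIsRunningInNode(ctx, "velero", nodeName, kubeClient); err != nil {
		log.WithError(err).Warn("node-agent not running on node")
	}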


@@ -773,3 +773,45 @@ func WaitUntilVSCHandleIsReady(
	return vsc, nil
}
func DiagnoseVS(vs *snapshotv1api.VolumeSnapshot) string {
	vscName := ""
	readyToUse := false
	errMessage := ""

	if vs.Status != nil {
		if vs.Status.BoundVolumeSnapshotContentName != nil {
			vscName = *vs.Status.BoundVolumeSnapshotContentName
		}

		if vs.Status.ReadyToUse != nil {
			readyToUse = *vs.Status.ReadyToUse
		}

		if vs.Status.Error != nil && vs.Status.Error.Message != nil {
			errMessage = *vs.Status.Error.Message
		}
	}

	diag := fmt.Sprintf("VS %s/%s, bound to %s, readyToUse %v, errMessage %s\n", vs.Namespace, vs.Name, vscName, readyToUse, errMessage)

	return diag
}

func DiagnoseVSC(vsc *snapshotv1api.VolumeSnapshotContent) string {
	handle := ""
	readyToUse := false
	errMessage := ""

	if vsc.Status != nil {
		if vsc.Status.SnapshotHandle != nil {
			handle = *vsc.Status.SnapshotHandle
		}

		if vsc.Status.ReadyToUse != nil {
			readyToUse = *vsc.Status.ReadyToUse
		}

		if vsc.Status.Error != nil && vsc.Status.Error.Message != nil {
			errMessage = *vsc.Status.Error.Message
		}
	}

	diag := fmt.Sprintf("VSC %s, readyToUse %v, errMessage %s, handle %s\n", vsc.Name, readyToUse, errMessage, handle)

	return diag
}
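Both helpers fold optional status fields into a single line per object, so nil pointers never leak into the report and the output stays grep-friendly. A usage sketch with illustrative values (same package, so DiagnoseVS is unqualified):

	vscName := "fake-vsc"
	vs := &snapshotv1api.VolumeSnapshot{
		ObjectMeta: metav1.ObjectMeta{Namespace: "velero", Name: "fake-vs"},
		Status:     &snapshotv1api.VolumeSnapshotStatus{BoundVolumeSnapshotContentName: &vscName},
	}
	fmt.Print(DiagnoseVS(vs))
	// prints: VS velero/fake-vs, bound to fake-vsc, readyToUse false, errMessage 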


@@ -257,3 +257,13 @@ func ToSystemAffinity(loadAffinities []*LoadAffinity) *corev1api.Affinity {
	return nil
}
func DiagnosePod(pod *corev1api.Pod) string {
	diag := fmt.Sprintf("Pod %s/%s, phase %s, node name %s\n", pod.Namespace, pod.Name, pod.Status.Phase, pod.Spec.NodeName)

	for _, condition := range pod.Status.Conditions {
		diag += fmt.Sprintf("Pod condition %s, reason %s, message %s\n", condition.Type, condition.Reason, condition.Message)
	}

	return diag
}
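Same one-line-per-object shape as the PVC/PV helpers below; a quick sketch (illustrative pod; a pod with no conditions yields no condition lines):

	pod := &corev1api.Pod{
		ObjectMeta: metav1.ObjectMeta{Namespace: "velero", Name: "fake-pod"},
		Spec:       corev1api.PodSpec{NodeName: "fake-node"},
		Status:     corev1api.PodStatus{Phase: corev1api.PodRunning},
	}
	fmt.Print(DiagnosePod(pod))
	// prints: Pod velero/fake-pod, phase Running, node name fake-node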


@@ -412,3 +412,20 @@ func GetPVCForPodVolume(vol *corev1api.Volume, pod *corev1api.Pod, crClient crcl
	return pvc, nil
}
func DiagnosePVC(pvc *corev1api.PersistentVolumeClaim) string {
	diag := fmt.Sprintf("PVC %s/%s, phase %s\n", pvc.Namespace, pvc.Name, pvc.Status.Phase)

	for _, condition := range pvc.Status.Conditions {
		diag += fmt.Sprintf("PVC condition %s, reason %s, message %s\n", condition.Type, condition.Reason, condition.Message)
	}

	diag += fmt.Sprintf("PVC is bound to %s\n", pvc.Spec.VolumeName)

	return diag
}

func DiagnosePV(pv *corev1api.PersistentVolume) string {
	diag := fmt.Sprintf("PV %s, phase %s, reason %s, message %s\n", pv.Name, pv.Status.Phase, pv.Status.Reason, pv.Status.Message)
	return diag
}


@@ -1463,3 +1463,6 @@ func TestMakePodPVCAttachment(t *testing.T) {
		})
	}
}
func TestDiagnosePVC(t *testing.T) {
	pvc := &corev1api.PersistentVolumeClaim{ObjectMeta: metav1.ObjectMeta{Namespace: "velero", Name: "fake-pvc"}, Spec: corev1api.PersistentVolumeClaimSpec{VolumeName: "fake-pv"}, Status: corev1api.PersistentVolumeClaimStatus{Phase: corev1api.ClaimBound}}
	assert.Equal(t, "PVC velero/fake-pvc, phase Bound\nPVC is bound to fake-pv\n", DiagnosePVC(pvc))
}