add diagnostic for data mover exposer

Signed-off-by: Lyndon-Li <lyonghui@vmware.com>
pull/8482/head
Lyndon-Li 2024-12-05 15:03:33 +08:00
parent b607259563
commit 34e417bdac
7 changed files with 56 additions and 59 deletions

View File

@ -1 +1 @@
Fix issue #8125, add diagnostic info for data mover exposers when expose timeout
Fix issue #8125, log diagnostic info for data mover exposers when expose timeout

View File

@ -19,6 +19,7 @@ package controller
import (
"context"
"fmt"
"strings"
"time"
"github.com/pkg/errors"
@ -677,7 +678,10 @@ func (r *DataDownloadReconciler) onPrepareTimeout(ctx context.Context, dd *veler
return
}
log.Warn(r.restoreExposer.DiagnoseExpose(ctx, getDataDownloadOwnerObject(dd)))
diags := strings.Split(r.restoreExposer.DiagnoseExpose(ctx, getDataDownloadOwnerObject(dd)), "\n")
for _, diag := range diags {
log.Warnf("[Diagnose DD expose]%s", diag)
}
r.restoreExposer.CleanUp(ctx, getDataDownloadOwnerObject(dd))

View File

@ -19,6 +19,7 @@ package controller
import (
"context"
"fmt"
"strings"
"time"
snapshotter "github.com/kubernetes-csi/external-snapshotter/client/v7/clientset/versioned/typed/volumesnapshot/v1"
@ -755,7 +756,10 @@ func (r *DataUploadReconciler) onPrepareTimeout(ctx context.Context, du *velerov
volumeSnapshotName = du.Spec.CSISnapshot.VolumeSnapshot
}
log.Warn(ep.DiagnoseExpose(ctx, getOwnerObject(du)))
diags := strings.Split(ep.DiagnoseExpose(ctx, getOwnerObject(du)), "\n")
for _, diag := range diags {
log.Warnf("[Diagnose DU expose]%s", diag)
}
ep.CleanUp(ctx, getOwnerObject(du), volumeSnapshotName, du.Spec.SourceNamespace)

View File

@ -313,20 +313,23 @@ func (e *csiSnapshotExposer) DiagnoseExpose(ctx context.Context, ownerObject cor
backupPVCName := ownerObject.Name
backupVSName := ownerObject.Name
diag := fmt.Sprintf("***************************begin diagnose CSI exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name)
diag := "begin diagnose CSI exposer\n"
pod, err := e.kubeClient.CoreV1().Pods(ownerObject.Namespace).Get(ctx, backupPodName, metav1.GetOptions{})
if err != nil {
pod = nil
diag += fmt.Sprintf("error getting backup pod %s, err: %v\n", backupPodName, err)
}
pvc, err := e.kubeClient.CoreV1().PersistentVolumeClaims(ownerObject.Namespace).Get(ctx, backupPVCName, metav1.GetOptions{})
if err != nil {
pvc = nil
diag += fmt.Sprintf("error getting backup pvc %s, err: %v\n", backupPVCName, err)
}
vs, err := e.csiSnapshotClient.VolumeSnapshots(ownerObject.Namespace).Get(ctx, backupVSName, metav1.GetOptions{})
if err != nil {
vs = nil
diag += fmt.Sprintf("error getting backup vs %s, err: %v\n", backupVSName, err)
}
@ -335,7 +338,7 @@ func (e *csiSnapshotExposer) DiagnoseExpose(ctx context.Context, ownerObject cor
if pod.Spec.NodeName != "" {
if err := nodeagent.KbClientIsRunningInNode(ctx, ownerObject.Namespace, pod.Spec.NodeName, e.kubeClient); err != nil {
diag += fmt.Sprintf("node-agent is not running in node %s\n", pod.Spec.NodeName)
diag += fmt.Sprintf("node-agent is not running in node %s, err: %v\n", pod.Spec.NodeName, err)
}
}
}
@ -364,7 +367,7 @@ func (e *csiSnapshotExposer) DiagnoseExpose(ctx context.Context, ownerObject cor
}
}
diag += fmt.Sprintf("***************************end diagnose CSI exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name)
diag += "end diagnose CSI exposer"
return diag
}

View File

@ -1166,12 +1166,11 @@ func Test_csiSnapshotExposer_DiagnoseExpose(t *testing.T) {
{
name: "no pod, pvc, vs",
ownerBackup: backup,
expected: `***************************begin diagnose CSI exposer[velero/fake-backup]***************************
expected: `begin diagnose CSI exposer
error getting backup pod fake-backup, err: pods "fake-backup" not found
error getting backup pvc fake-backup, err: persistentvolumeclaims "fake-backup" not found
error getting backup vs fake-backup, err: volumesnapshots.snapshot.storage.k8s.io "fake-backup" not found
***************************end diagnose CSI exposer[velero/fake-backup]***************************
`,
end diagnose CSI exposer`,
},
{
name: "pod without node name, pvc without volume name, vs without status",
@ -1183,13 +1182,12 @@ error getting backup vs fake-backup, err: volumesnapshots.snapshot.storage.k8s.i
snapshotClientObj: []runtime.Object{
&backupVSWithoutStatus,
},
expected: `***************************begin diagnose CSI exposer[velero/fake-backup]***************************
expected: `begin diagnose CSI exposer
Pod velero/fake-backup, phase Pending, node name
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-backup, phase Pending, binding to
VS velero/fake-backup, bind to , readyToUse false, errMessage
***************************end diagnose CSI exposer[velero/fake-backup]***************************
`,
end diagnose CSI exposer`,
},
{
name: "pod without node name, pvc without volume name, vs without VSC",
@ -1201,13 +1199,12 @@ VS velero/fake-backup, bind to , readyToUse false, errMessage
snapshotClientObj: []runtime.Object{
&backupVSWithoutVSC,
},
expected: `***************************begin diagnose CSI exposer[velero/fake-backup]***************************
expected: `begin diagnose CSI exposer
Pod velero/fake-backup, phase Pending, node name
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-backup, phase Pending, binding to
VS velero/fake-backup, bind to , readyToUse false, errMessage
***************************end diagnose CSI exposer[velero/fake-backup]***************************
`,
end diagnose CSI exposer`,
},
{
name: "pod with node name, no node agent",
@ -1219,14 +1216,13 @@ VS velero/fake-backup, bind to , readyToUse false, errMessage
snapshotClientObj: []runtime.Object{
&backupVSWithoutVSC,
},
expected: `***************************begin diagnose CSI exposer[velero/fake-backup]***************************
expected: `begin diagnose CSI exposer
Pod velero/fake-backup, phase Pending, node name fake-node
Pod condition Initialized, status True, reason , message fake-pod-message
node-agent is not running in node fake-node
node-agent is not running in node fake-node, err: daemonset pod not found in running state in node fake-node
PVC velero/fake-backup, phase Pending, binding to
VS velero/fake-backup, bind to , readyToUse false, errMessage
***************************end diagnose CSI exposer[velero/fake-backup]***************************
`,
end diagnose CSI exposer`,
},
{
name: "pod with node name, node agent is running",
@ -1239,13 +1235,12 @@ VS velero/fake-backup, bind to , readyToUse false, errMessage
snapshotClientObj: []runtime.Object{
&backupVSWithoutVSC,
},
expected: `***************************begin diagnose CSI exposer[velero/fake-backup]***************************
expected: `begin diagnose CSI exposer
Pod velero/fake-backup, phase Pending, node name fake-node
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-backup, phase Pending, binding to
VS velero/fake-backup, bind to , readyToUse false, errMessage
***************************end diagnose CSI exposer[velero/fake-backup]***************************
`,
end diagnose CSI exposer`,
},
{
name: "pvc with volume name, no pv",
@ -1258,14 +1253,13 @@ VS velero/fake-backup, bind to , readyToUse false, errMessage
snapshotClientObj: []runtime.Object{
&backupVSWithoutVSC,
},
expected: `***************************begin diagnose CSI exposer[velero/fake-backup]***************************
expected: `begin diagnose CSI exposer
Pod velero/fake-backup, phase Pending, node name fake-node
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-backup, phase Pending, binding to fake-pv
error getting backup pv fake-pv, err: persistentvolumes "fake-pv" not found
VS velero/fake-backup, bind to , readyToUse false, errMessage
***************************end diagnose CSI exposer[velero/fake-backup]***************************
`,
end diagnose CSI exposer`,
},
{
name: "pvc with volume name, pv exists",
@ -1279,14 +1273,13 @@ VS velero/fake-backup, bind to , readyToUse false, errMessage
snapshotClientObj: []runtime.Object{
&backupVSWithoutVSC,
},
expected: `***************************begin diagnose CSI exposer[velero/fake-backup]***************************
expected: `begin diagnose CSI exposer
Pod velero/fake-backup, phase Pending, node name fake-node
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-backup, phase Pending, binding to fake-pv
PV fake-pv, phase Pending, reason , message fake-pv-message
VS velero/fake-backup, bind to , readyToUse false, errMessage
***************************end diagnose CSI exposer[velero/fake-backup]***************************
`,
end diagnose CSI exposer`,
},
{
name: "vs with vsc, vsc doesn't exist",
@ -1300,15 +1293,14 @@ VS velero/fake-backup, bind to , readyToUse false, errMessage
snapshotClientObj: []runtime.Object{
&backupVSWithVSC,
},
expected: `***************************begin diagnose CSI exposer[velero/fake-backup]***************************
expected: `begin diagnose CSI exposer
Pod velero/fake-backup, phase Pending, node name fake-node
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-backup, phase Pending, binding to fake-pv
PV fake-pv, phase Pending, reason , message fake-pv-message
VS velero/fake-backup, bind to fake-vsc, readyToUse false, errMessage fake-vs-message
error getting backup vsc fake-vsc, err: volumesnapshotcontents.snapshot.storage.k8s.io "fake-vsc" not found
***************************end diagnose CSI exposer[velero/fake-backup]***************************
`,
end diagnose CSI exposer`,
},
{
name: "vs with vsc, vsc exists",
@ -1323,15 +1315,14 @@ error getting backup vsc fake-vsc, err: volumesnapshotcontents.snapshot.storage.
&backupVSWithVSC,
&backupVSC,
},
expected: `***************************begin diagnose CSI exposer[velero/fake-backup]***************************
expected: `begin diagnose CSI exposer
Pod velero/fake-backup, phase Pending, node name fake-node
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-backup, phase Pending, binding to fake-pv
PV fake-pv, phase Pending, reason , message fake-pv-message
VS velero/fake-backup, bind to fake-vsc, readyToUse false, errMessage fake-vs-message
VSC fake-vsc, readyToUse false, errMessage fake-vsc-message, handle
***************************end diagnose CSI exposer[velero/fake-backup]***************************
`,
end diagnose CSI exposer`,
},
}
for _, tt := range tests {

View File

@ -204,15 +204,17 @@ func (e *genericRestoreExposer) DiagnoseExpose(ctx context.Context, ownerObject
restorePodName := ownerObject.Name
restorePVCName := ownerObject.Name
diag := fmt.Sprintf("***************************begin diagnose restore exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name)
diag := "begin diagnose restore exposer\n"
pod, err := e.kubeClient.CoreV1().Pods(ownerObject.Namespace).Get(ctx, restorePodName, metav1.GetOptions{})
if err != nil {
pod = nil
diag += fmt.Sprintf("error getting restore pod %s, err: %v\n", restorePodName, err)
}
pvc, err := e.kubeClient.CoreV1().PersistentVolumeClaims(ownerObject.Namespace).Get(ctx, restorePVCName, metav1.GetOptions{})
if err != nil {
pvc = nil
diag += fmt.Sprintf("error getting restore pvc %s, err: %v\n", restorePVCName, err)
}
@ -221,7 +223,7 @@ func (e *genericRestoreExposer) DiagnoseExpose(ctx context.Context, ownerObject
if pod.Spec.NodeName != "" {
if err := nodeagent.KbClientIsRunningInNode(ctx, ownerObject.Namespace, pod.Spec.NodeName, e.kubeClient); err != nil {
diag += fmt.Sprintf("node-agent is not running in node %s\n", pod.Spec.NodeName)
diag += fmt.Sprintf("node-agent is not running in node %s, err: %v\n", pod.Spec.NodeName, err)
}
}
}
@ -238,7 +240,7 @@ func (e *genericRestoreExposer) DiagnoseExpose(ctx context.Context, ownerObject
}
}
diag += fmt.Sprintf("***************************end diagnose restore exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name)
diag += "end diagnose restore exposer"
return diag
}

View File

@ -646,11 +646,10 @@ func Test_ReastoreDiagnoseExpose(t *testing.T) {
{
name: "no pod, pvc",
ownerRestore: restore,
expected: `***************************begin diagnose restore exposer[velero/fake-restore]***************************
expected: `begin diagnose restore exposer
error getting restore pod fake-restore, err: pods "fake-restore" not found
error getting restore pvc fake-restore, err: persistentvolumeclaims "fake-restore" not found
***************************end diagnose restore exposer[velero/fake-restore]***************************
`,
end diagnose restore exposer`,
},
{
name: "pod without node name, pvc without volume name, vs without status",
@ -659,12 +658,11 @@ error getting restore pvc fake-restore, err: persistentvolumeclaims "fake-restor
&restorePodWithoutNodeName,
&restorePVCWithoutVolumeName,
},
expected: `***************************begin diagnose restore exposer[velero/fake-restore]***************************
expected: `begin diagnose restore exposer
Pod velero/fake-restore, phase Pending, node name
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-restore, phase Pending, binding to
***************************end diagnose restore exposer[velero/fake-restore]***************************
`,
end diagnose restore exposer`,
},
{
name: "pod without node name, pvc without volume name",
@ -673,12 +671,11 @@ PVC velero/fake-restore, phase Pending, binding to
&restorePodWithoutNodeName,
&restorePVCWithoutVolumeName,
},
expected: `***************************begin diagnose restore exposer[velero/fake-restore]***************************
expected: `begin diagnose restore exposer
Pod velero/fake-restore, phase Pending, node name
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-restore, phase Pending, binding to
***************************end diagnose restore exposer[velero/fake-restore]***************************
`,
end diagnose restore exposer`,
},
{
name: "pod with node name, no node agent",
@ -687,13 +684,12 @@ PVC velero/fake-restore, phase Pending, binding to
&restorePodWithNodeName,
&restorePVCWithoutVolumeName,
},
expected: `***************************begin diagnose restore exposer[velero/fake-restore]***************************
expected: `begin diagnose restore exposer
Pod velero/fake-restore, phase Pending, node name fake-node
Pod condition Initialized, status True, reason , message fake-pod-message
node-agent is not running in node fake-node
node-agent is not running in node fake-node, err: daemonset pod not found in running state in node fake-node
PVC velero/fake-restore, phase Pending, binding to
***************************end diagnose restore exposer[velero/fake-restore]***************************
`,
end diagnose restore exposer`,
},
{
name: "pod with node name, node agent is running",
@ -703,12 +699,11 @@ PVC velero/fake-restore, phase Pending, binding to
&restorePVCWithoutVolumeName,
&nodeAgentPod,
},
expected: `***************************begin diagnose restore exposer[velero/fake-restore]***************************
expected: `begin diagnose restore exposer
Pod velero/fake-restore, phase Pending, node name fake-node
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-restore, phase Pending, binding to
***************************end diagnose restore exposer[velero/fake-restore]***************************
`,
end diagnose restore exposer`,
},
{
name: "pvc with volume name, no pv",
@ -718,13 +713,12 @@ PVC velero/fake-restore, phase Pending, binding to
&restorePVCWithVolumeName,
&nodeAgentPod,
},
expected: `***************************begin diagnose restore exposer[velero/fake-restore]***************************
expected: `begin diagnose restore exposer
Pod velero/fake-restore, phase Pending, node name fake-node
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-restore, phase Pending, binding to fake-pv
error getting restore pv fake-pv, err: persistentvolumes "fake-pv" not found
***************************end diagnose restore exposer[velero/fake-restore]***************************
`,
end diagnose restore exposer`,
},
{
name: "pvc with volume name, pv exists",
@ -735,13 +729,12 @@ error getting restore pv fake-pv, err: persistentvolumes "fake-pv" not found
&restorePV,
&nodeAgentPod,
},
expected: `***************************begin diagnose restore exposer[velero/fake-restore]***************************
expected: `begin diagnose restore exposer
Pod velero/fake-restore, phase Pending, node name fake-node
Pod condition Initialized, status True, reason , message fake-pod-message
PVC velero/fake-restore, phase Pending, binding to fake-pv
PV fake-pv, phase Pending, reason , message fake-pv-message
***************************end diagnose restore exposer[velero/fake-restore]***************************
`,
end diagnose restore exposer`,
},
}
for _, test := range tests {