add diagnostic for data mover exposer
Signed-off-by: Lyndon-Li <lyonghui@vmware.com>pull/8511/head
parent
bda3ec1bc4
commit
8087c7f13a
|
@ -689,6 +689,8 @@ func (r *DataDownloadReconciler) onPrepareTimeout(ctx context.Context, dd *veler
|
|||
return
|
||||
}
|
||||
|
||||
log.Warn(r.restoreExposer.DiagnoseExpose(ctx, getDataDownloadOwnerObject(dd)))
|
||||
|
||||
r.restoreExposer.CleanUp(ctx, getDataDownloadOwnerObject(dd))
|
||||
|
||||
log.Info("Dataupload has been cleaned up")
|
||||
|
|
|
@ -971,6 +971,10 @@ func (dt *ddResumeTestHelper) PeekExposed(context.Context, corev1.ObjectReferenc
|
|||
return nil
|
||||
}
|
||||
|
||||
func (dt *ddResumeTestHelper) DiagnoseExpose(context.Context, corev1.ObjectReference) string {
|
||||
return ""
|
||||
}
|
||||
|
||||
func (dt *ddResumeTestHelper) RebindVolume(context.Context, corev1.ObjectReference, string, string, time.Duration) error {
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -758,6 +758,8 @@ func (r *DataUploadReconciler) onPrepareTimeout(ctx context.Context, du *velerov
|
|||
volumeSnapshotName = du.Spec.CSISnapshot.VolumeSnapshot
|
||||
}
|
||||
|
||||
log.Warn(ep.DiagnoseExpose(ctx, getOwnerObject(du)))
|
||||
|
||||
ep.CleanUp(ctx, getOwnerObject(du), volumeSnapshotName, du.Spec.SourceNamespace)
|
||||
|
||||
log.Info("Dataupload has been cleaned up")
|
||||
|
|
|
@ -300,6 +300,10 @@ func (f *fakeSnapshotExposer) PeekExposed(ctx context.Context, ownerObject corev
|
|||
return f.peekErr
|
||||
}
|
||||
|
||||
func (f *fakeSnapshotExposer) DiagnoseExpose(context.Context, corev1.ObjectReference) string {
|
||||
return ""
|
||||
}
|
||||
|
||||
func (f *fakeSnapshotExposer) CleanUp(context.Context, corev1.ObjectReference, string, string) {
|
||||
}
|
||||
|
||||
|
@ -1043,6 +1047,10 @@ func (dt *duResumeTestHelper) PeekExposed(context.Context, corev1.ObjectReferenc
|
|||
return nil
|
||||
}
|
||||
|
||||
func (dt *duResumeTestHelper) DiagnoseExpose(context.Context, corev1.ObjectReference) string {
|
||||
return ""
|
||||
}
|
||||
|
||||
func (dt *duResumeTestHelper) CleanUp(context.Context, corev1.ObjectReference, string, string) {}
|
||||
|
||||
func (dt *duResumeTestHelper) newMicroServiceBRWatcher(kbclient.Client, kubernetes.Interface, manager.Manager, string, string, string, string, string, string,
|
||||
|
|
|
@ -308,6 +308,67 @@ func (e *csiSnapshotExposer) PeekExposed(ctx context.Context, ownerObject corev1
|
|||
return nil
|
||||
}
|
||||
|
||||
func (e *csiSnapshotExposer) DiagnoseExpose(ctx context.Context, ownerObject corev1.ObjectReference) string {
|
||||
backupPodName := ownerObject.Name
|
||||
backupPVCName := ownerObject.Name
|
||||
backupVSName := ownerObject.Name
|
||||
|
||||
diag := fmt.Sprintf("***************************begin diagnose CSI exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name)
|
||||
|
||||
pod, err := e.kubeClient.CoreV1().Pods(ownerObject.Namespace).Get(ctx, backupPodName, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
diag += fmt.Sprintf("error getting backup pod %s, err: %v\n", backupPodName, err)
|
||||
}
|
||||
|
||||
pvc, err := e.kubeClient.CoreV1().PersistentVolumeClaims(ownerObject.Namespace).Get(ctx, backupPVCName, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
diag += fmt.Sprintf("error getting backup pvc %s, err: %v\n", backupPVCName, err)
|
||||
}
|
||||
|
||||
vs, err := e.csiSnapshotClient.VolumeSnapshots(ownerObject.Namespace).Get(ctx, backupVSName, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
diag += fmt.Sprintf("error getting backup vs %s, err: %v\n", backupVSName, err)
|
||||
}
|
||||
|
||||
if pod != nil {
|
||||
diag += kube.DiagnosePod(pod)
|
||||
|
||||
if pod.Spec.NodeName != "" {
|
||||
if err := nodeagent.KbClientIsRunningInNode(ctx, ownerObject.Namespace, pod.Spec.NodeName, e.kubeClient); err != nil {
|
||||
diag += fmt.Sprintf("node-agent is not running in node %s\n", pod.Spec.NodeName)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if pvc != nil {
|
||||
diag += kube.DiagnosePVC(pvc)
|
||||
|
||||
if pvc.Spec.VolumeName != "" {
|
||||
if pv, err := e.kubeClient.CoreV1().PersistentVolumes().Get(ctx, pvc.Spec.VolumeName, metav1.GetOptions{}); err != nil {
|
||||
diag += fmt.Sprintf("error getting backup pv %s, err: %v\n", pvc.Spec.VolumeName, err)
|
||||
} else {
|
||||
diag += kube.DiagnosePV(pv)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if vs != nil {
|
||||
diag += csi.DiagnoseVS(vs)
|
||||
|
||||
if vs.Status.BoundVolumeSnapshotContentName != nil && *vs.Status.BoundVolumeSnapshotContentName != "" {
|
||||
if vsc, err := e.csiSnapshotClient.VolumeSnapshotContents().Get(ctx, *vs.Status.BoundVolumeSnapshotContentName, metav1.GetOptions{}); err != nil {
|
||||
diag += fmt.Sprintf("error getting backup vsc %s, err: %v\n", *vs.Status.BoundVolumeSnapshotContentName, err)
|
||||
} else {
|
||||
diag += csi.DiagnoseVSC(vsc)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
diag += fmt.Sprintf("***************************end diagnose CSI exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name)
|
||||
|
||||
return diag
|
||||
}
|
||||
|
||||
const cleanUpTimeout = time.Minute
|
||||
|
||||
func (e *csiSnapshotExposer) CleanUp(ctx context.Context, ownerObject corev1.ObjectReference, vsName string, sourceNamespace string) {
|
||||
|
|
|
@ -30,6 +30,7 @@ import (
|
|||
"k8s.io/client-go/kubernetes"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
|
||||
"github.com/vmware-tanzu/velero/pkg/nodeagent"
|
||||
"github.com/vmware-tanzu/velero/pkg/util/boolptr"
|
||||
"github.com/vmware-tanzu/velero/pkg/util/kube"
|
||||
)
|
||||
|
@ -49,6 +50,10 @@ type GenericRestoreExposer interface {
|
|||
// Otherwise, it returns nil immediately.
|
||||
PeekExposed(context.Context, corev1.ObjectReference) error
|
||||
|
||||
// DiagnoseExpose generate the diagnostic info when the expose is not finished for a long time.
|
||||
// If it finds any problem, it returns an string about the problem.
|
||||
DiagnoseExpose(context.Context, corev1.ObjectReference) string
|
||||
|
||||
// RebindVolume unexposes the restored PV and rebind it to the target PVC
|
||||
RebindVolume(context.Context, corev1.ObjectReference, string, string, time.Duration) error
|
||||
|
||||
|
@ -195,6 +200,49 @@ func (e *genericRestoreExposer) PeekExposed(ctx context.Context, ownerObject cor
|
|||
return nil
|
||||
}
|
||||
|
||||
func (e *genericRestoreExposer) DiagnoseExpose(ctx context.Context, ownerObject corev1.ObjectReference) string {
|
||||
restorePodName := ownerObject.Name
|
||||
restorePVCName := ownerObject.Name
|
||||
|
||||
diag := fmt.Sprintf("***************************begin diagnose restore exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name)
|
||||
|
||||
pod, err := e.kubeClient.CoreV1().Pods(ownerObject.Namespace).Get(ctx, restorePodName, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
diag += fmt.Sprintf("error to get restore pod %s, err: %v\n", restorePodName, err)
|
||||
}
|
||||
|
||||
pvc, err := e.kubeClient.CoreV1().PersistentVolumeClaims(ownerObject.Namespace).Get(ctx, restorePVCName, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
diag += fmt.Sprintf("error to get restore pvc %s, err: %v\n", restorePVCName, err)
|
||||
}
|
||||
|
||||
if pod != nil {
|
||||
diag += kube.DiagnosePod(pod)
|
||||
|
||||
if pod.Spec.NodeName != "" {
|
||||
if err := nodeagent.KbClientIsRunningInNode(ctx, ownerObject.Namespace, pod.Spec.NodeName, e.kubeClient); err != nil {
|
||||
diag += fmt.Sprintf("node-agent is not running in node %s\n", pod.Spec.NodeName)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if pvc != nil {
|
||||
diag += kube.DiagnosePVC(pvc)
|
||||
|
||||
if pvc.Spec.VolumeName != "" {
|
||||
if pv, err := e.kubeClient.CoreV1().PersistentVolumes().Get(ctx, pvc.Spec.VolumeName, metav1.GetOptions{}); err != nil {
|
||||
diag += fmt.Sprintf("error getting backup pv %s, err: %v\n", pvc.Spec.VolumeName, err)
|
||||
} else {
|
||||
diag += kube.DiagnosePV(pv)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
diag += fmt.Sprintf("***************************end diagnose restore exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name)
|
||||
|
||||
return diag
|
||||
}
|
||||
|
||||
func (e *genericRestoreExposer) CleanUp(ctx context.Context, ownerObject corev1.ObjectReference) {
|
||||
restorePodName := ownerObject.Name
|
||||
restorePVCName := ownerObject.Name
|
||||
|
|
|
@ -26,6 +26,24 @@ func (_m *GenericRestoreExposer) CleanUp(_a0 context.Context, _a1 v1.ObjectRefer
|
|||
_m.Called(_a0, _a1)
|
||||
}
|
||||
|
||||
// DiagnoseExpose provides a mock function with given fields: _a0, _a1
|
||||
func (_m *GenericRestoreExposer) DiagnoseExpose(_a0 context.Context, _a1 v1.ObjectReference) string {
|
||||
ret := _m.Called(_a0, _a1)
|
||||
|
||||
if len(ret) == 0 {
|
||||
panic("no return value specified for DiagnoseExpose")
|
||||
}
|
||||
|
||||
var r0 string
|
||||
if rf, ok := ret.Get(0).(func(context.Context, v1.ObjectReference) string); ok {
|
||||
r0 = rf(_a0, _a1)
|
||||
} else {
|
||||
r0 = ret.Get(0).(string)
|
||||
}
|
||||
|
||||
return r0
|
||||
}
|
||||
|
||||
// Expose provides a mock function with given fields: _a0, _a1, _a2, _a3, _a4, _a5, _a6
|
||||
func (_m *GenericRestoreExposer) Expose(_a0 context.Context, _a1 v1.ObjectReference, _a2 string, _a3 string, _a4 map[string]string, _a5 v1.ResourceRequirements, _a6 time.Duration) error {
|
||||
ret := _m.Called(_a0, _a1, _a2, _a3, _a4, _a5, _a6)
|
|
@ -37,6 +37,10 @@ type SnapshotExposer interface {
|
|||
// Otherwise, it returns nil immediately.
|
||||
PeekExposed(context.Context, corev1.ObjectReference) error
|
||||
|
||||
// DiagnoseExpose generate the diagnostic info when the expose is not finished for a long time.
|
||||
// If it finds any problem, it returns an string about the problem.
|
||||
DiagnoseExpose(context.Context, corev1.ObjectReference) string
|
||||
|
||||
// CleanUp cleans up any objects generated during the snapshot expose
|
||||
CleanUp(context.Context, corev1.ObjectReference, string, string)
|
||||
}
|
||||
|
|
|
@ -100,8 +100,17 @@ func IsRunning(ctx context.Context, kubeClient kubernetes.Interface, namespace s
|
|||
}
|
||||
}
|
||||
|
||||
// IsRunningInNode checks if the node agent pod is running properly in a specified node. If not, return the error found
|
||||
// KbClientIsRunningInNode checks if the node agent pod is running properly in a specified node through kube client. If not, return the error found
|
||||
func KbClientIsRunningInNode(ctx context.Context, namespace string, nodeName string, kubeClient kubernetes.Interface) error {
|
||||
return isRunningInNode(ctx, namespace, nodeName, nil, kubeClient)
|
||||
}
|
||||
|
||||
// IsRunningInNode checks if the node agent pod is running properly in a specified node through controller client. If not, return the error found
|
||||
func IsRunningInNode(ctx context.Context, namespace string, nodeName string, crClient ctrlclient.Client) error {
|
||||
return isRunningInNode(ctx, namespace, nodeName, crClient, nil)
|
||||
}
|
||||
|
||||
func isRunningInNode(ctx context.Context, namespace string, nodeName string, crClient ctrlclient.Client, kubeClient kubernetes.Interface) error {
|
||||
if nodeName == "" {
|
||||
return errors.New("node name is empty")
|
||||
}
|
||||
|
@ -112,7 +121,12 @@ func IsRunningInNode(ctx context.Context, namespace string, nodeName string, crC
|
|||
return errors.Wrap(err, "fail to parse selector")
|
||||
}
|
||||
|
||||
err = crClient.List(ctx, pods, &ctrlclient.ListOptions{LabelSelector: parsedSelector})
|
||||
if crClient != nil {
|
||||
err = crClient.List(ctx, pods, &ctrlclient.ListOptions{LabelSelector: parsedSelector})
|
||||
} else {
|
||||
pods, err = kubeClient.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: parsedSelector.String()})
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "failed to list daemonset pods")
|
||||
}
|
||||
|
|
|
@ -773,3 +773,45 @@ func WaitUntilVSCHandleIsReady(
|
|||
|
||||
return vsc, nil
|
||||
}
|
||||
|
||||
func DiagnoseVS(vs *snapshotv1api.VolumeSnapshot) string {
|
||||
vscName := ""
|
||||
if vs.Status.BoundVolumeSnapshotContentName != nil {
|
||||
vscName = *vs.Status.BoundVolumeSnapshotContentName
|
||||
}
|
||||
|
||||
readyToUse := false
|
||||
if vs.Status.ReadyToUse != nil {
|
||||
readyToUse = *vs.Status.ReadyToUse
|
||||
}
|
||||
|
||||
errMessage := ""
|
||||
if vs.Status.Error != nil && vs.Status.Error.Message != nil {
|
||||
errMessage = *vs.Status.Error.Message
|
||||
}
|
||||
|
||||
diag := fmt.Sprintf("VS %s/%s, bind to %s, readToUse %v, errMessage %s\n", vs.Namespace, vs.Name, vscName, readyToUse, errMessage)
|
||||
|
||||
return diag
|
||||
}
|
||||
|
||||
func DiagnoseVSC(vsc *snapshotv1api.VolumeSnapshotContent) string {
|
||||
handle := ""
|
||||
if vsc.Status.SnapshotHandle != nil {
|
||||
handle = *vsc.Status.SnapshotHandle
|
||||
}
|
||||
|
||||
readyToUse := false
|
||||
if vsc.Status.ReadyToUse != nil {
|
||||
readyToUse = *vsc.Status.ReadyToUse
|
||||
}
|
||||
|
||||
errMessage := ""
|
||||
if vsc.Status.Error != nil && vsc.Status.Error.Message != nil {
|
||||
errMessage = *vsc.Status.Error.Message
|
||||
}
|
||||
|
||||
diag := fmt.Sprintf("VSC %s, readToUse %v, errMessage %s, handle %s\n", vsc.Name, readyToUse, errMessage, handle)
|
||||
|
||||
return diag
|
||||
}
|
||||
|
|
|
@ -257,3 +257,13 @@ func ToSystemAffinity(loadAffinities []*LoadAffinity) *corev1api.Affinity {
|
|||
|
||||
return nil
|
||||
}
|
||||
|
||||
func DiagnosePod(pod *corev1api.Pod) string {
|
||||
diag := fmt.Sprintf("Pod %s/%s, phase %s, node name %s\n", pod.Namespace, pod.Name, pod.Status.Phase, pod.Spec.NodeName)
|
||||
|
||||
for _, condition := range pod.Status.Conditions {
|
||||
diag += fmt.Sprintf("Pod condition %s, reason %s, message %s\n", condition.Type, condition.Reason, condition.Message)
|
||||
}
|
||||
|
||||
return diag
|
||||
}
|
||||
|
|
|
@ -412,3 +412,20 @@ func GetPVCForPodVolume(vol *corev1api.Volume, pod *corev1api.Pod, crClient crcl
|
|||
|
||||
return pvc, nil
|
||||
}
|
||||
|
||||
func DiagnosePVC(pvc *corev1api.PersistentVolumeClaim) string {
|
||||
diag := fmt.Sprintf("PVC %s/%s, phase %s\n", pvc.Namespace, pvc.Name, pvc.Status.Phase)
|
||||
|
||||
for _, condition := range pvc.Status.Conditions {
|
||||
diag += fmt.Sprintf("PVC condition %s, reason %s, message %s\n", condition.Type, condition.Reason, condition.Message)
|
||||
}
|
||||
|
||||
diag += fmt.Sprintf("PVC is binding to %s\n", pvc.Spec.VolumeName)
|
||||
|
||||
return diag
|
||||
}
|
||||
|
||||
func DiagnosePV(pv *corev1api.PersistentVolume) string {
|
||||
diag := fmt.Sprintf("PV %s, phase %s, reason %s, message %s\n", pv.Name, pv.Status.Phase, pv.Status.Reason, pv.Status.Message)
|
||||
return diag
|
||||
}
|
||||
|
|
|
@ -1463,3 +1463,6 @@ func TestMakePodPVCAttachment(t *testing.T) {
|
|||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestDiagnosePVC(t *testing.T) {
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue