Merge pull request #8518 from Lyndon-Li/fail-fs-backup-on-windows-nodes

fs-backup for clusters with Windows nodes
pull/8557/head
lyndon-li 2024-12-24 15:15:15 +08:00 committed by GitHub
commit 78c97d93b5
11 changed files with 329 additions and 22 deletions

View File

@@ -0,0 +1 @@
Make fs-backup work on linux nodes with the new Velero deployment and disable fs-backup if the source/target pod is running on a non-linux node (#8424)
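
As a usage illustration (not part of this diff), a caller could gate fs-backup on the pod's node OS with the new kube.IsLinuxNode helper introduced later in this PR; the canRunFSBackup wrapper name is hypothetical:

package example

import (
	"context"

	"github.com/pkg/errors"
	corev1api "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"

	"github.com/vmware-tanzu/velero/pkg/util/kube"
)

// canRunFSBackup is a hypothetical wrapper: fs-backup is allowed only when
// the source/target pod is scheduled on a linux node.
func canRunFSBackup(ctx context.Context, c client.Client, pod *corev1api.Pod) error {
	// kube.IsLinuxNode (added by this PR) reads the node's
	// "kubernetes.io/os" label and returns an error for any non-linux OS.
	if err := kube.IsLinuxNode(ctx, pod.Spec.NodeName, c); err != nil {
		return errors.Wrapf(err, "fs-backup not supported for pod %s/%s", pod.Namespace, pod.Name)
	}
	return nil
}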

View File

@@ -398,7 +398,9 @@ func (o *Options) Run(c *cobra.Command, f client.Factory) error {
if _, err = install.NodeAgentIsReady(dynamicFactory, o.Namespace); err != nil {
return errors.Wrap(err, errorMsg)
}
}
if o.UseNodeAgentWindows {
fmt.Println("Waiting for node-agent-windows daemonset to be ready.")
if _, err = install.NodeAgentWindowsIsReady(dynamicFactory, o.Namespace); err != nil {
return errors.Wrap(err, errorMsg)

View File

@@ -82,6 +82,7 @@ import (
"github.com/vmware-tanzu/velero/pkg/restore"
"github.com/vmware-tanzu/velero/pkg/uploader"
"github.com/vmware-tanzu/velero/pkg/util/filesystem"
"github.com/vmware-tanzu/velero/pkg/util/kube"
"github.com/vmware-tanzu/velero/pkg/util/logging"
)
@@ -471,10 +472,20 @@ func (s *server) veleroResourcesExist() error {
func (s *server) checkNodeAgent() {
// warn if node agent does not exist
if err := nodeagent.IsRunning(s.ctx, s.kubeClient, s.namespace); err == nodeagent.ErrDaemonSetNotFound {
s.logger.Warn("Velero node agent not found; pod volume backups/restores will not work until it's created")
} else if err != nil {
s.logger.WithError(errors.WithStack(err)).Warn("Error checking for existence of velero node agent")
if kube.WithLinuxNode(s.ctx, s.crClient, s.logger) {
if err := nodeagent.IsRunningOnLinux(s.ctx, s.kubeClient, s.namespace); err == nodeagent.ErrDaemonSetNotFound {
s.logger.Warn("Velero node agent not found for linux nodes; pod volume backups/restores and data mover backups/restores will not work until it's created")
} else if err != nil {
s.logger.WithError(errors.WithStack(err)).Warn("Error checking for existence of velero node agent for linux nodes")
}
}
if kube.WithWindowsNode(s.ctx, s.crClient, s.logger) {
if err := nodeagent.IsRunningOnWindows(s.ctx, s.kubeClient, s.namespace); err == nodeagent.ErrDaemonSetNotFound {
s.logger.Warn("Velero node agent not found for Windows nodes; pod volume backups/restores and data mover backups/restores will not work until it's created")
} else if err != nil {
s.logger.WithError(errors.WithStack(err)).Warn("Error checking for existence of velero node agent for Windows nodes")
}
}
}

View File

@@ -33,9 +33,12 @@ import (
)
const (
// daemonSet is the name of the Velero node agent daemonset.
// daemonSet is the name of the Velero node agent daemonset on linux nodes.
daemonSet = "node-agent"
// daemonsetWindows is the name of the Velero node agent daemonset on Windows nodes.
daemonsetWindows = "node-agent-windows"
// nodeAgentRole marks pods with node-agent role on all nodes.
nodeAgentRole = "node-agent"
)
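
To make the mapping these constants encode explicit, a minimal sketch (not in this diff; the daemonSetForOS helper is hypothetical):

func daemonSetForOS(osType string) string {
	// linux nodes are served by the "node-agent" daemonset,
	// Windows nodes by the separate "node-agent-windows" daemonset.
	if osType == "windows" {
		return daemonsetWindows
	}
	return daemonSet
}
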
@@ -92,9 +95,16 @@ type Configs struct {
PodResources *kube.PodResources `json:"podResources,omitempty"`
}
// IsRunning checks if the node agent daemonset is running properly. If not, return the error found
func IsRunning(ctx context.Context, kubeClient kubernetes.Interface, namespace string) error {
if _, err := kubeClient.AppsV1().DaemonSets(namespace).Get(ctx, daemonSet, metav1.GetOptions{}); apierrors.IsNotFound(err) {
func IsRunningOnLinux(ctx context.Context, kubeClient kubernetes.Interface, namespace string) error {
return isRunning(ctx, kubeClient, namespace, daemonSet)
}
func IsRunningOnWindows(ctx context.Context, kubeClient kubernetes.Interface, namespace string) error {
return isRunning(ctx, kubeClient, namespace, daemonsetWindows)
}
func isRunning(ctx context.Context, kubeClient kubernetes.Interface, namespace string, daemonset string) error {
if _, err := kubeClient.AppsV1().DaemonSets(namespace).Get(ctx, daemonset, metav1.GetOptions{}); apierrors.IsNotFound(err) {
return ErrDaemonSetNotFound
} else if err != nil {
return err

View File

@@ -40,7 +40,7 @@ type reactor struct {
}
func TestIsRunning(t *testing.T) {
daemonSet := &appsv1.DaemonSet{
ds := &appsv1.DaemonSet{
ObjectMeta: metav1.ObjectMeta{
Namespace: "fake-ns",
Name: "node-agent",
@@ -80,7 +80,7 @@ func TestIsRunning(t *testing.T) {
name: "succeed",
namespace: "fake-ns",
kubeClientObj: []runtime.Object{
daemonSet,
ds,
},
},
}
@@ -93,7 +93,7 @@ func TestIsRunning(t *testing.T) {
fakeKubeClient.Fake.PrependReactor(reactor.verb, reactor.resource, reactor.reactorFunc)
}
err := IsRunning(context.TODO(), fakeKubeClient, test.namespace)
err := isRunning(context.TODO(), fakeKubeClient, test.namespace, daemonSet)
if test.expectErr == "" {
assert.NoError(t, err)
} else {

View File

@@ -206,6 +206,12 @@ func (b *backupper) BackupPodVolumes(backup *velerov1api.Backup, pod *corev1api.
return nil, pvcSummary, nil
}
if err := kube.IsLinuxNode(b.ctx, pod.Spec.NodeName, b.crClient); err != nil {
err := errors.Wrapf(err, "Pod %s/%s is not running in linux node(%s), skip", pod.Namespace, pod.Name, pod.Spec.NodeName)
skipAllPodVolumes(pod, volumesToBackup, err, pvcSummary, log)
return nil, pvcSummary, []error{err}
}
err := nodeagent.IsRunningInNode(b.ctx, backup.Namespace, pod.Spec.NodeName, b.crClient)
if err != nil {
skipAllPodVolumes(pod, volumesToBackup, err, pvcSummary, log)

View File

@@ -303,6 +303,14 @@ func createPVBObj(fail bool, withSnapshot bool, index int, uploaderType string)
return pvbObj
}
func createNodeObj() *corev1api.Node {
return builder.ForNode("fake-node-name").Labels(map[string]string{"kubernetes.io/os": "linux"}).Result()
}
func createWindowsNodeObj() *corev1api.Node {
return builder.ForNode("fake-node-name").Labels(map[string]string{"kubernetes.io/os": "windows"}).Result()
}
func TestBackupPodVolumes(t *testing.T) {
scheme := runtime.NewScheme()
velerov1api.AddToScheme(scheme)
@@ -358,13 +366,32 @@ func TestBackupPodVolumes(t *testing.T) {
uploaderType: "kopia",
bsl: "fake-bsl",
},
{
name: "pod is not running on Linux node",
volumes: []string{
"fake-volume-1",
"fake-volume-2",
},
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createWindowsNodeObj(),
},
sourcePod: createPodObj(false, false, false, 2),
uploaderType: "kopia",
errs: []string{
"Pod fake-ns/fake-pod is not running in linux node(fake-node-name), skip",
},
},
{
name: "node-agent pod is not running in node",
volumes: []string{
"fake-volume-1",
"fake-volume-2",
},
sourcePod: createPodObj(true, false, false, 2),
sourcePod: createPodObj(true, false, false, 2),
kubeClientObj: []runtime.Object{
createNodeObj(),
},
uploaderType: "kopia",
errs: []string{
"daemonset pod not found in running state in node fake-node-name",
@@ -379,6 +406,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, false, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
},
uploaderType: "kopia",
mockGetRepositoryType: true,
@@ -395,6 +423,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, false, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
},
uploaderType: "kopia",
errs: []string{
@@ -410,6 +439,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, false, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
},
ctlClientObj: []runtime.Object{
createBackupRepoObj(),
@@ -427,6 +457,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, true, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
},
ctlClientObj: []runtime.Object{
createBackupRepoObj(),
@@ -448,6 +479,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, true, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
createPVCObj(1),
createPVCObj(2),
},
@@ -471,6 +503,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, true, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
createPVCObj(1),
createPVCObj(2),
createPVObj(1, true),
@@ -482,6 +515,7 @@ func TestBackupPodVolumes(t *testing.T) {
runtimeScheme: scheme,
uploaderType: "kopia",
bsl: "fake-bsl",
errs: []string{},
},
{
name: "volume not mounted by pod should be skipped",
@@ -492,6 +526,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, true, false, 2),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
createPVCObj(1),
createPVCObj(2),
createPVObj(1, false),
@@ -503,6 +538,7 @@ func TestBackupPodVolumes(t *testing.T) {
runtimeScheme: scheme,
uploaderType: "kopia",
bsl: "fake-bsl",
errs: []string{},
},
{
name: "return completed pvbs",
@@ -512,6 +548,7 @@ func TestBackupPodVolumes(t *testing.T) {
sourcePod: createPodObj(true, true, true, 1),
kubeClientObj: []runtime.Object{
createNodeAgentPodObj(true),
createNodeObj(),
createPVCObj(1),
createPVObj(1, false),
},
@@ -522,6 +559,7 @@ func TestBackupPodVolumes(t *testing.T) {
uploaderType: "kopia",
bsl: "fake-bsl",
pvbs: 1,
errs: []string{},
},
}
// TODO add more verification around PVCBackupSummary returned by "BackupPodVolumes"
@@ -568,8 +606,8 @@ func TestBackupPodVolumes(t *testing.T) {
pvbs, _, errs := bp.BackupPodVolumes(backupObj, test.sourcePod, test.volumes, nil, velerotest.NewLogger())
if errs == nil {
assert.Nil(t, test.errs)
if test.errs == nil {
assert.NoError(t, err)
} else {
for i := 0; i < len(errs); i++ {
assert.EqualError(t, errs[i], test.errs[i])

View File

@@ -122,7 +122,7 @@ func (r *restorer) RestorePodVolumes(data RestoreData, tracker *volume.RestoreVo
return nil
}
if err := nodeagent.IsRunning(r.ctx, r.kubeClient, data.Restore.Namespace); err != nil {
if err := nodeagent.IsRunningOnLinux(r.ctx, r.kubeClient, data.Restore.Namespace); err != nil {
return []error{errors.Wrapf(err, "error to check node agent status")}
}
@@ -213,6 +213,12 @@ func (r *restorer) RestorePodVolumes(data RestoreData, tracker *volume.RestoreVo
} else if err != nil {
r.log.WithError(err).Error("Failed to check node-agent pod status, disengage")
} else {
if err := kube.IsLinuxNode(checkCtx, nodeName, r.crClient); err != nil {
r.log.WithField("node", nodeName).WithError(err).Error("Restored pod is not running in linux node")
r.nodeAgentCheck <- errors.Wrapf(err, "restored pod %s/%s is not running in linux node(%s)", data.Pod.Namespace, data.Pod.Name, nodeName)
return
}
err = nodeagent.IsRunningInNode(checkCtx, data.Restore.Namespace, nodeName, r.crClient)
if err != nil {
r.log.WithField("node", nodeName).WithError(err).Error("node-agent pod is not running in node, abort the restore")

View File

@@ -33,7 +33,6 @@ import (
"k8s.io/client-go/kubernetes"
kubefake "k8s.io/client-go/kubernetes/fake"
"k8s.io/client-go/tools/cache"
ctrlfake "sigs.k8s.io/controller-runtime/pkg/client/fake"
"github.com/vmware-tanzu/velero/internal/volume"
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
@@ -314,6 +313,30 @@ func TestRestorePodVolumes(t *testing.T) {
},
},
},
{
name: "pod is not running on linux nodes",
pvbs: []*velerov1api.PodVolumeBackup{
createPVBObj(true, true, 1, "kopia"),
},
kubeClientObj: []runtime.Object{
createNodeAgentDaemonset(),
createWindowsNodeObj(),
createPVCObj(1),
createPodObj(true, true, true, 1),
},
ctlClientObj: []runtime.Object{
createBackupRepoObj(),
},
restoredPod: createPodObj(true, true, true, 1),
sourceNamespace: "fake-ns",
bsl: "fake-bsl",
runtimeScheme: scheme,
errs: []expectError{
{
err: "restored pod fake-ns/fake-pod is not running in linux node(fake-node-name): os type windows for node fake-node-name is not linux",
},
},
},
{
name: "node-agent pod is not running",
pvbs: []*velerov1api.PodVolumeBackup{
@@ -321,6 +344,7 @@ func TestRestorePodVolumes(t *testing.T) {
},
kubeClientObj: []runtime.Object{
createNodeAgentDaemonset(),
createNodeObj(),
createPVCObj(1),
createPodObj(true, true, true, 1),
},
@@ -344,6 +368,7 @@ func TestRestorePodVolumes(t *testing.T) {
},
kubeClientObj: []runtime.Object{
createNodeAgentDaemonset(),
createNodeObj(),
createPVCObj(1),
createPodObj(true, true, true, 1),
createNodeAgentPodObj(true),
@@ -368,11 +393,6 @@ func TestRestorePodVolumes(t *testing.T) {
ctx = test.ctx
}
fakeClientBuilder := ctrlfake.NewClientBuilder()
if test.runtimeScheme != nil {
fakeClientBuilder = fakeClientBuilder.WithScheme(test.runtimeScheme)
}
objClient := append(test.ctlClientObj, test.kubeClientObj...)
objClient = append(objClient, test.veleroClientObj...)
@@ -438,7 +458,8 @@ func TestRestorePodVolumes(t *testing.T) {
for i := 0; i < len(errs); i++ {
j := 0
for ; j < len(test.errs); j++ {
if errs[i].Error() == test.errs[j].err {
err := errs[i].Error()
if err == test.errs[j].err {
break
}
}

pkg/util/kube/node.go (new file, 80 lines)
View File

@@ -0,0 +1,80 @@
/*
Copyright The Velero Contributors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package kube
import (
"context"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
corev1api "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
)
func IsLinuxNode(ctx context.Context, nodeName string, client client.Client) error {
node := &corev1api.Node{}
if err := client.Get(ctx, types.NamespacedName{Name: nodeName}, node); err != nil {
return errors.Wrapf(err, "error getting node %s", nodeName)
}
os, found := node.Labels["kubernetes.io/os"]
if !found {
return errors.Errorf("no os type label for node %s", nodeName)
}
if os != "linux" {
return errors.Errorf("os type %s for node %s is not linux", os, nodeName)
}
return nil
}
func WithLinuxNode(ctx context.Context, client client.Client, log logrus.FieldLogger) bool {
return withOSNode(ctx, client, "linux", log)
}
func WithWindowsNode(ctx context.Context, client client.Client, log logrus.FieldLogger) bool {
return withOSNode(ctx, client, "windows", log)
}
func withOSNode(ctx context.Context, client client.Client, osType string, log logrus.FieldLogger) bool {
nodeList := new(corev1api.NodeList)
if err := client.List(ctx, nodeList); err != nil {
log.Warnf("Failed to list nodes, cannot decide existence of nodes of OS %s", osType)
return false
}
allNodeLabeled := true
for _, node := range nodeList.Items {
os, found := node.Labels["kubernetes.io/os"]
if os == osType {
return true
}
if !found {
allNodeLabeled = false
}
}
if !allNodeLabeled {
log.Warnf("Not all nodes have os type label, cannot decide existence of nodes of OS %s", osType)
}
return false
}

pkg/util/kube/node_test.go (new file, 132 lines)
View File

@@ -0,0 +1,132 @@
/*
Copyright The Velero Contributors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package kube
import (
"context"
"testing"
"github.com/stretchr/testify/assert"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"github.com/vmware-tanzu/velero/pkg/builder"
clientFake "sigs.k8s.io/controller-runtime/pkg/client/fake"
velerotest "github.com/vmware-tanzu/velero/pkg/test"
)
func TestIsLinuxNode(t *testing.T) {
nodeNoOSLabel := builder.ForNode("fake-node").Result()
nodeWindows := builder.ForNode("fake-node").Labels(map[string]string{"kubernetes.io/os": "windows"}).Result()
nodeLinux := builder.ForNode("fake-node").Labels(map[string]string{"kubernetes.io/os": "linux"}).Result()
scheme := runtime.NewScheme()
corev1.AddToScheme(scheme)
tests := []struct {
name string
kubeClientObj []runtime.Object
err string
}{
{
name: "error getting node",
err: "error getting node fake-node: nodes \"fake-node\" not found",
},
{
name: "no os label",
kubeClientObj: []runtime.Object{
nodeNoOSLabel,
},
err: "no os type label for node fake-node",
},
{
name: "os label does not match",
kubeClientObj: []runtime.Object{
nodeWindows,
},
err: "os type windows for node fake-node is not linux",
},
{
name: "succeed",
kubeClientObj: []runtime.Object{
nodeLinux,
},
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
fakeClientBuilder := clientFake.NewClientBuilder()
fakeClientBuilder = fakeClientBuilder.WithScheme(scheme)
fakeClient := fakeClientBuilder.WithRuntimeObjects(test.kubeClientObj...).Build()
err := IsLinuxNode(context.TODO(), "fake-node", fakeClient)
if err != nil {
assert.EqualError(t, err, test.err)
} else {
assert.NoError(t, err)
}
})
}
}
func TestWithLinuxNode(t *testing.T) {
nodeWindows := builder.ForNode("fake-node-1").Labels(map[string]string{"kubernetes.io/os": "windows"}).Result()
nodeLinux := builder.ForNode("fake-node-2").Labels(map[string]string{"kubernetes.io/os": "linux"}).Result()
scheme := runtime.NewScheme()
corev1.AddToScheme(scheme)
tests := []struct {
name string
kubeClientObj []runtime.Object
result bool
}{
{
name: "error listing node",
},
{
name: "with node of other type",
kubeClientObj: []runtime.Object{
nodeWindows,
},
},
{
name: "with node of the same type",
kubeClientObj: []runtime.Object{
nodeWindows,
nodeLinux,
},
result: true,
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
fakeClientBuilder := clientFake.NewClientBuilder()
fakeClientBuilder = fakeClientBuilder.WithScheme(scheme)
fakeClient := fakeClientBuilder.WithRuntimeObjects(test.kubeClientObj...).Build()
result := withOSNode(context.TODO(), fakeClient, "linux", velerotest.NewLogger())
assert.Equal(t, test.result, result)
})
}
}