only fall back to next driver if provisioning fails

pull/7389/head
Sharif Elgamal 2020-04-03 17:46:42 -07:00
parent 31f225d90e
commit 3d037a8871
4 changed files with 168 additions and 82 deletions
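
In short: host provisioning and node startup are now separate phases, and only a provisioning failure triggers fallback to the next driver; a failure during startup is handled by delete-and-retry instead. A rough, self-contained sketch of that control flow (provision and start below are hypothetical stand-ins for node.Provision/node.Start, not minikube's real API):

package main

import (
	"errors"
	"fmt"
)

// provision is a stand-in for creating the VM/container with a given driver.
func provision(driver string) (host string, err error) {
	if driver == "bad" {
		return "", errors.New("driver cannot create a host")
	}
	return "host-" + driver, nil
}

// start is a stand-in for bootstrapping Kubernetes on an already-provisioned host.
func start(host string) error { return nil }

func main() {
	drivers := []string{"bad", "docker", "virtualbox"}
	var host string
	var err error
	for _, d := range drivers {
		// Only a provisioning failure moves us to the next driver in the list.
		if host, err = provision(d); err == nil {
			break
		}
		fmt.Printf("provisioning with %q failed: %v; trying the next driver\n", d, err)
	}
	if err != nil {
		fmt.Println("no driver could provision a host:", err)
		return
	}
	// A startup failure is no longer a reason to switch drivers.
	if err := start(host); err != nil {
		fmt.Println("start failed:", err)
	}
}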


@@ -49,7 +49,22 @@ var nodeStartCmd = &cobra.Command{
exit.WithError("retrieving node", err)
}
_, err = node.Start(*cc, *n, nil, false)
r, p, m, h, err := node.Provision(*cc, *n, false)
if err != nil {
exit.WithError("provisioning host for node", err)
}
s := node.Starter{
Runner: r,
PreExists: p,
MachineAPI: m,
Host: h,
Cfg: *cc,
Node: *n,
ExistingAddons: nil,
}
_, err = node.Start(s, false)
if err != nil {
_, err := maybeDeleteAndRetry(*cc, *n, nil, err)
if err != nil {


@@ -305,38 +305,49 @@ func runStart(cmd *cobra.Command, args []string) {
validateSpecifiedDriver(existing)
ds, alts, specified := selectDriver(existing)
err = startWithDriver(cmd, ds, existing)
starter, err := provisionWithDriver(cmd, ds, existing)
if err != nil && !specified {
success := false
// Walk down the rest of the options
for _, alt := range alts {
out.WarningT("Startup with {{.old_driver}} driver failed, trying with {{.new_driver}}.", out.V{"old_driver": ds.Name, "new_driver": alt.Name})
out.WarningT("Startup with {{.old_driver}} driver failed, trying with {{.new_driver}}: {{.error}}", out.V{"old_driver": ds.Name, "new_driver": alt.Name, "error": err})
ds = alt
// Delete the existing cluster and try again with the next driver on the list
profile, err := config.LoadProfile(ClusterFlagValue())
if err != nil {
out.ErrT(out.Meh, `"{{.name}}" profile does not exist, trying anyways.`, out.V{"name": ClusterFlagValue()})
glog.Warningf("%s profile does not exist, trying anyways.", ClusterFlagValue())
}
err = deleteProfile(profile)
if err != nil {
out.WarningT("Failed to delete cluster {{.name}}, proceeding with retry anyway.", out.V{"name": ClusterFlagValue()})
}
err = startWithDriver(cmd, ds, existing)
starter, err = provisionWithDriver(cmd, ds, existing)
if err != nil {
continue
} else {
// Success!
os.Exit(0)
success = true
break
}
}
if !success {
exit.WithError("error provisioning host", err)
}
}
// Use the most recent error
exit.WithError("startup failed", err)
kubeconfig, err := startWithDriver(starter, existing)
if err != nil {
exit.WithError("failed to start node", err)
}
if err := showKubectlInfo(kubeconfig, starter.Node.KubernetesVersion, starter.Cfg.Name); err != nil {
glog.Errorf("kubectl info: %v", err)
}
}
func startWithDriver(cmd *cobra.Command, ds registry.DriverState, existing *config.ClusterConfig) error {
func provisionWithDriver(cmd *cobra.Command, ds registry.DriverState, existing *config.ClusterConfig) (node.Starter, error) {
driverName := ds.Name
glog.Infof("selected driver: %s", driverName)
validateDriver(ds, existing)
@@ -356,19 +367,19 @@ func startWithDriver(cmd *cobra.Command, ds registry.DriverState, existing *conf
k8sVersion := getKubernetesVersion(existing)
cc, n, err := generateCfgFromFlags(cmd, k8sVersion, driverName)
if err != nil {
return errors.Wrap(err, "Failed to generate config")
return node.Starter{}, errors.Wrap(err, "Failed to generate config")
}
// This is about as far as we can go without overwriting config files
if viper.GetBool(dryRun) {
out.T(out.DryRun, `dry-run validation complete!`)
return nil
return node.Starter{}, nil
}
if driver.IsVM(driverName) {
url, err := download.ISO(viper.GetStringSlice(isoURL), cmd.Flags().Changed(isoURL))
if err != nil {
return errors.Wrap(err, "Failed to cache ISO")
return node.Starter{}, errors.Wrap(err, "Failed to cache ISO")
}
cc.MinikubeISO = url
}
@@ -387,11 +398,29 @@ func startWithDriver(cmd *cobra.Command, ds registry.DriverState, existing *conf
}
}
kubeconfig, err := node.Start(cc, n, existingAddons, true)
mRunner, preExists, mAPI, host, err := node.Provision(cc, n, true)
if err != nil {
kubeconfig, err = maybeDeleteAndRetry(cc, n, existingAddons, err)
return node.Starter{}, err
}
return node.Starter{
Runner: mRunner,
PreExists: preExists,
MachineAPI: mAPI,
Host: host,
ExistingAddons: existingAddons,
Cfg: cc,
Node: n,
}, nil
}
func startWithDriver(starter node.Starter, existing *config.ClusterConfig) (*kubeconfig.Settings, error) {
kubeconfig, err := node.Start(starter, true)
if err != nil {
kubeconfig, err = maybeDeleteAndRetry(starter.Cfg, starter.Node, starter.ExistingAddons, err)
if err != nil {
return err
return nil, err
}
}
@@ -400,7 +429,7 @@ func startWithDriver(cmd *cobra.Command, ds registry.DriverState, existing *conf
numNodes = len(existing.Nodes)
}
if numNodes > 1 {
if driver.BareMetal(driverName) {
if driver.BareMetal(starter.Cfg.Driver) {
exit.WithCodeT(exit.Config, "The none driver is not compatible with multi-node clusters.")
} else {
for i := 1; i < numNodes; i++ {
@@ -409,22 +438,18 @@ func startWithDriver(cmd *cobra.Command, ds registry.DriverState, existing *conf
Name: nodeName,
Worker: true,
ControlPlane: false,
KubernetesVersion: cc.KubernetesConfig.KubernetesVersion,
KubernetesVersion: starter.Cfg.KubernetesConfig.KubernetesVersion,
}
out.Ln("") // extra newline for clarity on the command line
err := node.Add(&cc, n)
err := node.Add(&starter.Cfg, n)
if err != nil {
return errors.Wrap(err, "adding node")
return nil, errors.Wrap(err, "adding node")
}
}
}
}
if err := showKubectlInfo(kubeconfig, k8sVersion, cc.Name); err != nil {
glog.Errorf("kubectl info: %v", err)
}
return nil
return kubeconfig, nil
}
func updateDriver(driverName string) {
@@ -514,9 +539,24 @@ func maybeDeleteAndRetry(cc config.ClusterConfig, n config.Node, existingAddons
}
var kubeconfig *kubeconfig.Settings
for _, v := range cc.Nodes {
k, err := node.Start(cc, v, existingAddons, v.ControlPlane)
if v.ControlPlane {
for _, n := range cc.Nodes {
r, p, m, h, err := node.Provision(cc, n, n.ControlPlane)
s := node.Starter{
Runner: r,
PreExists: p,
MachineAPI: m,
Host: h,
Cfg: cc,
Node: n,
ExistingAddons: existingAddons,
}
if err != nil {
// Ok we failed again, let's bail
return nil, err
}
k, err := node.Start(s, n.ControlPlane)
if n.ControlPlane {
kubeconfig = k
}
if err != nil {


@@ -39,7 +39,21 @@ func Add(cc *config.ClusterConfig, n config.Node) error {
return errors.Wrap(err, "save node")
}
_, err := Start(*cc, n, nil, false)
r, p, m, h, err := Provision(*cc, n, false)
if err != nil {
return err
}
s := Starter{
Runner: r,
PreExists: p,
MachineAPI: m,
Host: h,
Cfg: *cc,
Node: n,
ExistingAddons: nil,
}
_, err = Start(s, false)
return err
}


@@ -63,63 +63,50 @@ const (
containerRuntime = "container-runtime"
)
var (
kicGroup errgroup.Group
cacheGroup errgroup.Group
)
// Starter is a struct with all the necessary information to start a node
type Starter struct {
Runner command.Runner
PreExists bool
MachineAPI libmachine.API
Host *host.Host
Cfg config.ClusterConfig
Node config.Node
ExistingAddons map[string]bool
}
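
For reference, every call site touched by this commit follows the same shape: provision, bundle the results into a Starter, then start. A minimal sketch of that shape, assuming the usual k8s.io/minikube import paths (the paths themselves are not shown in this diff):

package example

import (
	"k8s.io/minikube/pkg/minikube/config"
	"k8s.io/minikube/pkg/minikube/node"
)

// startNode mirrors the updated call sites: Provision first, then hand the
// results to Start via the new Starter struct.
func startNode(cc config.ClusterConfig, n config.Node, existingAddons map[string]bool) error {
	r, p, m, h, err := node.Provision(cc, n, n.ControlPlane)
	if err != nil {
		// Provisioning failed; this is the only point at which callers may
		// fall back to a different driver.
		return err
	}
	s := node.Starter{
		Runner:         r,
		PreExists:      p,
		MachineAPI:     m,
		Host:           h,
		Cfg:            cc,
		Node:           n,
		ExistingAddons: existingAddons,
	}
	_, err = node.Start(s, n.ControlPlane)
	return err
}
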
// Start spins up a guest and starts the kubernetes node.
func Start(cc config.ClusterConfig, n config.Node, existingAddons map[string]bool, apiServer bool) (*kubeconfig.Settings, error) {
cp := ""
if apiServer {
cp = "control plane "
}
out.T(out.ThumbsUp, "Starting {{.controlPlane}}node {{.name}} in cluster {{.cluster}}", out.V{"controlPlane": cp, "name": n.Name, "cluster": cc.Name})
var kicGroup errgroup.Group
if driver.IsKIC(cc.Driver) {
beginDownloadKicArtifacts(&kicGroup)
}
var cacheGroup errgroup.Group
if !driver.BareMetal(cc.Driver) {
beginCacheKubernetesImages(&cacheGroup, cc.KubernetesConfig.ImageRepository, n.KubernetesVersion, cc.KubernetesConfig.ContainerRuntime)
}
// Abstraction leakage alert: startHost requires the config to be saved, to satisfy pkg/provision/buildroot.
// Hence, saveConfig must be called before startHost, and again afterwards when we know the IP.
if err := config.SaveProfile(viper.GetString(config.ProfileName), &cc); err != nil {
return nil, errors.Wrap(err, "Failed to save config")
}
handleDownloadOnly(&cacheGroup, &kicGroup, n.KubernetesVersion)
waitDownloadKicArtifacts(&kicGroup)
mRunner, preExists, machineAPI, host := startMachine(&cc, &n)
defer machineAPI.Close()
func Start(starter Starter, apiServer bool) (*kubeconfig.Settings, error) {
// wait for preloaded tarball to finish downloading before configuring runtimes
waitCacheRequiredImages(&cacheGroup)
sv, err := util.ParseKubernetesVersion(n.KubernetesVersion)
sv, err := util.ParseKubernetesVersion(starter.Node.KubernetesVersion)
if err != nil {
return nil, errors.Wrap(err, "Failed to parse kubernetes version")
}
// configure the runtime (docker, containerd, crio)
cr := configureRuntimes(mRunner, cc.Driver, cc.KubernetesConfig, sv)
showVersionInfo(n.KubernetesVersion, cr)
cr := configureRuntimes(starter.Runner, starter.Cfg.Driver, starter.Cfg.KubernetesConfig, sv)
showVersionInfo(starter.Node.KubernetesVersion, cr)
var bs bootstrapper.Bootstrapper
var kcs *kubeconfig.Settings
if apiServer {
// Must be written before bootstrap, otherwise health checks may flake due to stale IP
kcs = setupKubeconfig(host, &cc, &n, cc.Name)
kcs = setupKubeconfig(starter.Host, &starter.Cfg, &starter.Node, starter.Cfg.Name)
if err != nil {
return nil, errors.Wrap(err, "Failed to setup kubeconfig")
}
// setup kubeadm (must come after setupKubeconfig)
bs = setupKubeAdm(machineAPI, cc, n)
err = bs.StartCluster(cc)
bs = setupKubeAdm(starter.MachineAPI, starter.Cfg, starter.Node)
err = bs.StartCluster(starter.Cfg)
if err != nil {
out.LogEntries("Error starting cluster", err, logs.FindProblems(cr, bs, cc, mRunner))
out.LogEntries("Error starting cluster", err, logs.FindProblems(cr, bs, starter.Cfg, starter.Runner))
return nil, err
}
@@ -128,12 +115,12 @@ func Start(cc config.ClusterConfig, n config.Node, existingAddons map[string]boo
return nil, errors.Wrap(err, "Failed to update kubeconfig file.")
}
} else {
bs, err = cluster.Bootstrapper(machineAPI, viper.GetString(cmdcfg.Bootstrapper), cc, n)
bs, err = cluster.Bootstrapper(starter.MachineAPI, viper.GetString(cmdcfg.Bootstrapper), starter.Cfg, starter.Node)
if err != nil {
return nil, errors.Wrap(err, "Failed to get bootstrapper")
}
if err = bs.SetupCerts(cc.KubernetesConfig, n); err != nil {
if err = bs.SetupCerts(starter.Cfg.KubernetesConfig, starter.Node); err != nil {
return nil, errors.Wrap(err, "setting up certs")
}
}
@@ -145,43 +132,43 @@ func Start(cc config.ClusterConfig, n config.Node, existingAddons map[string]boo
}
// enable addons, both old and new!
if existingAddons != nil {
addons.Start(viper.GetString(config.ProfileName), existingAddons, config.AddonList)
if starter.ExistingAddons != nil {
addons.Start(viper.GetString(config.ProfileName), starter.ExistingAddons, config.AddonList)
}
if apiServer {
// special ops for none , like change minikube directory.
// multinode super doesn't work on the none driver
if cc.Driver == driver.None && len(cc.Nodes) == 1 {
if starter.Cfg.Driver == driver.None && len(starter.Cfg.Nodes) == 1 {
prepareNone()
}
// Skip pre-existing, because we already waited for health
if viper.GetBool(waitUntilHealthy) && !preExists {
if err := bs.WaitForNode(cc, n, viper.GetDuration(waitTimeout)); err != nil {
if viper.GetBool(waitUntilHealthy) && !starter.PreExists {
if err := bs.WaitForNode(starter.Cfg, starter.Node, viper.GetDuration(waitTimeout)); err != nil {
return nil, errors.Wrap(err, "Wait failed")
}
}
} else {
if err := bs.UpdateNode(cc, n, cr); err != nil {
if err := bs.UpdateNode(starter.Cfg, starter.Node, cr); err != nil {
return nil, errors.Wrap(err, "Updating node")
}
cp, err := config.PrimaryControlPlane(&cc)
cp, err := config.PrimaryControlPlane(&starter.Cfg)
if err != nil {
return nil, errors.Wrap(err, "Getting primary control plane")
}
cpBs, err := cluster.Bootstrapper(machineAPI, viper.GetString(cmdcfg.Bootstrapper), cc, cp)
cpBs, err := cluster.Bootstrapper(starter.MachineAPI, viper.GetString(cmdcfg.Bootstrapper), starter.Cfg, cp)
if err != nil {
return nil, errors.Wrap(err, "Getting bootstrapper")
}
joinCmd, err := cpBs.GenerateToken(cc)
joinCmd, err := cpBs.GenerateToken(starter.Cfg)
if err != nil {
return nil, errors.Wrap(err, "generating join token")
}
if err = bs.JoinCluster(cc, n, joinCmd); err != nil {
if err = bs.JoinCluster(starter.Cfg, starter.Node, joinCmd); err != nil {
return nil, errors.Wrap(err, "joining cluster")
}
}
@@ -189,6 +176,36 @@ func Start(cc config.ClusterConfig, n config.Node, existingAddons map[string]boo
return kcs, nil
}
// Provision provisions the machine/container for the node
func Provision(cc config.ClusterConfig, n config.Node, apiServer bool) (command.Runner, bool, libmachine.API, *host.Host, error) {
cp := ""
if apiServer {
cp = "control plane "
}
out.T(out.ThumbsUp, "Starting {{.controlPlane}}node {{.name}} in cluster {{.cluster}}", out.V{"controlPlane": cp, "name": n.Name, "cluster": cc.Name})
if driver.IsKIC(cc.Driver) {
beginDownloadKicArtifacts(&kicGroup)
}
if !driver.BareMetal(cc.Driver) {
beginCacheKubernetesImages(&cacheGroup, cc.KubernetesConfig.ImageRepository, n.KubernetesVersion, cc.KubernetesConfig.ContainerRuntime)
}
// Abstraction leakage alert: startHost requires the config to be saved, to satisfy pkg/provision/buildroot.
// Hence, saveConfig must be called before startHost, and again afterwards when we know the IP.
if err := config.SaveProfile(viper.GetString(config.ProfileName), &cc); err != nil {
return nil, false, nil, nil, errors.Wrap(err, "Failed to save config")
}
handleDownloadOnly(&cacheGroup, &kicGroup, n.KubernetesVersion)
waitDownloadKicArtifacts(&kicGroup)
return startMachine(&cc, &n)
}
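
Note that kicGroup and cacheGroup are now package-level errgroups, so downloads kicked off in Provision can still be awaited later in Start (waitCacheRequiredImages). A small self-contained sketch of that handoff pattern using golang.org/x/sync/errgroup (generic names, not minikube's helpers):

package main

import (
	"fmt"
	"time"

	"golang.org/x/sync/errgroup"
)

// Package-scoped group: the provisioning phase schedules background work,
// and the start phase waits for it before continuing.
var cacheGroup errgroup.Group

func provisionPhase() {
	cacheGroup.Go(func() error {
		time.Sleep(100 * time.Millisecond) // stand-in for downloading a preload tarball
		return nil
	})
	// ... create the host/container here ...
}

func startPhase() error {
	// Wait for the download started during provisioning before configuring the runtime.
	if err := cacheGroup.Wait(); err != nil {
		return fmt.Errorf("caching images: %w", err)
	}
	// ... configure the container runtime, bootstrap Kubernetes ...
	return nil
}

func main() {
	provisionPhase()
	if err := startPhase(); err != nil {
		fmt.Println(err)
	}
}
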
// ConfigureRuntimes does what needs to happen to get a runtime going.
func configureRuntimes(runner cruntime.CommandRunner, drvName string, k8s config.KubernetesConfig, kv semver.Version) cruntime.Manager {
co := cruntime.Config{
@@ -281,15 +298,15 @@ func apiServerURL(h host.Host, cc config.ClusterConfig, n config.Node) (string,
}
// StartMachine starts a VM
func startMachine(cfg *config.ClusterConfig, node *config.Node) (runner command.Runner, preExists bool, machineAPI libmachine.API, host *host.Host) {
func startMachine(cfg *config.ClusterConfig, node *config.Node) (runner command.Runner, preExists bool, machineAPI libmachine.API, host *host.Host, err error) {
m, err := machine.NewAPIClient()
if err != nil {
exit.WithError("Failed to get machine client", err)
return nil, false, nil, nil, errors.Wrap(err, "Failed to get machine client")
}
host, preExists = startHost(m, *cfg, *node)
runner, err = machine.CommandRunner(host)
if err != nil {
exit.WithError("Failed to get command runner", err)
return runner, preExists, m, host, errors.Wrap(err, "Failed to get command runner")
}
ip := validateNetwork(host, runner)
@@ -304,10 +321,10 @@ func startMachine(cfg *config.ClusterConfig, node *config.Node) (runner command.
node.IP = ip
err = config.SaveNode(cfg, node)
if err != nil {
exit.WithError("saving node", err)
return nil, false, nil, nil, errors.Wrap(err, "saving node")
}
return runner, preExists, m, host
return runner, preExists, m, host, err
}
// startHost starts a new minikube host using a VM or None