fix timeout when stopping KVM machine with CRI-O container runtime (#19758)

* fix timeout when stopping KVM machine with CRI-O container runtime

* Update pkg/drivers/kvm/kvm.go

Co-authored-by: Steven Powell <44844360+spowelljr@users.noreply.github.com>

* Update pkg/drivers/kvm/kvm.go

Co-authored-by: Steven Powell <44844360+spowelljr@users.noreply.github.com>

* improve error message

* Update pkg/drivers/kvm/kvm.go

Co-authored-by: Steven Powell <44844360+spowelljr@users.noreply.github.com>

---------

Co-authored-by: Steven Powell <44844360+spowelljr@users.noreply.github.com>
Co-authored-by: Medya Ghazizadeh <medyagh@users.noreply.github.com>
pull/20093/head
Predrag Rogic 2024-12-12 19:14:28 +00:00 committed by GitHub
parent 6867769a4d
commit 6ab1d42c0c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 248 additions and 137 deletions

View File

@ -31,7 +31,7 @@ import (
func (d *Driver) getDomain() (*libvirt.Domain, *libvirt.Connect, error) {
conn, err := getConnection(d.ConnectionURI)
if err != nil {
return nil, nil, errors.Wrap(err, "getting domain")
return nil, nil, errors.Wrap(err, "getting libvirt connection")
}
dom, err := conn.LookupDomainByName(d.MachineName)
@ -71,9 +71,13 @@ func (d *Driver) createDomain() (*libvirt.Domain, error) {
}
conn, err := getConnection(d.ConnectionURI)
if err != nil {
return nil, errors.Wrap(err, "error getting libvirt connection")
return nil, errors.Wrap(err, "getting libvirt connection")
}
defer conn.Close()
defer func() {
if _, err := conn.Close(); err != nil {
log.Errorf("unable to close libvirt connection: %v", err)
}
}()
log.Infof("define libvirt domain using xml: %v", domainXML.String())
// define the domain in libvirt using the generated XML

View File

@ -117,35 +117,19 @@ func NewDriver(hostName, storePath string) *Driver {
}
}
// PreCommandCheck checks the connection before issuing a command
func (d *Driver) PreCommandCheck() error {
conn, err := getConnection(d.ConnectionURI)
if err != nil {
return errors.Wrap(err, "error connecting to libvirt socket. Have you added yourself to the libvirtd group?")
}
defer conn.Close()
libVersion, err := conn.GetLibVersion()
if err != nil {
return errors.Wrap(err, "getting libvirt version")
}
log.Debugf("Using libvirt version %d", libVersion)
return nil
}
// GetURL returns a Docker URL inside this host
// e.g. tcp://1.2.3.4:2376
// more info https://github.com/docker/machine/blob/b170508bf44c3405e079e26d5fdffe35a64c6972/libmachine/provision/utils.go#L159_L175
func (d *Driver) GetURL() (string, error) {
if err := d.PreCommandCheck(); err != nil {
return "", errors.Wrap(err, "getting URL, precheck failed")
return "", errors.Wrap(err, "prechecking")
}
ip, err := d.GetIP()
if err != nil {
return "", errors.Wrap(err, "getting URL, could not get IP")
return "", errors.Wrap(err, "getting domain IP")
}
if ip == "" {
return "", nil
}
@ -153,15 +137,37 @@ func (d *Driver) GetURL() (string, error) {
return fmt.Sprintf("tcp://%s:2376", ip), nil
}
// GetState returns the state that the host is in (running, stopped, etc)
func (d *Driver) GetState() (st state.State, err error) {
dom, conn, err := d.getDomain()
// PreCommandCheck checks the connection before issuing a command
func (d *Driver) PreCommandCheck() error {
conn, err := getConnection(d.ConnectionURI)
if err != nil {
return state.None, errors.Wrap(err, "getting connection")
return errors.Wrap(err, "getting libvirt connection")
}
defer func() {
if ferr := closeDomain(dom, conn); ferr != nil {
err = ferr
if _, err := conn.Close(); err != nil {
log.Errorf("unable to close libvirt connection: %v", err)
}
}()
libVersion, err := conn.GetLibVersion()
if err != nil {
return errors.Wrap(err, "getting libvirt version")
}
log.Debugf("using libvirt version %d", libVersion)
return nil
}
// GetState returns the state that the host is in (running, stopped, etc)
func (d *Driver) GetState() (state.State, error) {
dom, conn, err := d.getDomain()
if err != nil {
return state.None, errors.Wrap(err, "getting domain")
}
defer func() {
if err := closeDomain(dom, conn); err != nil {
log.Errorf("unable to close domain: %v", err)
}
}()
@ -169,8 +175,8 @@ func (d *Driver) GetState() (st state.State, err error) {
if err != nil {
return state.None, errors.Wrap(err, "getting domain state")
}
st = machineState(lvs)
return // st, err
return machineState(lvs), nil
}
// machineState converts libvirt state to libmachine state
@ -210,17 +216,22 @@ func machineState(lvs libvirt.DomainState) state.State {
func (d *Driver) GetIP() (string, error) {
s, err := d.GetState()
if err != nil {
return "", errors.Wrap(err, "machine in unknown state")
return "", errors.Wrap(err, "getting domain state")
}
if s != state.Running {
return "", errors.New("host is not running")
return "", errors.New("domain is not running")
}
conn, err := getConnection(d.ConnectionURI)
if err != nil {
return "", errors.Wrap(err, "getting libvirt connection")
}
defer conn.Close()
defer func() {
if _, err := conn.Close(); err != nil {
log.Errorf("unable to close libvirt connection: %v", err)
}
}()
return ipFromXML(conn, d.MachineName, d.PrivateNetwork)
}
@ -236,57 +247,79 @@ func (d *Driver) DriverName() string {
}
// Kill stops a host forcefully, including any containers that we are managing.
func (d *Driver) Kill() (err error) {
func (d *Driver) Kill() error {
s, err := d.GetState()
if err != nil {
return errors.Wrap(err, "getting domain state")
}
if s == state.Stopped {
return nil
}
log.Info("killing domain...")
dom, conn, err := d.getDomain()
if err != nil {
return errors.Wrap(err, "getting connection")
return errors.Wrap(err, "getting domain")
}
defer func() {
if ferr := closeDomain(dom, conn); ferr != nil {
err = ferr
if err := closeDomain(dom, conn); err != nil {
log.Errorf("unable to close domain: %v", err)
}
}()
// ref: https://libvirt.org/html/libvirt-libvirt-domain.html#virDomainDestroy
// "virDomainDestroy first requests that a guest terminate (e.g. SIGTERM), then waits for it to comply.
// After a reasonable timeout, if the guest still exists, virDomainDestroy will forcefully terminate the guest (e.g. SIGKILL)
// if necessary (which may produce undesirable results, for example unflushed disk cache in the guest).
// To avoid this possibility, it's recommended to instead call virDomainDestroyFlags, sending the VIR_DOMAIN_DESTROY_GRACEFUL flag."
// ref: https://libvirt.org/html/libvirt-libvirt-domain.html#virDomainDestroyFlags
// ref: https://libvirt.org/html/libvirt-libvirt-domain.html#virDomainDestroyFlagsValues
// we're using default virDomainDestroyFlags - ie, VIR_DOMAIN_DESTROY_DEFAULT (0), the "Default behavior - could lead to data loss!!"
return dom.Destroy()
}
// Restart a host
func (d *Driver) Restart() error {
log.Info("restarting domain...")
return pkgdrivers.Restart(d)
}
// Start a host
func (d *Driver) Start() (err error) {
func (d *Driver) Start() error {
log.Info("starting domain...")
// this call ensures that all networks are active
log.Info("Ensuring networks are active...")
err = d.ensureNetwork()
if err != nil {
log.Info("ensuring networks are active...")
if err := d.ensureNetwork(); err != nil {
return errors.Wrap(err, "ensuring active networks")
}
log.Info("Getting domain xml...")
log.Info("getting domain XML...")
dom, conn, err := d.getDomain()
if err != nil {
return errors.Wrap(err, "getting connection")
return errors.Wrap(err, "getting domain XML")
}
defer func() {
if ferr := closeDomain(dom, conn); ferr != nil {
err = ferr
if err := closeDomain(dom, conn); err != nil {
log.Errorf("unable to close domain: %v", err)
}
}()
log.Info("Creating domain...")
log.Info("creating domain...")
if err := dom.Create(); err != nil {
return errors.Wrap(err, "error creating VM")
return errors.Wrap(err, "creating domain")
}
log.Info("Waiting to get IP...")
log.Info("waiting for IP...")
if err := d.waitForStaticIP(conn); err != nil {
return errors.Wrap(err, "IP not available after waiting")
return errors.Wrap(err, "waiting for IP")
}
log.Info("Waiting for SSH to be available...")
log.Info("waiting for SSH...")
if err := drivers.WaitForSSH(d); err != nil {
return errors.Wrap(err, "SSH not available after waiting")
return errors.Wrap(err, "waiting for SSH")
}
return nil
@ -297,44 +330,48 @@ func (d *Driver) waitForStaticIP(conn *libvirt.Connect) error {
query := func() error {
sip, err := ipFromAPI(conn, d.MachineName, d.PrivateNetwork)
if err != nil {
return fmt.Errorf("failed getting IP during machine start, will retry: %w", err)
}
if sip == "" {
return fmt.Errorf("waiting for machine to come up")
return errors.Wrap(err, "getting domain IP, will retry")
}
log.Infof("Found IP for machine: %s", sip)
if sip == "" {
return errors.New("waiting for domain to come up")
}
log.Infof("found domain IP: %s", sip)
d.IPAddress = sip
return nil
}
if err := retry.Local(query, 1*time.Minute); err != nil {
return fmt.Errorf("machine %s didn't return IP after 1 minute", d.MachineName)
return fmt.Errorf("domain %s didn't return IP after 1 minute", d.MachineName)
}
log.Info("Reserving static IP address...")
log.Info("reserving static IP address...")
if err := addStaticIP(conn, d.PrivateNetwork, d.MachineName, d.PrivateMAC, d.IPAddress); err != nil {
log.Warnf("Failed reserving static IP %s for host %s, will continue anyway: %v", d.IPAddress, d.MachineName, err)
log.Warnf("failed reserving static IP address %s for domain %s, will continue anyway: %v", d.IPAddress, d.MachineName, err)
} else {
log.Infof("Reserved static IP address: %s", d.IPAddress)
log.Infof("reserved static IP address %s for domain %s", d.IPAddress, d.MachineName)
}
return nil
}
// Create a host using the driver's config
func (d *Driver) Create() (err error) {
log.Info("Creating KVM machine...")
err = d.createNetwork()
if err != nil {
func (d *Driver) Create() error {
log.Info("creating KVM machine...")
log.Info("creating network...")
if err := d.createNetwork(); err != nil {
return errors.Wrap(err, "creating network")
}
if d.GPU {
log.Info("Creating devices...")
d.DevicesXML, err = getDevicesXML()
log.Info("getting devices XML...")
xml, err := getDevicesXML()
if err != nil {
return errors.Wrap(err, "creating devices")
return errors.Wrap(err, "getting devices XML")
}
d.DevicesXML = xml
}
if d.NUMANodeCount > 1 {
@ -346,22 +383,22 @@ func (d *Driver) Create() (err error) {
}
store := d.ResolveStorePath(".")
log.Infof("Setting up store path in %s ...", store)
log.Infof("setting up store path in %s ...", store)
// 0755 because it must be accessible by libvirt/qemu across a variety of configs
if err := os.MkdirAll(store, 0755); err != nil {
return errors.Wrap(err, "creating store")
}
log.Infof("Building disk image from %s", d.Boot2DockerURL)
if err = pkgdrivers.MakeDiskImage(d.BaseDriver, d.Boot2DockerURL, d.DiskSize); err != nil {
return errors.Wrap(err, "error creating disk")
log.Infof("building disk image from %s", d.Boot2DockerURL)
if err := pkgdrivers.MakeDiskImage(d.BaseDriver, d.Boot2DockerURL, d.DiskSize); err != nil {
return errors.Wrap(err, "creating disk")
}
if d.ExtraDisks > 20 {
// Limiting the number of disks to 20 arbitrarily. If more disks are
// needed, the logical name generation has to changed to create them if
// the form hdaa, hdab, etc
return errors.Wrap(err, "cannot create more than 20 extra disks")
return errors.New("cannot create more than 20 extra disks")
}
for i := 0; i < d.ExtraDisks; i++ {
diskpath := pkgdrivers.ExtraDiskPath(d.BaseDriver, i)
@ -381,22 +418,23 @@ func (d *Driver) Create() (err error) {
log.Errorf("unable to ensure permissions on %s: %v", store, err)
}
log.Info("Creating domain...")
log.Info("creating domain...")
dom, err := d.createDomain()
if err != nil {
return errors.Wrap(err, "creating domain")
}
defer func() {
if ferr := dom.Free(); ferr != nil {
log.Warnf("unable to free domain: %v", err)
err = ferr
if err := dom.Free(); err != nil {
log.Errorf("unable to free domain: %v", err)
}
}()
if err = d.Start(); err != nil {
log.Errorf("unable to start VM: %v", err)
return err
if err := d.Start(); err != nil {
return errors.Wrap(err, "starting domain")
}
log.Infof("KVM machine creation complete!")
log.Infof("KVM machine creation complete")
return nil
}
@ -405,19 +443,22 @@ func ensureDirPermissions(store string) error {
// traverse upwards from /home/user/.minikube/machines to ensure
// that libvirt/qemu has execute access
for dir := store; dir != "/"; dir = filepath.Dir(dir) {
log.Debugf("Checking permissions on dir: %s", dir)
log.Debugf("checking permissions on dir: %s", dir)
s, err := os.Stat(dir)
if err != nil {
return err
}
owner := int(s.Sys().(*syscall.Stat_t).Uid)
if owner != os.Geteuid() {
log.Debugf("Skipping %s - not owner", dir)
log.Debugf("skipping %s - not owner", dir)
continue
}
mode := s.Mode()
if mode&0011 != 1 {
log.Infof("Setting executable bit set on %s (perms=%s)", dir, mode)
log.Infof("setting executable bit set on %s (perms=%s)", dir, mode)
mode |= 0011
if err := os.Chmod(dir, mode); err != nil {
return err
@ -427,100 +468,154 @@ func ensureDirPermissions(store string) error {
return nil
}
// Stop a host gracefully
func (d *Driver) Stop() (err error) {
// Stop a host gracefully or forcefully otherwise.
func (d *Driver) Stop() error {
s, err := d.GetState()
if err != nil {
return errors.Wrap(err, "getting state of VM")
return errors.Wrap(err, "getting domain state")
}
if s != state.Stopped {
dom, conn, err := d.getDomain()
defer func() {
if ferr := closeDomain(dom, conn); ferr != nil {
err = ferr
}
}()
if err != nil {
return errors.Wrap(err, "getting connection")
}
err = dom.Shutdown()
if err != nil {
return errors.Wrap(err, "stopping vm")
}
maxsec := 120
for i := 0; i < maxsec; i++ {
s, err := d.GetState()
if err != nil {
return errors.Wrap(err, "error getting state of VM")
}
if s == state.Stopped {
return nil
}
log.Infof("Waiting for machine to stop %d/%d", i, maxsec)
time.Sleep(time.Second)
}
if s == state.Stopped {
return nil
}
return fmt.Errorf("unable to stop vm, current state %q", s.String())
log.Info("stopping domain...")
dom, conn, err := d.getDomain()
if err != nil {
return errors.Wrap(err, "getting domain")
}
defer func() {
if err := closeDomain(dom, conn); err != nil {
log.Errorf("unable to close domain: %v", err)
}
}()
log.Info("gracefully shutting down domain...")
// ref: https://libvirt.org/html/libvirt-libvirt-domain.html#virDomainShutdownFlags
// note: "The order in which the hypervisor tries each shutdown method is undefined, and a hypervisor is not required to support all methods."
// so we skip "VIR_DOMAIN_SHUTDOWN_DEFAULT" - the "hypervisor choice" and use bitwise-OR of available virDomainShutdownFlagValues
// ref: https://libvirt.org/html/libvirt-libvirt-domain.html#virDomainShutdownFlagValues
// ref: https://bugzilla.redhat.com/show_bug.cgi?id=1744156
// "libvirt's QEMU driver, used to manage KVM guests, only supports the 'agent' and 'acpi' reboot/shutdown modes because those are the ones QEMU itself supports"
// note: we don't install/use agent
if err := dom.ShutdownFlags(libvirt.DOMAIN_SHUTDOWN_ACPI_POWER_BTN); err != nil {
log.Warnf("setting virDomainShutdownFlags failed, will continue anyway with defaults: %v", err)
if err := dom.ShutdownFlags(libvirt.DOMAIN_SHUTDOWN_DEFAULT); err != nil {
log.Errorf("resetting virDomainShutdownFlags to defaults failed, will continue anyway: %v", err)
}
}
if err := dom.Shutdown(); err != nil {
return errors.Wrap(err, "gracefully shutting down domain")
}
if s, err = d.waitForStopState(90, "graceful shutdown"); err == nil {
log.Info("domain gracefully shutdown")
return nil
}
// could not get domain state
if s == state.None {
return err
}
// at this point shutdown failed, so we try with a little bit of force
log.Warn("waiting for domain graceful shutdown failed, will try to force-stop")
if err := d.Kill(); err != nil {
log.Warnf("force-stopping domain request failed: %v", err)
}
if s, err := d.waitForStopState(30, "force-stop"); err != nil {
return fmt.Errorf("unable to stop domain %s, current state is %q", d.MachineName, s.String())
}
log.Info("domain force-stopped")
return nil
}
// waitForStopState waits maxsec for the domain to reach a stopped state.
func (d *Driver) waitForStopState(maxsec int, method string) (state.State, error) {
var s state.State
var err error
for i := 0; i < maxsec; i++ {
if s, err = d.GetState(); err != nil {
return s, errors.Wrap(err, "getting domain state")
}
if s == state.Stopped {
return state.Stopped, nil
}
log.Infof("waiting for domain %s %d/%d", method, i, maxsec)
time.Sleep(1 * time.Second)
}
return s, fmt.Errorf("timed out waiting for domain %s, current state is %q", method, s)
}
// Remove a host
func (d *Driver) Remove() error {
log.Debug("Removing machine...")
log.Info("removing KVM machine...")
conn, err := getConnection(d.ConnectionURI)
if err != nil {
return errors.Wrap(err, "getting connection")
return errors.Wrap(err, "getting libvirt connection")
}
defer conn.Close()
defer func() {
if _, err := conn.Close(); err != nil {
log.Errorf("unable to close libvirt connection: %v", err)
}
}()
// Tear down network if it exists and is not in use by another minikube instance
log.Debug("Trying to delete the networks (if possible)")
log.Info("deleting networks...")
if err := d.deleteNetwork(); err != nil {
log.Warnf("Deleting of networks failed: %v", err)
log.Errorf("deleting networks failed, will continue anyway: %v", err)
} else {
log.Info("Successfully deleted networks")
log.Info("successfully deleted networks")
}
// Tear down the domain now
log.Debug("Checking if the domain needs to be deleted")
log.Info("checking if the domain needs to be deleted")
dom, err := conn.LookupDomainByName(d.MachineName)
if err != nil {
log.Warnf("Domain %s does not exist, nothing to clean up...", d.MachineName)
log.Warnf("domain %s does not exist, nothing to clean up...", d.MachineName)
return nil
}
log.Infof("Domain %s exists, removing...", d.MachineName)
log.Infof("domain %s exists, removing...", d.MachineName)
if err := d.destroyRunningDomain(dom); err != nil {
return errors.Wrap(err, "destroying running domain")
}
if err := d.undefineDomain(conn, dom); err != nil {
return errors.Wrap(err, "undefine domain")
return errors.Wrap(err, "undefining domain")
}
log.Info("Removing static IP address...")
log.Info("removing static IP address...")
if err := delStaticIP(conn, d.PrivateNetwork, "", "", d.IPAddress); err != nil {
log.Warnf("failed removing static IP %s for host %s, will continue anyway: %v", d.IPAddress, d.MachineName, err)
log.Warnf("failed removing static IP address %s for domain %s, will continue anyway: %v", d.IPAddress, d.MachineName, err)
} else {
log.Info("Removed static IP address")
log.Infof("removed static IP address %s for domain %s", d.IPAddress, d.MachineName)
}
log.Infof("KVM machine removal complete")
return nil
}
func (d *Driver) destroyRunningDomain(dom *libvirt.Domain) error {
state, _, err := dom.GetState()
lvs, _, err := dom.GetState()
if err != nil {
return errors.Wrap(err, "getting domain state")
}
// if the domain is not running, we don't destroy it
if state != libvirt.DOMAIN_RUNNING {
log.Warnf("Domain %s already destroyed, skipping...", d.MachineName)
if machineState(lvs) != state.Running {
log.Warnf("domain %s already destroyed, skipping...", d.MachineName)
return nil
}
@ -530,7 +625,7 @@ func (d *Driver) destroyRunningDomain(dom *libvirt.Domain) error {
func (d *Driver) undefineDomain(conn *libvirt.Connect, dom *libvirt.Domain) error {
definedDomains, err := conn.ListDefinedDomains()
if err != nil {
return errors.Wrap(err, "list defined domains")
return errors.Wrap(err, "listing domains")
}
var found bool
@ -542,7 +637,7 @@ func (d *Driver) undefineDomain(conn *libvirt.Connect, dom *libvirt.Domain) erro
}
if !found {
log.Warnf("Domain %s not defined, skipping undefine...", d.MachineName)
log.Warnf("domain %s not defined, skipping undefine...", d.MachineName)
return nil
}

View File

@ -118,7 +118,11 @@ func (d *Driver) ensureNetwork() error {
if err != nil {
return errors.Wrap(err, "getting libvirt connection")
}
defer conn.Close()
defer func() {
if _, err := conn.Close(); err != nil {
log.Errorf("unable to close libvirt connection: %v", err)
}
}()
// network: default
@ -162,7 +166,11 @@ func (d *Driver) createNetwork() error {
if err != nil {
return errors.Wrap(err, "getting libvirt connection")
}
defer conn.Close()
defer func() {
if _, err := conn.Close(); err != nil {
log.Errorf("unable to close libvirt connection: %v", err)
}
}()
// network: default
// It is assumed that the libvirt/kvm installation has already created this network
@ -240,7 +248,11 @@ func (d *Driver) deleteNetwork() error {
if err != nil {
return errors.Wrap(err, "getting libvirt connection")
}
defer conn.Close()
defer func() {
if _, err := conn.Close(); err != nil {
log.Errorf("unable to close libvirt connection: %v", err)
}
}()
// network: default
// It is assumed that the OS manages this network