package tsi1_test

import (
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"testing"

	"github.com/influxdata/influxdb/tsdb"
	"github.com/influxdata/influxdb/tsdb/tsi1"
)

func TestPartition_Open(t *testing.T) {
	sfile := MustOpenSeriesFile()
	defer sfile.Close()

	// Opening a fresh index should set the MANIFEST version to current version.
	p := NewPartition(sfile.SeriesFile)
	t.Run("open new index", func(t *testing.T) {
		if err := p.Open(); err != nil {
			t.Fatal(err)
		}

		fs, err := p.FileSet()
		if err != nil {
			p.Close()
			t.Fatal(err)
		}
		defer fs.Release()

		// Check version set appropriately.
		if got, exp := p.Manifest(fs).Version, 1; got != exp {
			p.Close()
			t.Fatalf("got index version %d, expected %d", got, exp)
		}
	})
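	// Don't continue if the initial open failed; the following subtests reuse p.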
	if t.Failed() {
		return
	}

	// Reopening an open index should return an error.
	t.Run("reopen open index", func(t *testing.T) {
		err := p.Open()
		if err == nil {
			p.Close()
			t.Fatal("didn't get an error on reopen, but expected one")
		}
		p.Close()
	})
	if t.Failed() {
		return
	}

	// Opening an incompatible index should return an error.
	incompatibleVersions := []int{-1, 0, 2}
	for _, v := range incompatibleVersions {
		t.Run(fmt.Sprintf("incompatible index version: %d", v), func(t *testing.T) {
			p = NewPartition(sfile.SeriesFile)
			// Manually create a MANIFEST file for an incompatible index version.
			mpath := filepath.Join(p.Path(), tsi1.ManifestFileName)
			m := tsi1.NewManifest(mpath)
			m.Levels = nil
			m.Version = v // Set example MANIFEST version.
			if _, err := m.Write(); err != nil {
				t.Fatal(err)
			}

			// Log the MANIFEST file.
			data, err := ioutil.ReadFile(mpath)
			if err != nil {
				panic(err)
			}
			t.Logf("Incompatible MANIFEST: %s", data)

			// Opening this index should return an error because the MANIFEST has an
			// incompatible version.
			err = p.Open()
			if err != tsi1.ErrIncompatibleVersion {
				p.Close()
				t.Fatalf("got error %v, expected %v", err, tsi1.ErrIncompatibleVersion)
			}
		})
		if t.Failed() {
			return
		}
	}
}

func TestPartition_Manifest(t *testing.T) {
	t.Run("current MANIFEST", func(t *testing.T) {
		sfile := MustOpenSeriesFile()
		defer sfile.Close()

		p := MustOpenPartition(sfile.SeriesFile)
		defer p.Close()

		fs, err := p.FileSet()
		if err != nil {
			t.Fatal(err)
		}
		defer fs.Release()

		if got, exp := p.Manifest(fs).Version, tsi1.Version; got != exp {
			t.Fatalf("got MANIFEST version %d, expected %d", got, exp)
		}
	})
}

// Partition is a test wrapper for tsi1.Partition.
type Partition struct {
	*tsi1.Partition
}

// NewPartition returns a new instance of Partition at a temporary path.
func NewPartition(sfile *tsdb.SeriesFile) *Partition {
	return &Partition{Partition: tsi1.NewPartition(sfile, MustTempPartitionDir())}
}

// MustOpenPartition returns a new, open index. Panic on error.
func MustOpenPartition(sfile *tsdb.SeriesFile) *Partition {
	p := NewPartition(sfile)
	if err := p.Open(); err != nil {
		panic(err)
	}
	return p
}

// Close closes and removes the index directory.
func (p *Partition) Close() error {
	defer os.RemoveAll(p.Path())
	return p.Partition.Close()
}

// Reopen closes and opens the index.
func (p *Partition) Reopen() error {
	if err := p.Partition.Close(); err != nil {
		return err
	}

	sfile, path := p.SeriesFile(), p.Path()
	p.Partition = tsi1.NewPartition(sfile, path)
	return p.Open()
}