Clarify best method of set difference

2019-01-11 19:27:53 +00:00 · 2019-01-11 19:27:53 +00:00 · 7ee4f499e1
parent c7d26d8950
commit 7ee4f499e1
4 changed files with 143 additions and 9 deletions
--- a/tsdb/index.go
+++ b/tsdb/index.go
@ -609,7 +609,7 @@ func DifferenceSeriesIDIterators(itr0, itr1 SeriesIDIterator) SeriesIDIterator {
 	if a := NewSeriesIDSetIterators([]SeriesIDIterator{itr0, itr1}); a != nil {
 		itr0.Close()
 		itr1.Close()
-		return NewSeriesIDSetIterator(a[0].SeriesIDSet().AndNot(a[1].SeriesIDSet()))
+		return NewSeriesIDSetIterator(NewSeriesIDSetNegate(a[0].SeriesIDSet(), a[1].SeriesIDSet()))
 	}

 	return &seriesIDDifferenceIterator{itrs: [2]SeriesIDIterator{itr0, itr1}}
--- a/tsdb/series_set.go
+++ b/tsdb/series_set.go
@ -27,6 +27,17 @@ func NewSeriesIDSet(a ...SeriesID) *SeriesIDSet {
 	return ss
 }

+// NewSeriesIDSetNegate returns a new SeriesIDSet containing all the elements in a
+// that are not present in b. That is, the set difference between a and b.
+func NewSeriesIDSetNegate(a, b *SeriesIDSet) *SeriesIDSet {
+	a.RLock()
+	defer a.RUnlock()
+	b.RLock()
+	defer b.RUnlock()
+
+	return &SeriesIDSet{bitmap: roaring.AndNot(a.bitmap, b.bitmap)}
+}
+
 // Bytes estimates the memory footprint of this SeriesIDSet, in bytes.
 func (s *SeriesIDSet) Bytes() int {
 	var b int
@ -170,15 +181,13 @@ func (s *SeriesIDSet) And(other *SeriesIDSet) *SeriesIDSet {
 	return &SeriesIDSet{bitmap: roaring.And(s.bitmap, other.bitmap)}
 }

-// AndNot returns a new SeriesIDSet containing elements that were present in s,
-// but not present in other.
-func (s *SeriesIDSet) AndNot(other *SeriesIDSet) *SeriesIDSet {
+// RemoveSet removes all values in other from s, if they exist.
+func (s *SeriesIDSet) RemoveSet(other *SeriesIDSet) {
 	s.RLock()
 	defer s.RUnlock()
 	other.RLock()
 	defer other.RUnlock()
-
-	return &SeriesIDSet{bitmap: roaring.AndNot(s.bitmap, other.bitmap)}
+	s.bitmap.AndNot(other.bitmap)
 }

 // ForEach calls f for each id in the set. The function is applied to the IDs
--- a/tsdb/series_set_test.go
+++ b/tsdb/series_set_test.go
@ -10,7 +10,7 @@ import (
 	"testing"
 )

-func TestSeriesIDSet_AndNot(t *testing.T) {
+func TestSeriesIDSet_NewSeriesIDSetNegate(t *testing.T) {
 	examples := [][3][]uint64{
 		[3][]uint64{
 			{1, 10, 20, 30},
@ -55,7 +55,7 @@ func TestSeriesIDSet_AndNot(t *testing.T) {
 				expected.Add(NewSeriesID(v))
 			}

-			got := a.AndNot(b)
+			got := NewSeriesIDSetNegate(a, b)
 			if got.String() != expected.String() {
 				t.Fatalf("got %s, expected %s", got.String(), expected.String())
 			}
@ -63,6 +63,59 @@ func TestSeriesIDSet_AndNot(t *testing.T) {
 	}
 }

+func TestSeriesIDSet_RemoveSet(t *testing.T) {
+	examples := [][3][]uint64{
+		[3][]uint64{
+			{1, 10, 20, 30},
+			{10, 12, 13, 14, 20},
+			{1, 30},
+		},
+		[3][]uint64{
+			{},
+			{10},
+			{},
+		},
+		[3][]uint64{
+			{1, 10, 20, 30},
+			{1, 10, 20, 30},
+			{},
+		},
+		[3][]uint64{
+			{1, 10},
+			{1, 10, 100},
+			{},
+		},
+		[3][]uint64{
+			{1, 10},
+			{},
+			{1, 10},
+		},
+	}
+
+	for i, example := range examples {
+		t.Run(fmt.Sprint(i), func(t *testing.T) {
+			// Build sets.
+			a, b := NewSeriesIDSet(), NewSeriesIDSet()
+			for _, v := range example[0] {
+				a.Add(NewSeriesID(v))
+			}
+			for _, v := range example[1] {
+				b.Add(NewSeriesID(v))
+			}
+
+			expected := NewSeriesIDSet()
+			for _, v := range example[2] {
+				expected.Add(NewSeriesID(v))
+			}
+
+			a.RemoveSet(b)
+			if a.String() != expected.String() {
+				t.Fatalf("got %s, expected %s", a.String(), expected.String())
+			}
+		})
+	}
+}
+
 // Ensure that cloning is race-free.
 func TestSeriesIDSet_Clone_Race(t *testing.T) {
 	main := NewSeriesIDSet()
@ -556,6 +609,78 @@ func BenchmarkSeriesIDSet_Remove(b *testing.B) {
 	})
 }

+// BenchmarkSeriesIDSet_MassRemove benchmarks the cost of removing a large set of values.
+func BenchmarkSeriesIDSet_MassRemove(b *testing.B) {
+	var size = uint64(1000000)
+	// Setup...
+	set = NewSeriesIDSet()
+	for i := uint64(0); i < size; i++ {
+		set.Add(NewSeriesID(i))
+	}
+
+	// Remove one at a time
+	b.Run(fmt.Sprint("cardinality_1000000_remove_each"), func(b *testing.B) {
+		clone := set.Clone()
+		for i := 0; i < b.N; i++ {
+			for j := uint64(0); j < size/2; j++ {
+				clone.RemoveNoLock(NewSeriesID(j))
+			}
+
+			b.StopTimer()
+			clone = set.Clone()
+			b.StartTimer()
+		}
+	})
+
+	// This is the case where a target series id set exists.
+	b.Run(fmt.Sprint("cardinality_1000000_remove_set_exists"), func(b *testing.B) {
+		clone := set.Clone()
+		other := NewSeriesIDSet()
+		for j := uint64(0); j < size/2; j++ {
+			other.AddNoLock(NewSeriesID(j))
+		}
+
+		for i := 0; i < b.N; i++ {
+			clone.RemoveSet(other)
+			b.StopTimer()
+			clone = set.Clone()
+			b.StartTimer()
+		}
+	})
+
+	// Make a target series id set and negate it
+	b.Run(fmt.Sprint("cardinality_1000000_remove_set"), func(b *testing.B) {
+		clone := set.Clone()
+		for i := 0; i < b.N; i++ {
+			other := NewSeriesIDSet()
+			for j := uint64(0); j < size/2; j++ {
+				other.AddNoLock(NewSeriesID(j))
+			}
+
+			clone.RemoveSet(other)
+			b.StopTimer()
+			clone = set.Clone()
+			b.StartTimer()
+		}
+	})
+
+	// This is the case where a new result set is created.
+	b.Run(fmt.Sprint("cardinality_1000000_remove_set_new"), func(b *testing.B) {
+		clone := set.Clone()
+		other := NewSeriesIDSet()
+		for j := uint64(0); j < size/2; j++ {
+			other.AddNoLock(NewSeriesID(j))
+		}
+
+		for i := 0; i < b.N; i++ {
+			_ = NewSeriesIDSetNegate(clone, other)
+			b.StopTimer()
+			clone = set.Clone()
+			b.StartTimer()
+		}
+	})
+}
+
 // Typical benchmarks for a laptop:
 //
 // BenchmarkSeriesIDSet_Merge_Duplicates/cardinality_1/shards_1-4         	  200000	      8095 ns/op	   16656 B/op	      11 allocs/op
--- a/tsdb/tsi1/file_set.go
+++ b/tsdb/tsi1/file_set.go
@ -387,7 +387,7 @@ func (fs *FileSet) TagValueSeriesIDIterator(name, key, value []byte) (tsdb.Serie

 		// Remove tombstones set in previous file.
 		if ftss != nil && ftss.Cardinality() > 0 {
-			ss = ss.AndNot(ftss)
+			ss.RemoveSet(ftss)
 		}

 		// Fetch tag value series set for this file and merge into overall set.