// influxdb/tsdb/engine/tsm1/wal.go
package tsm1
import (
"encoding/json"
"fmt"
"io"
"log"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
"sync"
"time"
"github.com/influxdb/influxdb/models"
"github.com/influxdb/influxdb/tsdb"
"github.com/golang/snappy"
)
const (
// DefaultSegmentSize of 2MB is the size at which segment files will be rolled over
DefaultSegmentSize = 2 * 1024 * 1024
// WALFileExtension is the file extension we expect for wal segments
WALFileExtension = "wal"
WALFilePrefix = "_"
writeBufLen = 32 << 10 // 32kb
)
// flushType indicates why a flush and compaction are being run so the partition can
// do the appropriate type of compaction
type flushType int
const (
// noFlush indicates that no flush or compaction are necessary at this time
noFlush flushType = iota
// memoryFlush indicates that we should look for the series using the most
// memory to flush out and compact all others
memoryFlush
// idleFlush indicates that we should flush all series in the partition,
// delete all segment files and hold off on opening a new one
idleFlush
// deleteFlush indicates that we're flushing because series need to be removed from the WAL
deleteFlush
// startupFlush indicates that we're flushing because the database is starting up
startupFlush
)
// walEntryType is a byte written to a wal segment file that indicates what the following compressed block contains
type walEntryType byte
const (
pointsEntry walEntryType = 0x01
fieldsEntry walEntryType = 0x02
seriesEntry walEntryType = 0x03
deleteEntry walEntryType = 0x04
)
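// Each entry in a segment file is laid out as a single type byte, a 4-byte
// length (written with u32tob), and then the snappy-compressed payload of that
// length; see writeToLog and readFileToCache.

// Log is the tsm1 write-ahead log. Writes are appended to segment files on
// disk and buffered in an in-memory cache until they are flushed to the Index.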
type Log struct {
path string
flushCheckTimer *time.Timer // check this often to see if a background flush should happen
flushCheckInterval time.Duration
// write variables
writeLock sync.Mutex
currentSegmentID int
currentSegmentFile *os.File
currentSegmentSize int
// cache and flush variables
cacheLock sync.RWMutex
lastWriteTime time.Time
flushRunning bool
cache map[string]Values
cacheDirtySort map[string]bool // this map should be small, only for dirty vals
flushCache map[string]Values // temporary map while flushing
memorySize int
measurementFieldsCache map[string]*tsdb.MeasurementFields
seriesToCreateCache []*tsdb.SeriesCreate
// LogOutput is the writer used by the logger.
LogOutput io.Writer
logger *log.Logger
// FlushColdInterval is the period of time after which a partition will do a
// full flush and compaction if it has been cold for writes.
FlushColdInterval time.Duration
// SegmentSize is the file size at which a segment file will be rotated
SegmentSize int
// FlushMemorySizeThreshold specifies when the log should be forced to be flushed
FlushMemorySizeThreshold int
// MaxMemorySizeThreshold specifies the limit at which writes to the WAL should be rejected
MaxMemorySizeThreshold int
// Index is the index that series data will be flushed to
Index IndexWriter
// LoggingEnabled specifies if detailed logs should be output
LoggingEnabled bool
// SkipCache specifies if the wal should immediately write to the index instead of
// caching data in memory. False by default so we buffer in memory before flushing to index.
SkipCache bool
// SkipDurability specifies if the wal should not write the wal entries to disk.
// False by default which means all writes are durable even when cached before flushing to index.
SkipDurability bool
}
// IndexWriter is an interface for the indexed database the WAL flushes data to
type IndexWriter interface {
Write(valuesByKey map[string]Values, measurementFieldsToSave map[string]*tsdb.MeasurementFields, seriesToCreate []*tsdb.SeriesCreate) error
MarkDeletes(keys []string)
MarkMeasurementDelete(name string)
}
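// NewLog returns a Log for the given directory path with the default segment
// size, flush thresholds, and flush cold interval. A minimal usage sketch
// (the IndexWriter implementation idx is assumed to be supplied by the caller,
// e.g. the engine that owns this WAL):
//
//	l := NewLog("/path/to/wal")
//	l.Index = idx // must be set before Open so recovered entries can be flushed
//	if err := l.Open(); err != nil {
//		// handle error
//	}
//	defer l.Close()
//	err := l.WritePoints(points, nil, nil)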
func NewLog(path string) *Log {
return &Log{
path: path,
// these options should be overridden by any options in the config
LogOutput: os.Stderr,
FlushColdInterval: tsdb.DefaultFlushColdInterval,
SegmentSize: DefaultSegmentSize,
FlushMemorySizeThreshold: tsdb.DefaultFlushMemorySizeThreshold,
MaxMemorySizeThreshold: tsdb.DefaultMaxMemorySizeThreshold,
logger: log.New(os.Stderr, "[tsm1wal] ", log.LstdFlags),
}
}
// Open opens and initializes the Log. It will replay and flush any WAL segments left over from a previous unclean shutdown.
func (l *Log) Open() error {
if l.LoggingEnabled {
l.logger.Printf("tsm1 WAL starting with %d flush memory size threshold and %d max memory size threshold\n", l.FlushMemorySizeThreshold, l.MaxMemorySizeThreshold)
l.logger.Printf("tsm1 WAL writing to %s\n", l.path)
}
if err := os.MkdirAll(l.path, 0777); err != nil {
return err
}
l.cache = make(map[string]Values)
l.cacheDirtySort = make(map[string]bool)
l.measurementFieldsCache = make(map[string]*tsdb.MeasurementFields)
// flush out any WAL entries that are there from before
if err := l.readAndFlushWAL(); err != nil {
return err
}
return nil
}
// Cursor will return a cursor object to Seek and iterate with Next over the WAL cache for the given series and field.
// This should only ever be called by the engine cursor method, which will always give it
// exactly one field.
func (l *Log) Cursor(series string, fields []string, dec *tsdb.FieldCodec, ascending bool) tsdb.Cursor {
l.cacheLock.RLock()
defer l.cacheLock.RUnlock()
if len(fields) != 1 {
panic("wal cursor should only ever be called with 1 field")
}
ck := SeriesFieldKey(series, fields[0])
values := l.cache[ck]
// if we're in the middle of a flush, combine the previous cache
// with this one for the cursor
if l.flushCache != nil {
if fc, ok := l.flushCache[ck]; ok {
c := make([]Value, len(fc), len(fc)+len(values))
copy(c, fc)
c = append(c, values...)
return newWALCursor(Values(c).Deduplicate(), ascending)
}
}
if l.cacheDirtySort[ck] {
values = Values(values).Deduplicate()
}
// build a copy so writes afterwards don't change the result set
a := make([]Value, len(values))
copy(a, values)
return newWALCursor(a, ascending)
}
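// WritePoints adds the points, new fields, and new series to the in-memory
// cache and, unless SkipDurability is set, appends each group as a
// snappy-compressed entry to the current segment file.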
func (l *Log) WritePoints(points []models.Point, fields map[string]*tsdb.MeasurementFields, series []*tsdb.SeriesCreate) error {
// add everything to the cache, or return an error if we've hit our max memory
if addedToCache := l.addToCache(points, fields, series, true); !addedToCache {
return fmt.Errorf("WAL backed up flushing to index, hit max memory")
}
// make the write durable if specified
if !l.SkipDurability {
// write the points
pointStrings := make([]string, len(points))
for i, p := range points {
pointStrings[i] = p.String()
}
data := strings.Join(pointStrings, "\n")
compressed := snappy.Encode(nil, []byte(data))
if err := l.writeToLog(pointsEntry, compressed); err != nil {
return err
}
// write the new fields
if len(fields) > 0 {
data, err := json.Marshal(fields)
if err != nil {
return err
}
compressed = snappy.Encode(compressed, data)
if err := l.writeToLog(fieldsEntry, compressed); err != nil {
return err
}
}
// write the new series
if len(series) > 0 {
data, err := json.Marshal(series)
if err != nil {
return err
}
compressed = snappy.Encode(compressed, data)
if err := l.writeToLog(seriesEntry, compressed); err != nil {
return err
}
}
}
// usually skipping the cache is only for testing purposes and this was the easiest
// way to represent the logic (to cache and then immediately flush)
if l.SkipCache {
l.flush(idleFlush)
}
return nil
}
// addToCache will add the points, fields, and series to the cache and return true if successful. They will be queryable
// immediately after return and will be flushed at the next flush cycle. Before adding to the cache we check if we're over the
// max memory threshold. If we are we request a flush in a new goroutine and return false, indicating we didn't add the values
// to the cache and that writes should return a failure.
func (l *Log) addToCache(points []models.Point, fields map[string]*tsdb.MeasurementFields, series []*tsdb.SeriesCreate, checkMemory bool) bool {
l.cacheLock.Lock()
defer l.cacheLock.Unlock()
// if we should check memory and we're over the threshold, mark a flush as running and kick one off in a goroutine
if checkMemory && l.memorySize > l.FlushMemorySizeThreshold {
if !l.flushRunning {
l.flushRunning = true
go l.flush(memoryFlush)
}
if l.memorySize > l.MaxMemorySizeThreshold {
return false
}
}
for _, p := range points {
for name, value := range p.Fields() {
k := SeriesFieldKey(string(p.Key()), name)
v := NewValue(p.Time(), value)
cacheValues := l.cache[k]
// only mark it as dirty if it isn't already
if _, ok := l.cacheDirtySort[k]; !ok && len(cacheValues) > 0 {
dirty := cacheValues[len(cacheValues)-1].Time().UnixNano() >= v.Time().UnixNano()
if dirty {
l.cacheDirtySort[k] = true
}
}
l.memorySize += v.Size()
l.cache[k] = append(cacheValues, v)
}
}
for k, v := range fields {
l.measurementFieldsCache[k] = v
}
l.seriesToCreateCache = append(l.seriesToCreateCache, series...)
l.lastWriteTime = time.Now()
return true
}
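// LastWriteTime returns the time of the most recent write into the cache.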
func (l *Log) LastWriteTime() time.Time {
l.cacheLock.RLock()
defer l.cacheLock.RUnlock()
return l.lastWriteTime
}
// readAndFlushWAL is called on open and will read the segment files in, flushing whenever
// the memory gets over the limit. Once all files have been read it will flush and remove the files
func (l *Log) readAndFlushWAL() error {
files, err := l.segmentFileNames()
if err != nil {
return err
}
// read all the segment files and cache them, flushing along the way if we
// hit memory limits
for _, fn := range files {
if err := l.readFileToCache(fn); err != nil {
return err
}
if l.memorySize > l.MaxMemorySizeThreshold {
if err := l.flush(memoryFlush); err != nil {
return err
}
}
}
// now flush and remove all the old files
if err := l.flush(startupFlush); err != nil {
return err
}
return nil
}
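// readFileToCache replays a single segment file: each entry is read and
// decompressed, then either added back to the cache or, for delete entries,
// applied to the index and the cache.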
func (l *Log) readFileToCache(fileName string) error {
f, err := os.OpenFile(fileName, os.O_RDONLY, 0666)
if err != nil {
return err
}
defer f.Close()
buf := make([]byte, writeBufLen)
data := make([]byte, writeBufLen)
for {
// read the type and the length of the entry
_, err := io.ReadFull(f, buf[0:5])
if err == io.EOF {
return nil
} else if err != nil {
l.logger.Printf("error reading segment file %s: %s", fileName, err.Error())
return err
}
entryType := buf[0]
length := btou32(buf[1:5])
// read the compressed block and decompress it
if int(length) > len(buf) {
buf = make([]byte, length)
}
_, err = io.ReadFull(f, buf[0:length])
if err == io.EOF || err == io.ErrUnexpectedEOF {
l.logger.Printf("hit end of file while reading compressed wal entry from %s", fileName)
return nil
} else if err != nil {
return err
}
data, err = snappy.Decode(data, buf[0:length])
if err != nil {
l.logger.Printf("error decoding compressed entry from %s: %s", fileName, err.Error())
return nil
}
// and marshal it and send it to the cache
switch walEntryType(entryType) {
case pointsEntry:
points, err := models.ParsePoints(data)
if err != nil {
return err
}
l.addToCache(points, nil, nil, false)
case fieldsEntry:
fields := make(map[string]*tsdb.MeasurementFields)
if err := json.Unmarshal(data, &fields); err != nil {
return err
}
l.addToCache(nil, fields, nil, false)
case seriesEntry:
series := make([]*tsdb.SeriesCreate, 0)
if err := json.Unmarshal(data, &series); err != nil {
return err
}
l.addToCache(nil, nil, series, false)
case deleteEntry:
d := &deleteData{}
if err := json.Unmarshal(data, &d); err != nil {
return err
}
l.Index.MarkDeletes(d.Keys)
l.Index.MarkMeasurementDelete(d.MeasurementName)
l.deleteKeysFromCache(d.Keys)
if d.MeasurementName != "" {
l.deleteMeasurementFromCache(d.MeasurementName)
}
}
}
}
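// writeToLog appends a single entry (type byte, length, compressed data) to
// the current segment file, rotating to a new segment when the current one is
// missing or full, and syncs the file before returning.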
func (l *Log) writeToLog(writeType walEntryType, data []byte) error {
l.writeLock.Lock()
defer l.writeLock.Unlock()
if l.currentSegmentFile == nil || l.currentSegmentSize > l.SegmentSize {
if err := l.newSegmentFile(); err != nil {
// fail hard since we can't write data
panic(fmt.Sprintf("error opening new segment file for wal: %s", err.Error()))
}
}
// The panics here are an intentional choice. Based on reports from users
// it's better to fail hard if the database can't take writes. Then they'll
// get alerted and fix whatever is broken. Remove these and face Paul's wrath.
if _, err := l.currentSegmentFile.Write([]byte{byte(writeType)}); err != nil {
panic(fmt.Sprintf("error writing type to wal: %s", err.Error()))
}
if _, err := l.currentSegmentFile.Write(u32tob(uint32(len(data)))); err != nil {
panic(fmt.Sprintf("error writing len to wal: %s", err.Error()))
}
if _, err := l.currentSegmentFile.Write(data); err != nil {
panic(fmt.Sprintf("error writing data to wal: %s", err.Error()))
}
l.currentSegmentSize += 5 + len(data)
return l.currentSegmentFile.Sync()
}
// Flush will force a flush of the WAL to the index
func (l *Log) Flush() error {
return l.flush(idleFlush)
}
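// DeleteMeasurement writes a delete entry for the measurement and its series
// keys to the WAL and removes them from the in-memory caches.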
func (l *Log) DeleteMeasurement(measurement string, keys []string) error {
d := &deleteData{MeasurementName: measurement, Keys: keys}
err := l.writeDeleteEntry(d)
if err != nil {
return err
}
l.deleteKeysFromCache(keys)
l.deleteMeasurementFromCache(measurement)
return nil
}
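// deleteMeasurementFromCache removes the measurement's fields from the
// measurement fields cache.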
func (l *Log) deleteMeasurementFromCache(name string) {
l.cacheLock.Lock()
defer l.cacheLock.Unlock()
delete(l.measurementFieldsCache, name)
}
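// writeDeleteEntry marshals the delete data to JSON, compresses it, and
// appends it to the WAL as a deleteEntry.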
func (l *Log) writeDeleteEntry(d *deleteData) error {
js, err := json.Marshal(d)
if err != nil {
return err
}
data := snappy.Encode(nil, js)
return l.writeToLog(deleteEntry, data)
}
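// DeleteSeries removes the given keys from the cache and writes a delete
// entry for them to the WAL.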
func (l *Log) DeleteSeries(keys []string) error {
l.deleteKeysFromCache(keys)
return l.writeDeleteEntry(&deleteData{Keys: keys})
}
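// deleteKeysFromCache removes the given series/field keys from the cache and
// drops any pending series-create entries for those series.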
func (l *Log) deleteKeysFromCache(keys []string) {
seriesKeys := make(map[string]bool)
for _, k := range keys {
series, _ := seriesAndFieldFromCompositeKey(k)
seriesKeys[series] = true
}
l.cacheLock.Lock()
defer l.cacheLock.Unlock()
for _, k := range keys {
delete(l.cache, k)
}
// now remove any of these that are marked for creation
var seriesCreate []*tsdb.SeriesCreate
for _, sc := range l.seriesToCreateCache {
if _, ok := seriesKeys[sc.Series.Key]; !ok {
seriesCreate = append(seriesCreate, sc)
}
}
l.seriesToCreateCache = seriesCreate
}
// Close will finish any flush that is currently in progress and close file handles
func (l *Log) Close() error {
// take the locks in the same order as flush (cache then write) to avoid deadlocking with an in-progress flush
l.cacheLock.Lock()
l.writeLock.Lock()
defer l.writeLock.Unlock()
defer l.cacheLock.Unlock()
l.cache = nil
l.measurementFieldsCache = nil
l.seriesToCreateCache = nil
if l.currentSegmentFile == nil {
return nil
}
if err := l.currentSegmentFile.Close(); err != nil {
return err
}
l.currentSegmentFile = nil
return nil
}
// close discards the cache and closes the current segment file handle
func (l *Log) close() error {
l.cache = nil
l.cacheDirtySort = nil
if l.currentSegmentFile == nil {
return nil
}
if err := l.currentSegmentFile.Close(); err != nil {
return err
}
l.currentSegmentFile = nil
return nil
}
// flush writes all wal data in memory to the index and then removes the flushed segment files
func (l *Log) flush(flush flushType) error {
// only flush if there isn't one already running. Memory flushes are only triggered
// by writes, which will mark the flush as running, so we can ignore it.
l.cacheLock.Lock()
if l.flushRunning && flush != memoryFlush {
l.cacheLock.Unlock()
return nil
}
// mark the flush as running and ensure that it gets marked as not running when we return
l.flushRunning = true
defer func() {
l.cacheLock.Lock()
l.flushRunning = false
l.cacheLock.Unlock()
}()
// only hold the lock while we rotate the segment file
l.writeLock.Lock()
lastFileID := l.currentSegmentID
// if it's an idle flush, don't open a new segment file
if flush == idleFlush {
if l.currentSegmentFile != nil {
if err := l.currentSegmentFile.Close(); err != nil {
// release the locks so a failed close doesn't leave the WAL (and the deferred flushRunning reset) blocked
l.writeLock.Unlock()
l.cacheLock.Unlock()
return err
}
l.currentSegmentFile = nil
l.currentSegmentSize = 0
}
} else {
if err := l.newSegmentFile(); err != nil {
// there's no recovering from this, fail hard
panic(fmt.Sprintf("error creating new wal file: %s", err.Error()))
}
}
l.writeLock.Unlock()
// copy the cache items to new maps so we can empty them out
l.flushCache = make(map[string]Values)
valueCount := 0
for key, v := range l.cache {
l.flushCache[key] = v
valueCount += len(v)
}
l.cache = make(map[string]Values)
for k := range l.cacheDirtySort {
l.flushCache[k] = l.flushCache[k].Deduplicate()
}
l.cacheDirtySort = make(map[string]bool)
flushSize := l.memorySize
// reset the memory being used by the cache
l.memorySize = 0
// reset the measurements for flushing
mfc := l.measurementFieldsCache
l.measurementFieldsCache = make(map[string]*tsdb.MeasurementFields)
// reset the series for flushing
scc := l.seriesToCreateCache
l.seriesToCreateCache = nil
l.cacheLock.Unlock()
// exit if there's nothing to flush to the index
if len(l.flushCache) == 0 && len(mfc) == 0 && len(scc) == 0 && flush != startupFlush {
return nil
}
if l.LoggingEnabled {
ftype := "idle"
if flush == memoryFlush {
ftype = "memory"
} else if flush == startupFlush {
ftype = "startup"
}
l.logger.Printf("%s flush of %s with %d keys and %d total values of %d bytes\n", ftype, l.path, len(l.flushCache), valueCount, flushSize)
}
startTime := time.Now()
if err := l.Index.Write(l.flushCache, mfc, scc); err != nil {
return err
}
if l.LoggingEnabled {
l.logger.Printf("%s flush to index took %s\n", l.path, time.Since(startTime))
}
l.cacheLock.Lock()
l.flushCache = nil
l.cacheLock.Unlock()
// remove all the old segment files
fileNames, err := l.segmentFileNames()
if err != nil {
return err
}
for _, fn := range fileNames {
id, err := idFromFileName(fn)
if err != nil {
return err
}
if id <= lastFileID {
err := os.Remove(fn)
if err != nil {
return err
}
}
}
return nil
}
// segmentFileNames will return all files that are WAL segment files in sorted order by ascending ID
func (l *Log) segmentFileNames() ([]string, error) {
names, err := filepath.Glob(filepath.Join(l.path, fmt.Sprintf("%s*.%s", WALFilePrefix, WALFileExtension)))
if err != nil {
return nil, err
}
sort.Strings(names)
return names, nil
}
// newSegmentFile will close the current segment file and open a new one, updating bookkeeping info on the log
func (l *Log) newSegmentFile() error {
l.currentSegmentID++
if l.currentSegmentFile != nil {
if err := l.currentSegmentFile.Close(); err != nil {
return err
}
}
fileName := filepath.Join(l.path, fmt.Sprintf("%s%05d.%s", WALFilePrefix, l.currentSegmentID, WALFileExtension))
ff, err := os.OpenFile(fileName, os.O_CREATE|os.O_RDWR, 0666)
if err != nil {
return err
}
l.currentSegmentSize = 0
l.currentSegmentFile = ff
return nil
}
// shouldFlush will return the flushType specifying whether we should flush. memoryFlush
// is never returned from this function since those can only be triggered by writes
func (l *Log) shouldFlush() flushType {
l.cacheLock.RLock()
defer l.cacheLock.RUnlock()
if l.flushRunning {
return noFlush
}
if len(l.cache) == 0 {
return noFlush
}
if time.Since(l.lastWriteTime) > l.FlushColdInterval {
return idleFlush
}
return noFlush
}
// walCursor is a unidirectional iterator over the cached values for a single series/field key
type walCursor struct {
cache Values
position int
ascending bool
}
func newWALCursor(cache Values, ascending bool) *walCursor {
// position is set such that a call to Next will successfully advance
// to the next position and return the value.
c := &walCursor{cache: cache, ascending: ascending, position: -1}
if !ascending {
c.position = len(c.cache)
}
return c
}
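// Ascending returns whether the cursor iterates forward in time.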
func (c *walCursor) Ascending() bool { return c.ascending }
// SeekTo will point the cursor to the given time (or the nearest time in the cache) and return that value
func (c *walCursor) SeekTo(seek int64) (int64, interface{}) {
// Seek cache index
c.position = sort.Search(len(c.cache), func(i int) bool {
return c.cache[i].Time().UnixNano() >= seek
})
// If we're descending and the seek time is after everything in the cache, use the last value
if !c.ascending && c.position >= len(c.cache) {
c.position = len(c.cache) - 1
}
// Make sure our position points to something in the cache
if c.position < 0 || c.position >= len(c.cache) {
return tsdb.EOF, nil
}
v := c.cache[c.position]
return v.Time().UnixNano(), v.Value()
}
// Next moves the cursor to the next key/value and returns it. It will return nil once the end of the cache is reached.
func (c *walCursor) Next() (int64, interface{}) {
var v Value
if c.ascending {
v = c.nextForward()
} else {
v = c.nextReverse()
}
return v.Time().UnixNano(), v.Value()
}
// nextForward advances the cursor forward returning the next value
func (c *walCursor) nextForward() Value {
c.position++
if c.position >= len(c.cache) {
return &EmptyValue{}
}
return c.cache[c.position]
}
// nextReverse advances the cursor backwards returning the next value
func (c *walCursor) nextReverse() Value {
c.position--
if c.position < 0 {
return &EmptyValue{}
}
return c.cache[c.position]
}
// deleteData holds the information for a delete entry
type deleteData struct {
// MeasurementName will be empty for deletes that are only against series
MeasurementName string
Keys []string
}
// idFromFileName parses the numeric segment ID from a file name, e.g. "_00001.wal" yields 1
func idFromFileName(name string) (int, error) {
parts := strings.Split(filepath.Base(name), ".")
if len(parts) != 2 {
return 0, fmt.Errorf("file %s has wrong name format to have an id", name)
}
id, err := strconv.ParseUint(parts[0][1:], 10, 32)
return int(id), err
}