package raft

import (
	"bytes"
	"encoding/binary"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"math/rand"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"runtime"
	"runtime/debug"
	"sort"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"
)

// FSM represents the state machine that the log is applied to.
// The FSM must maintain the highest index that it has seen.
type FSM interface {
	// These implement the snapshot and restore.
	io.WriterTo
	io.ReaderFrom

	// Executes a log entry against the state machine.
	// Non-repeatable errors such as system and disk errors must panic.
	Apply(*LogEntry) error

	// Returns the applied index saved to the state machine.
	Index() uint64
}
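
// A minimal in-memory FSM sketch (illustrative only, not part of this
// package): it tracks the last applied index and snapshots it with
// encoding/binary. Real implementations persist their own application
// state in WriteTo and restore it in ReadFrom.
//
//	type kvFSM struct {
//		mu    sync.Mutex
//		index uint64
//		data  map[string]string // hypothetical application state
//	}
//
//	func (f *kvFSM) Apply(e *raft.LogEntry) error {
//		f.mu.Lock()
//		defer f.mu.Unlock()
//		// ... decode e.Data and mutate f.data ...
//		f.index = e.Index
//		return nil
//	}
//
//	func (f *kvFSM) Index() uint64 {
//		f.mu.Lock()
//		defer f.mu.Unlock()
//		return f.index
//	}
//
//	func (f *kvFSM) WriteTo(w io.Writer) (int64, error) {
//		f.mu.Lock()
//		defer f.mu.Unlock()
//		var b [8]byte
//		binary.BigEndian.PutUint64(b[:], f.index)
//		n, err := w.Write(b[:]) // snapshot contains only the index, for brevity
//		return int64(n), err
//	}
//
//	func (f *kvFSM) ReadFrom(r io.Reader) (int64, error) {
//		f.mu.Lock()
//		defer f.mu.Unlock()
//		var b [8]byte
//		n, err := io.ReadFull(r, b[:])
//		f.index = binary.BigEndian.Uint64(b[:])
//		return int64(n), err
//	}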

const logEntryHeaderSize = 8 + 8 + 8 // sz+index+term

// WaitInterval represents the amount of time between checks to the applied index.
// This is used by clients wanting to wait until a given index is processed.
const WaitInterval = 1 * time.Millisecond

// State represents whether the log is a follower, candidate, or leader.
type State int

// String returns the string representation of the state.
func (s State) String() string {
	switch s {
	case Stopped:
		return "stopped"
	case Follower:
		return "follower"
	case Candidate:
		return "candidate"
	case Leader:
		return "leader"
	}
	return "unknown"
}

const (
	Stopped State = iota
	Follower
	Candidate
	Leader
)

const (
	// DefaultLogEntryCacheSize is the default number of entries to keep before trimming.
	DefaultLogEntryCacheSize = 1000
)

// Log represents a replicated log of commands based on the Raft protocol.
//
// The log can exist in one of four states that transition based on the following rules:
//
//                  ┌───────────┐
//               ┌─▶│  Stopped  │
//               │  └───────────┘
//               │        │
//               │        ▼
//               │  ┌───────────┐
//               ├──│ Follower  │◀─┐
//               │  └───────────┘  │
//       close   │        │        │
//       log     │        ▼        │
//               │  ┌───────────┐  │
//               ├──│ Candidate │──┤ higher
//               │  └───────────┘  │ term
//               │        │        │
//               │        ▼        │
//               │  ┌───────────┐  │
//               └──│  Leader   │──┘
//                  └───────────┘
//
// - Stopped moves to Follower when initialized or joined.
// - Follower moves to Candidate after election timeout.
// - Candidate moves to Leader after a quorum of votes.
// - Leader or Candidate moves to Follower if a higher term is seen.
// - Any state moves to Stopped if the log is closed.
type Log struct {
	mu sync.Mutex

	// The directory where the id, term and config are written to.
	path string

	// The log identifier. This is set when the log initializes
	// or when the log joins another cluster.
	id uint64

	// Config stores all nodes in the cluster.
	config *Config

	// The ID of the current leader.
	leaderID uint64

	// Current state of the log.
	// The transitioning channel is closed whenever the state changes.
	state         State
	transitioning chan struct{}

	// An atomic flag stating if a snapshot is currently being loaded.
	snapshotting uint32

	// In-memory log entries.
	// Followers replicate these entries from the Leader.
	// The Leader appends to the end of these entries.
	// Truncated and trimmed as needed.
	entries []*LogEntry

	// Highest term & index in the log.
	// These are initially read from the id/term files but otherwise
	// should always match the index/term of the last 'entries' element.
	lastLogTerm  uint64
	lastLogIndex uint64

	// Highest entry to be committed.
	// An entry can be committed once a quorum of nodes have received the entry.
	// Because streaming raft asynchronously replicates entries, the lastLogIndex
	// may be lower than the commitIndex. The commitIndex is always higher than
	// or equal to the FSM.Index().
	commitIndex uint64

	// The current term the log is in. This increases when the log starts a
	// new election term or when the log sees a higher election term.
	term uint64

	// The node this log voted for in the current term.
	votedFor uint64

	// Incoming stream from the leader.
	// This is disconnected when the leader is deposed or the log changes state.
	reader io.ReadCloser

	// Outgoing streams to the followers to replicate the log.
	// These are closed when the leader is deposed.
	writers []*logWriter // outgoing streams to followers

	// Incoming heartbeats and term changes go to these channels
	// and are picked up by the current state.
	heartbeats chan heartbeat
	terms      chan struct{}

	// Close notification and wait.
	wg      sync.WaitGroup
	closing chan struct{}

	// Network address used to reach the log.
	url url.URL

	// The state machine that log entries will be applied to.
	FSM FSM

	// LogEntryCacheSize is the minimum number of log entries to keep before
	// trimming the log. These entries are kept in case a node disconnects
	// momentarily. Otherwise a reconnecting node would have to resnapshot.
	LogEntryCacheSize int

	// The transport used to communicate with other nodes in the cluster.
	Transport interface {
		Join(u url.URL, nodeURL url.URL) (id uint64, leaderID uint64, config *Config, err error)
		Leave(u url.URL, id uint64) error
		Heartbeat(u url.URL, term, commitIndex, leaderID uint64) (lastIndex uint64, err error)
		ReadFrom(u url.URL, id, term, index uint64) (io.ReadCloser, error)
		RequestVote(u url.URL, term, candidateID, lastLogIndex, lastLogTerm uint64) (peerTerm uint64, err error)
	}

	// Clock is an abstraction of time.
	Clock interface {
		Now() time.Time
		AfterApplyInterval() <-chan chan struct{}
		AfterElectionTimeout() <-chan chan struct{}
		AfterHeartbeatInterval() <-chan chan struct{}
		AfterReconnectTimeout() <-chan chan struct{}
	}

	// Rand returns a random number.
	Rand func() int64

	// Sets whether trace messages are logged.
	DebugEnabled bool

	// This logs some asynchronous errors that occur within the log.
	Logger *log.Logger
}

// NewLog creates a new instance of Log with reasonable defaults.
func NewLog() *Log {
	l := &Log{
		Clock:      NewClock(),
		Transport:  &HTTPTransport{},
		Rand:       rand.NewSource(time.Now().UnixNano()).Int63,
		heartbeats: make(chan heartbeat, 10),
		terms:      make(chan struct{}, 1),
		Logger:     log.New(os.Stderr, "[raft] ", log.LstdFlags),

		LogEntryCacheSize: DefaultLogEntryCacheSize,
	}
	l.Logger.SetPrefix("[raft] ")
	return l
}
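
// A rough bootstrap sketch (assumes a caller outside this package and an FSM
// implementation such as the kvFSM sketch above; URLs and paths are placeholders):
//
//	l := raft.NewLog()
//	l.FSM = &kvFSM{data: make(map[string]string)}
//	l.SetURL(url.URL{Scheme: "http", Host: "localhost:20000"})
//	if err := l.Open("/tmp/node0"); err != nil {
//		log.Fatal(err)
//	}
//	defer l.Close()
//
//	// First node of a new cluster: initialize and become leader.
//	if err := l.Initialize(); err != nil {
//		log.Fatal(err)
//	}
//
//	// Apply a command and wait for it to reach the state machine.
//	index, err := l.Apply([]byte(`{"op":"set","key":"k","value":"v"}`))
//	if err != nil {
//		log.Fatal(err)
//	}
//	if err := l.Wait(index); err != nil {
//		log.Fatal(err)
//	}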

func (l *Log) lock()   { l.mu.Lock() }
func (l *Log) unlock() { l.mu.Unlock() }

func (l *Log) printCaller(label string) {
	_, file, line, _ := runtime.Caller(2)
	l.Logger.Printf("%s: %s:%d", label, filepath.Base(file), line)
}

// Path returns the data path of the Raft log.
// Returns an empty string if the log is closed.
func (l *Log) Path() string {
	l.lock()
	defer l.unlock()
	return l.path
}

// URL returns the URL for the log.
func (l *Log) URL() url.URL {
	l.lock()
	defer l.unlock()
	return l.url
}

// SetURL sets the URL for the log. This must be set before opening.
func (l *Log) SetURL(u url.URL) {
	l.lock()
	defer l.unlock()
	assert(!l.opened(), "url cannot be set while log is open")
	l.url = u
}

// URLs returns a list of all URLs in the cluster.
func (l *Log) URLs() []url.URL {
	l.lock()
	defer l.unlock()

	if l.config == nil {
		return nil
	}

	var a []url.URL
	for _, n := range l.config.Nodes {
		a = append(a, n.URL)
	}

	return a
}

func (l *Log) idPath() string     { return filepath.Join(l.path, "id") }
func (l *Log) termPath() string   { return filepath.Join(l.path, "term") }
func (l *Log) configPath() string { return filepath.Join(l.path, "config") }

// Opened returns true if the log is currently open.
func (l *Log) Opened() bool {
	l.lock()
	defer l.unlock()
	return l.opened()
}

func (l *Log) opened() bool { return l.path != "" }

// ID returns the log's identifier.
func (l *Log) ID() uint64 {
	l.lock()
	defer l.unlock()
	return l.id
}

// State returns the current state.
func (l *Log) State() State {
	l.lock()
	defer l.unlock()
	return l.state
}

// isSnapshotting returns true if the log is currently restoring from snapshot.
func (l *Log) isSnapshotting() bool { return atomic.LoadUint32(&l.snapshotting) != 0 }

// LastLogIndexTerm returns the last index & term from the log.
func (l *Log) LastLogIndexTerm() (index, term uint64) {
	l.lock()
	defer l.unlock()
	return l.lastLogIndex, l.lastLogTerm
}

// CommitIndex returns the highest committed index.
func (l *Log) CommitIndex() uint64 {
	l.lock()
	defer l.unlock()
	return l.commitIndex
}

// Term returns the current term.
func (l *Log) Term() uint64 {
	l.lock()
	defer l.unlock()
	return l.term
}

// Config returns the log's current configuration.
func (l *Log) Config() *Config {
	l.lock()
	defer l.unlock()
	if l.config != nil {
		return l.config.Clone()
	}
	return nil
}

// Open initializes the log from a path.
// If the path does not exist then it is created.
func (l *Log) Open(path string) error {
	var closing chan struct{}
	var config *Config
	if err := func() error {
		l.lock()
		defer l.unlock()

		// Validate initial log state.
		if l.opened() {
			return ErrOpen
		}

		// Create directory, if not exists.
		if err := os.MkdirAll(path, 0755); err != nil {
			return err
		}
		l.path = path

		// Initialize log identifier.
		id, err := l.readID()
		if err != nil {
			return fmt.Errorf("read id: %s", err)
		}
		l.setID(id)

		// Initialize log term.
		term, err := l.readTerm()
		if err != nil {
			return fmt.Errorf("read term: %s", err)
		}
		l.term = term
		l.votedFor = 0
		l.lastLogTerm = term

		// Read config.
		c, err := l.readConfig()
		if err != nil {
			return fmt.Errorf("read config: %s", err)
		}
		l.config = c

		// Determine last applied index from FSM.
		index := l.FSM.Index()
		l.tracef("Open: fsm: index=%d", index)
		l.lastLogIndex = index
		l.commitIndex = index

		// Start goroutine to apply logs.
		l.wg.Add(1)
		l.closing = make(chan struct{})
		go l.applier(l.closing)

		if l.config != nil {
			l.Logger.Printf("log open: created at %s, with ID %d, term %d, last applied index of %d", path, l.id, l.term, l.lastLogIndex)
		}

		// Retrieve variables to use while starting state loop.
		config = l.config
		closing = l.closing

		return nil
	}(); err == ErrOpen {
		return err
	} else if err != nil {
		_ = l.close()
		return err
	}

	// If a log exists then start the state loop.
	if config != nil {
		// If the config only has one node then start it as the leader.
		// Otherwise start as a follower.
		if len(config.Nodes) == 1 && config.Nodes[0].ID == l.ID() {
			l.Logger.Println("log open: promoting to leader immediately")
			l.startStateLoop(closing, Leader)
		} else {
			l.startStateLoop(closing, Follower)
		}
	} else {
		l.Logger.Printf("log pending: waiting for initialization or join")
	}

	return nil
}

// Close closes the log.
func (l *Log) Close() error {
	l.lock()
	defer l.unlock()
	return l.close()
}

// close should be called under lock.
func (l *Log) close() error {
	l.tracef("closing...")

	// Remove the reader.
	_ = l.setReader(nil)

	// Notify goroutines of closing and wait outside of lock.
	if l.closing != nil {
		close(l.closing)
		l.closing = nil
		l.unlock()
		l.wg.Wait()
		l.lock()
	}

	// Close the writers.
	for _, w := range l.writers {
		_ = w.Close()
	}
	l.writers = nil

	// Clear log info.
	l.setID(0)
	l.path = ""
	l.lastLogIndex, l.lastLogTerm = 0, 0
	l.term, l.votedFor = 0, 0
	l.config = nil

	l.tracef("closed")

	return nil
}

func (l *Log) setReaderWithLock(r io.ReadCloser) error {
	l.lock()
	defer l.unlock()
	return l.setReader(r)
}

func (l *Log) setReader(r io.ReadCloser) error {
	if l.reader != nil {
		_ = l.reader.Close()
		l.reader = nil
	}

	// Ignore if there is no new reader.
	if r == nil {
		return nil
	}

	// Close reader immediately and ignore if log is closed.
	if !l.opened() {
		_ = r.Close()
		return ErrClosed
	}

	// Ignore if setting while transitioning state.
	select {
	case <-l.transitioning:
		return errTransitioning
	default:
	}

	// Set new reader.
	l.reader = r
	return nil
}

func (l *Log) setID(id uint64) { l.id = id }

// readID reads the log identifier from file.
func (l *Log) readID() (uint64, error) {
	// Read identifier from disk.
	b, err := ioutil.ReadFile(l.idPath())
	if os.IsNotExist(err) {
		return 0, nil
	} else if err != nil {
		return 0, err
	}

	// Parse identifier.
	id, err := strconv.ParseUint(string(b), 10, 64)
	if err != nil {
		return 0, err
	}

	return id, nil
}

// writeID writes the log identifier to file.
func (l *Log) writeID(id uint64) error {
	b := []byte(strconv.FormatUint(id, 10))
	return ioutil.WriteFile(l.idPath(), b, 0666)
}

// readTerm reads the log term from file.
func (l *Log) readTerm() (uint64, error) {
	// Read term from disk.
	b, err := ioutil.ReadFile(l.termPath())
	if os.IsNotExist(err) {
		return 0, nil
	} else if err != nil {
		return 0, err
	}

	// Parse term.
	id, err := strconv.ParseUint(string(b), 10, 64)
	if err != nil {
		return 0, err
	}

	return id, nil
}

// writeTerm writes the current log term to file.
func (l *Log) writeTerm(term uint64) error {
	b := []byte(strconv.FormatUint(term, 10))
	return ioutil.WriteFile(l.termPath(), b, 0666)
}

// setTerm sets the current term and clears the vote.
func (l *Log) setTerm(term uint64) error {
	l.Logger.Printf("changing term: %d => %d", l.term, term)

	if err := l.writeTerm(term); err != nil {
		return err
	}

	l.term = term
	l.votedFor = 0
	return nil
}

// mustSetTermIfHigher sets the current term and clears the vote if the given
// term is higher than the current term. Panics on error.
func (l *Log) mustSetTermIfHigher(term uint64) {
	if term <= l.term {
		return
	}

	if err := l.setTerm(term); err != nil {
		panic("unable to set term: " + err.Error())
	}

	// Signal term change.
	select {
	case l.terms <- struct{}{}:
	default:
	}
}

// readConfig reads the configuration from disk.
func (l *Log) readConfig() (*Config, error) {
	// Read config from disk.
	f, err := os.Open(l.configPath())
	if os.IsNotExist(err) {
		return nil, nil
	} else if err != nil {
		return nil, err
	}
	defer func() { _ = f.Close() }()

	// Unmarshal file into a config type.
	config := &Config{}
	if err := NewConfigDecoder(f).Decode(config); err != nil {
		return nil, err
	}
	return config, nil
}

// writeConfig writes the configuration to disk.
func (l *Log) writeConfig(config *Config) error {
	// FIX(benbjohnson): Atomic write.

	// Open file.
	f, err := os.Create(l.configPath())
	if err != nil {
		return err
	}
	defer func() { _ = f.Close() }()

	// Marshal config into file.
	if err := NewConfigEncoder(f).Encode(config); err != nil {
		return err
	}

	return nil
}

// Initialize initializes a new log.
// Returns an error if the log is already a member of a cluster.
func (l *Log) Initialize() error {
	var config *Config
	if err := func() error {
		l.lock()
		defer l.unlock()

		// Return an error if the log is not open or is already a member of a cluster.
		if !l.opened() {
			return ErrClosed
		} else if l.id != 0 {
			return ErrInitialized
		}

		// Start first node at id 1.
		id := uint64(1)

		// Generate a new configuration with one node.
		config = &Config{MaxNodeID: id}
		config.AddNode(id, l.url)

		// Generate a random cluster identifier.
		config.ClusterID = uint64(l.Rand())

		// Generate log id.
		if err := l.writeID(id); err != nil {
			return err
		}
		l.setID(id)

		// Automatically promote to leader.
		term := uint64(1)
		if err := l.setTerm(term); err != nil {
			return fmt.Errorf("set term: %s", err)
		}
		l.lastLogTerm = term
		l.leaderID = l.id

		return nil
	}(); err != nil {
		return err
	}

	// Begin state loop as leader.
	l.startStateLoop(l.closing, Leader)

	l.Logger.Printf("log initialize: promoted to 'leader' with cluster ID %d, log ID %d, term %d",
		config.ClusterID, l.id, l.term)

	// Set initial configuration.
	var buf bytes.Buffer
	_ = NewConfigEncoder(&buf).Encode(config)
	index, err := l.internalApply(LogEntryInitialize, buf.Bytes())
	if err != nil {
		return err
	}

	// Wait until entry is applied.
	return l.Wait(index)
}

// trace writes a log message if DebugEnabled is true.
func (l *Log) trace(v ...interface{}) {
	if l.DebugEnabled {
		l.Logger.Print(v...)
	}
}

// tracef writes a formatted log message if DebugEnabled is true.
func (l *Log) tracef(msg string, v ...interface{}) {
	if l.DebugEnabled {
		l.Logger.Printf(msg+"\n", v...)
	}
}

// IsLeader returns true if the log is the current leader.
func (l *Log) IsLeader() bool {
	l.lock()
	defer l.unlock()
	return l.id != 0 && l.id == l.leaderID
}

// Leader returns the id and URL associated with the current leader.
// Returns zero if there is no current leader.
func (l *Log) Leader() (id uint64, u url.URL) {
	l.lock()
	defer l.unlock()
	return l.leader()
}

func (l *Log) leader() (id uint64, u url.URL) {
	// Ignore if there's no configuration set.
	if l.config == nil {
		return
	}

	// Find node by identifier.
	n := l.config.NodeByID(l.leaderID)
	if n == nil {
		return
	}

	return n.ID, n.URL
}

// ClusterID returns the identifier for the cluster.
// Returns zero if the cluster has not been initialized yet.
func (l *Log) ClusterID() uint64 {
	l.lock()
	defer l.unlock()
	if l.config == nil {
		return 0
	}
	return l.config.ClusterID
}

// Join contacts a node in the cluster to request membership.
// A log cannot join a cluster if it has already been initialized.
func (l *Log) Join(u url.URL) error {
	// Validate under lock.
	var nodeURL url.URL
	if err := func() error {
		l.lock()
		defer l.unlock()

		if !l.opened() {
			return ErrClosed
		} else if l.id != 0 {
			return ErrInitialized
		} else if l.url.Host == "" {
			return ErrURLRequired
		}

		nodeURL = l.url
		return nil
	}(); err != nil {
		return err
	}

	l.tracef("Join: %s", u)

	// Send join request.
	id, leaderID, config, err := l.Transport.Join(u, nodeURL)
	if err != nil {
		return err
	}
	l.leaderID = leaderID

	l.tracef("Join: confirmed")

	// Lock once the join request is returned.
	if err := func() error {
		l.lock()
		defer l.unlock()

		// Write identifier.
		if err := l.writeID(id); err != nil {
			return err
		}
		l.setID(id)

		// Write config.
		if err := l.writeConfig(config); err != nil {
			return err
		}
		l.config = config

		return nil
	}(); err != nil {
		return err
	}

	// Begin state loop as follower.
	l.startStateLoop(l.closing, Follower)

	// Change to a follower state.
	l.Logger.Println("log join: entered 'follower' state for cluster at", u, "with log ID", l.id)

	// Wait for anything to be applied.
	return l.Wait(1)
}
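
// A rough join sketch for a second node (illustrative only; URLs and paths
// are placeholders):
//
//	l2 := raft.NewLog()
//	l2.FSM = &kvFSM{data: make(map[string]string)}
//	l2.SetURL(url.URL{Scheme: "http", Host: "localhost:20001"})
//	if err := l2.Open("/tmp/node1"); err != nil {
//		log.Fatal(err)
//	}
//
//	// Join the existing cluster through any reachable member.
//	leaderURL := url.URL{Scheme: "http", Host: "localhost:20000"}
//	if err := l2.Join(leaderURL); err != nil {
//		log.Fatal(err)
//	}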

// Leave removes the log from cluster membership and removes the log data.
func (l *Log) Leave() error {
	l.lock()
	defer l.unlock()

	// TODO(benbjohnson): Check if open.
	// TODO(benbjohnson): Apply remove peer command.
	// TODO(benbjohnson): Remove underlying data.

	return nil
}

// startStateLoop begins the state loop in a separate goroutine.
// Returns once the state has transitioned to the initial state passed in.
func (l *Log) startStateLoop(closing <-chan struct{}, state State) {
	l.wg.Add(1)
	stateChanged := make(chan struct{})
	go l.stateLoop(closing, state, stateChanged)

	// Wait until state change.
	<-stateChanged
}

// stateLoop runs in a separate goroutine and runs the appropriate state loop.
func (l *Log) stateLoop(closing <-chan struct{}, state State, stateChanged chan struct{}) {
	defer l.wg.Done()

	for {
		// Transition to new state.
		var transitioning chan struct{}
		func() {
			l.lock()
			defer l.unlock()

			l.Logger.Printf("log state change: %s => %s (term=%d)", l.state, state, l.term)
			l.state = state
			l.transitioning = make(chan struct{})
			transitioning = l.transitioning

			// Remove previous reader, if one exists.
			_ = l.setReader(nil)
		}()

		// Notify caller on the first state change.
		if stateChanged != nil {
			close(stateChanged)
			stateChanged = nil
		}

		// Execute the appropriate state loop.
		// Each loop returns the next state to transition to.
		switch state {
		case Stopped:
			return
		case Follower:
			state = l.followerLoop(closing)
		case Candidate:
			state = l.candidateLoop(closing)
		case Leader:
			state = l.leaderLoop(closing)
		}
	}
}

func (l *Log) followerLoop(closing <-chan struct{}) State {
	l.tracef("followerLoop")
	defer l.tracef("followerLoop: exit")

	// Ensure all follower goroutines complete before transitioning to another state.
	var wg sync.WaitGroup
	defer wg.Wait()
	defer l.setReaderWithLock(nil)
	defer close(l.transitioning)

	// Read log from leader in a separate goroutine.
	wg.Add(1)
	go l.readFromLeader(&wg)

	for {
		select {
		case <-closing:
			return Stopped
		case ch := <-l.Clock.AfterElectionTimeout():
			close(ch)

			// Ignore timeout if we are snapshotting.
			// Or if we haven't received confirmation of join.
			if l.isSnapshotting() {
				continue
			} else if l.FSM.Index() == 0 {
				continue
			}

			// TODO: Prevote before becoming candidate.

			return Candidate
		case hb := <-l.heartbeats:
			l.tracef("followerLoop: heartbeat: term=%d, idx=%d", hb.term, hb.commitIndex)

			// Update term, commit index & leader.
			l.lock()
			l.mustSetTermIfHigher(hb.term)
			if hb.commitIndex > l.commitIndex {
				l.commitIndex = hb.commitIndex
			}
			l.leaderID = hb.leaderID
			l.unlock()
		}
	}
}

func (l *Log) readFromLeader(wg *sync.WaitGroup) {
	defer wg.Done()
	l.tracef("readFromLeader:")

	for {
		select {
		case <-l.transitioning:
			l.tracef("readFromLeader: exiting")
			return
		default:
		}

		// Retrieve the term, last log index, & leader URL.
		l.lock()
		id, lastLogIndex, term := l.id, l.lastLogIndex, l.term
		_, u := l.leader()
		l.unlock()

		// If no leader exists then wait momentarily and retry.
		if u.Host == "" {
			l.tracef("readFromLeader: no leader")
			time.Sleep(500 * time.Millisecond)
			continue
		}

		// Connect to leader.
		l.tracef("readFromLeader: read from: %s, id=%d, term=%d, index=%d", u.String(), id, term, lastLogIndex)
		r, err := l.Transport.ReadFrom(u, id, term, lastLogIndex)
		if err != nil {
			l.Logger.Printf("connect stream: %s", err)
			time.Sleep(500 * time.Millisecond)
			continue
		}

		// Attach the stream to the log.
		if err := l.ReadFrom(r); err != nil {
			l.tracef("readFromLeader: read from: disconnect: %s", err)
		}
	}
}

// truncateTo removes all uncommitted entries after the given index.
func (l *Log) truncateTo(index uint64) {
	assert(index >= l.commitIndex, "cannot truncate to before the commit index: index=%d, commit=%d", index, l.commitIndex)

	// Ignore if there are no entries.
	// Ignore if all entries are before the index.
	if len(l.entries) == 0 {
		return
	} else if l.entries[len(l.entries)-1].Index < index {
		return
	}

	// If all entries are after the index, remove all.
	if l.entries[0].Index > index {
		l.entries = nil
		l.lastLogIndex, l.lastLogTerm = index, l.term
		return
	}

	// Otherwise slice entries so the log ends at the given index.
	emin, emax := l.entries[0].Index, l.entries[len(l.entries)-1].Index
	l.tracef("trunc: entries=[%d,%d], index=%d", emin, emax, index)
	l.entries = l.entries[:index-emin+1]
	l.lastLogIndex = index

	assert(l.entries[len(l.entries)-1].Index == index, "last entry in truncation not index: emax=%d, index=%d", l.entries[len(l.entries)-1].Index, index)
}
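
// For example (hypothetical numbers): with in-memory entries covering indexes
// 90..110 and commitIndex 100, truncateTo(105) keeps entries 90..105 and sets
// lastLogIndex to 105; truncateTo(120) is a no-op because every entry is
// already at or below 120; and with entries covering 130..140, truncateTo(120)
// drops them all and resets lastLogIndex to 120.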

// candidateLoop requests votes from other nodes in an attempt to become leader.
func (l *Log) candidateLoop(closing <-chan struct{}) State {
	l.tracef("candidateLoop")
	defer l.tracef("candidateLoop: exit")

	// TODO: prevote

	// Increment term and request votes.
	l.lock()
	l.mustSetTermIfHigher(l.term + 1)
	l.votedFor = l.id
	term := l.term

	select {
	case <-l.terms:
	default:
	}
	l.unlock()

	// Ensure all candidate goroutines complete before transitioning to another state.
	var wg sync.WaitGroup
	defer wg.Wait()
	defer close(l.transitioning)

	// Request votes from peers in a separate goroutine.
	wg.Add(1)
	elected := make(chan struct{}, 1)
	go l.elect(term, elected, &wg)

	for {
		select {
		case <-closing:
			return Stopped
		case hb := <-l.heartbeats:
			l.lock()
			l.mustSetTermIfHigher(hb.term)
			if hb.term >= l.term {
				l.leaderID = hb.leaderID
			}
			l.unlock()
		case <-l.terms:
			return Follower
		case <-elected:
			return Leader
		case ch := <-l.Clock.AfterElectionTimeout():
			close(ch)
			return Follower
		}
	}
}

func (l *Log) elect(term uint64, elected chan struct{}, wg *sync.WaitGroup) {
	defer wg.Done()

	// Ensure we are in the same term and copy properties.
	l.lock()
	if term != l.term {
		l.unlock()
		return
	}
	id, config := l.id, l.config
	lastLogIndex, lastLogTerm := l.lastLogIndex, l.lastLogTerm
	l.unlock()

	// Request votes from peers.
	votes := make(chan struct{}, len(config.Nodes))
	for _, n := range config.Nodes {
		if n.ID == id {
			continue
		}
		go func(n *ConfigNode) {
			peerTerm, err := l.Transport.RequestVote(n.URL, term, id, lastLogIndex, lastLogTerm)
			l.Logger.Printf("send req vote(term=%d, candidateID=%d, lastLogIndex=%d, lastLogTerm=%d) (term=%d, err=%v)", term, id, lastLogIndex, lastLogTerm, peerTerm, err)

			// If an error occurred then update the term from the peer's response.
			if err != nil {
				l.lock()
				l.mustSetTermIfHigher(peerTerm)
				l.unlock()
				return
			}
			votes <- struct{}{}
		}(n)
	}

	// Wait until we have a quorum before responding.
	voteN := 1
	for {
		// Signal channel that the log has been elected.
		if voteN >= (len(config.Nodes)/2)+1 {
			elected <- struct{}{}
			return
		}

		// Wait until log transitions to another state or we receive a vote.
		select {
		case <-l.transitioning:
			return
		case <-votes:
			voteN++
		}
	}
}

// leaderLoop periodically sends heartbeats to all followers to maintain dominance.
func (l *Log) leaderLoop(closing <-chan struct{}) State {
	l.tracef("leaderLoop")
	defer l.tracef("leaderLoop: exit")

	// Ensure all leader goroutines complete before transitioning to another state.
	var wg sync.WaitGroup
	defer wg.Wait()
	defer close(l.transitioning)

	// Retrieve leader's term.
	l.lock()
	term := l.term

	select {
	case <-l.terms:
	default:
	}
	l.unlock()

	// Send heartbeats to followers and process responses.
	for {
		// Send heartbeat to followers.
		wg.Add(1)
		committed := make(chan uint64, 1)
		go l.heartbeater(term, committed, &wg)

		// Wait for close, new leader, or new heartbeat response.
		select {
		case <-closing: // wait for state change.
			return Stopped

		case <-l.terms: // step down on higher term
			l.lock()
			l.truncateTo(l.commitIndex)
			l.unlock()
			return Follower

		case hb := <-l.heartbeats: // update term, if necessary
			l.lock()
			l.mustSetTermIfHigher(hb.term)
			l.unlock()

		case commitIndex, ok := <-committed:
			// Quorum not reached, try again.
			if !ok {
				continue
			}

			// Quorum reached, set new commit index.
			l.lock()
			if commitIndex > l.commitIndex {
				l.tracef("leaderLoop: committed: idx=%d", commitIndex)
				l.commitIndex = commitIndex
			}
			l.unlock()
			continue
		}
	}
}

// heartbeater continually sends heartbeats to all peers.
func (l *Log) heartbeater(term uint64, committed chan uint64, wg *sync.WaitGroup) {
	defer wg.Done()

	// Ensure term is correct and retrieve current state.
	l.lock()
	if l.term != term {
		l.unlock()
		return
	}
	commitIndex, localIndex, leaderID, config := l.commitIndex, l.lastLogIndex, l.id, l.config
	l.unlock()

	// Commit latest index if there are no peers.
	if config == nil || len(config.Nodes) <= 1 {
		time.Sleep(10 * time.Millisecond)
		committed <- localIndex
		return
	}

	l.tracef("send heartbeat: start: n=%d", len(config.Nodes))

	// Send heartbeats to all peers.
	peerIndices := make(chan uint64, len(config.Nodes))
	for _, n := range config.Nodes {
		if n.ID == leaderID {
			continue
		}
		go func(n *ConfigNode) {
			peerIndex, err := l.Transport.Heartbeat(n.URL, term, commitIndex, leaderID)
			if err != nil {
				l.Logger.Printf("send heartbeat: error: %s", err)
				return
			}
			peerIndices <- peerIndex
		}(n)
	}

	// Wait for heartbeat responses or timeout.
	after := l.Clock.AfterHeartbeatInterval()
	indexes := make([]uint64, 1, len(config.Nodes))
	indexes[0] = localIndex
	for {
		select {
		case <-l.transitioning:
			l.tracef("send heartbeat: transitioning")
			return
		case peerIndex := <-peerIndices:
			l.tracef("send heartbeat: index: idx=%d, idxs=%+v", peerIndex, indexes)
			indexes = append(indexes, peerIndex) // collect responses
		case ch := <-after:
			// Once we have responses from a quorum of nodes, report the
			// highest index that a quorum of nodes has reached.
			quorumN := (len(config.Nodes) / 2) + 1
			if len(indexes) >= quorumN {
				// Return the highest index replicated on a quorum.
				sort.Sort(sort.Reverse(uint64Slice(indexes)))
				committed <- indexes[quorumN-1]
				l.tracef("send heartbeat: commit: idx=%d, idxs=%+v", commitIndex, indexes)
			} else {
				l.tracef("send heartbeat: no quorum: idxs=%+v", indexes)
				close(committed)
			}
			close(ch)
			return
		}
	}
}
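
// For example (hypothetical values): in a 5-node cluster quorumN is 3. If the
// leader's local index is 20 and two followers respond with 18 and 15 before
// the heartbeat interval elapses, indexes is [20 18 15]; sorting descending
// and taking indexes[quorumN-1] yields 15, the highest index known to be
// present on at least three nodes, so 15 is reported as committable.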

type heartbeatResponse struct {
	peerTerm  uint64
	peerIndex uint64
}

// Apply executes a command against the log.
// The command is appended to the log and the new entry's index is returned;
// callers can use Wait to block until the entry has been applied.
func (l *Log) Apply(command []byte) (uint64, error) {
	return l.internalApply(LogEntryCommand, command)
}

func (l *Log) internalApply(typ LogEntryType, command []byte) (index uint64, err error) {
	l.lock()
	defer l.unlock()

	// Do not apply if this node is not the leader.
	if l.state != Leader {
		return 0, ErrNotLeader
	}

	// Create log entry.
	e := &LogEntry{
		Type:  typ,
		Index: l.lastLogIndex + 1,
		Term:  l.term,
		Data:  command,
	}
	index = e.Index

	// Append to the log.
	if err := l.append(e); err != nil {
		return 0, fmt.Errorf("append: %s", err)
	}

	// If there is no config or only one node then move commit index forward.
	if l.config == nil || len(l.config.Nodes) <= 1 {
		l.commitIndex = l.lastLogIndex
	}

	return
}

// Wait blocks until a given index is applied.
func (l *Log) Wait(idx uint64) error {
	// TODO(benbjohnson): Check for leadership change (?).
	// TODO(benbjohnson): Add timeout.

	for {
		l.lock()
		state, index := l.state, l.FSM.Index()
		l.unlock()

		if state == Stopped {
			return ErrClosed
		} else if index >= idx {
			return nil
		}
		time.Sleep(WaitInterval)
	}
}

// waitCommitted blocks until a given committed index is reached.
func (l *Log) waitCommitted(index uint64) error {
	for {
		l.lock()
		state, committedIndex := l.state, l.commitIndex
		l.unlock()

		if state == Stopped {
			return ErrClosed
		} else if committedIndex >= index {
			return nil
		}
		time.Sleep(WaitInterval)
	}
}

// waitUncommitted blocks until a given uncommitted index is reached.
func (l *Log) waitUncommitted(index uint64) error {
	for {
		l.lock()
		lastLogIndex := l.lastLogIndex
		//l.tracef("waitUncommitted: %s / %d", l.state, l.lastLogIndex)
		l.unlock()

		if lastLogIndex >= index {
			return nil
		}
		time.Sleep(WaitInterval)
	}
}

// append adds a log entry to the list of entries.
func (l *Log) append(e *LogEntry) error {
	// Exit if log is not in a running state.
	// Ignore replayed entries.
	if l.state == Stopped {
		return ErrClosed
	} else if e.Index <= l.lastLogIndex {
		return nil
	}

	// If the entry is not the next then the cluster may have changed leaders.
	// Attempt to trim the log to the index if it is not committed yet.
	if e.Index > l.lastLogIndex+1 {
		if e.Index >= l.commitIndex {
			l.truncateTo(e.Index)
		} else if e.Index < l.commitIndex {
			l.lastLogIndex = 0
			return ErrSnapshotRequired
		}
	}

	assert(e.Index == l.lastLogIndex+1, "log entry skipped(%d): idx=%d, prev=%d", l.id, e.Index, l.lastLogIndex)

	// Encode entry to a byte slice.
	buf := make([]byte, logEntryHeaderSize+len(e.Data))
	copy(buf, e.encodedHeader())
	copy(buf[logEntryHeaderSize:], e.Data)

	// Add to pending entries list to wait to be applied.
	l.entries = append(l.entries, e)
	l.lastLogIndex = e.Index
	l.lastLogTerm = e.Term

	// Write to tailing writers.
	l.appendToWriters(buf)

	return nil
}

// appendToWriters writes a byte slice to all attached writers.
func (l *Log) appendToWriters(buf []byte) {
	for i := 0; i < len(l.writers); i++ {
		w := l.writers[i]

		// If an error occurs then remove the writer and close it.
		if _, err := w.Write(buf); err != nil {
			l.Logger.Printf("append to writers error: %s", err)
			l.removeWriter(w)
			i--
			continue
		}
	}
}

// applier runs in a separate goroutine and applies all entries between the
// previously applied index and the current commit index.
func (l *Log) applier(closing <-chan struct{}) {
	defer l.wg.Done()

	for {
		// Wait for a close signal or timeout.
		var confirm chan struct{}
		select {
		case <-closing:
			return

		case confirm = <-l.Clock.AfterApplyInterval():
		}

		//l.tracef("applier")

		// Keep applying the next entry until there are no more committed
		// entries that have not been applied to the state machine.
		for {
			if err := l.applyNextUnappliedEntry(closing); err == errClosing {
				break
			} else if err != nil {
				panic(err.Error())
			}
		}

		// Trim entries.
		l.lock()
		l.trim()
		l.unlock()

		// Signal clock that apply is done.
		close(confirm)
	}
}

// applyNextUnappliedEntry applies the next committed entry that has not yet been applied.
func (l *Log) applyNextUnappliedEntry(closing <-chan struct{}) error {
	l.lock()
	defer l.unlock()

	// Verify, under lock, that we're not closing.
	select {
	case <-closing:
		return errClosing
	default:
	}

	// Ignore if there are no entries in the log.
	if len(l.entries) == 0 {
		return errClosing
	}

	// Determine next index to apply.
	// Ignore if next index is after the commit index.
	// Ignore if the entry is not streamed to the log yet.
	index := l.FSM.Index() + 1
	if index > l.commitIndex {
		return errClosing
	} else if index > l.entries[len(l.entries)-1].Index {
		return errClosing
	}

	// Retrieve next entry.
	e := l.entries[index-l.entries[0].Index]
	assert(e.Index == index, "apply: index mismatch: expected=%d, actual=%d (i=%d)", index, e.Index, index-l.entries[0].Index)

	// Special handling for internal log entries.
	switch e.Type {
	case LogEntryCommand, LogEntryNop:
	case LogEntryInitialize:
		l.mustApplyInitialize(e)
	case LogEntryAddPeer:
		l.mustApplyAddPeer(e)
	case LogEntryRemovePeer:
		l.mustApplyRemovePeer(e)
	default:
		return fmt.Errorf("unsupported command type: %d", e.Type)
	}

	// Apply to FSM.
	if err := l.FSM.Apply(e); err != nil {
		return fmt.Errorf("apply: %s", err)
	}

	return nil
}

// trim truncates the log based on the applied index and pending writers.
func (l *Log) trim() {
	if len(l.entries) == 0 {
		return
	}

	// Determine lowest index to trim to.
	index := l.FSM.Index()
	for _, w := range l.writers {
		if w.snapshotIndex > 0 && w.snapshotIndex < index {
			index = w.snapshotIndex
		}
	}

	// Ignore if the index is lower than the first entry.
	// This can occur on a new snapshot.
	if index < l.entries[0].Index {
		return
	}

	// Ignore if trimming would cause the entries to fall below the min cache size.
	offset := int(index-l.entries[0].Index) - l.LogEntryCacheSize
	if offset <= 0 {
		return
	}

	// Reslice entries list.
	l.entries = l.entries[offset:]
}
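
// For example (hypothetical numbers): with entries covering indexes 1..1500,
// an FSM index of 1400, one tailing writer whose snapshotIndex is 1200, and
// the default LogEntryCacheSize of 1000, the trim target is 1200 and the
// offset is (1200-1) - 1000 = 199, so the first 199 entries are dropped and
// the cache keeps indexes 200..1500.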

// mustApplyInitialize applies a log initialization command by parsing and setting the configuration.
func (l *Log) mustApplyInitialize(e *LogEntry) {
	// Parse the configuration from the log entry.
	config := &Config{}
	if err := NewConfigDecoder(bytes.NewReader(e.Data)).Decode(config); err != nil {
		panic("decode: " + err.Error())
	}

	// Set the last update index on the configuration.
	config.Index = e.Index

	// TODO(benbjohnson): Lock the log while we update the configuration.

	// Persist the configuration to disk.
	if err := l.writeConfig(config); err != nil {
		panic("write config: " + err.Error())
	}
	l.config = config
}

// mustApplyAddPeer adds a node to the cluster configuration.
func (l *Log) mustApplyAddPeer(e *LogEntry) {
	// Unmarshal node from entry data.
	var n *ConfigNode
	if err := json.Unmarshal(e.Data, &n); err != nil {
		panic("unmarshal: " + err.Error())
	}

	// Clone configuration.
	config := l.config.Clone()

	// Increment the node identifier.
	config.MaxNodeID++
	n.ID = config.MaxNodeID

	// Add node to configuration.
	if err := config.AddNode(n.ID, n.URL); err != nil {
		l.Logger.Panicf("apply: add node: %s", err)
	}

	// Set configuration index.
	config.Index = e.Index

	// Write configuration.
	if err := l.writeConfig(config); err != nil {
		panic("write config: " + err.Error())
	}
	l.config = config
}

// mustApplyRemovePeer removes a node from the cluster configuration.
func (l *Log) mustApplyRemovePeer(e *LogEntry) error {
	// TODO(benbjohnson): Clone configuration.
	// TODO(benbjohnson): Remove node from configuration.
	// TODO(benbjohnson): Set configuration index.
	// TODO(benbjohnson): Write configuration.

	return nil
}

// AddPeer creates a new peer in the cluster.
// Returns the new peer's identifier, the current leader's identifier, and the current configuration.
func (l *Log) AddPeer(u url.URL) (uint64, uint64, *Config, error) {
	// Validate URL.
	if u.Host == "" {
		return 0, 0, nil, fmt.Errorf("peer url required")
	}

	// Apply command.
	b, _ := json.Marshal(&ConfigNode{URL: u})
	index, err := l.internalApply(LogEntryAddPeer, b)
	if err != nil {
		return 0, 0, nil, err
	}
	if err := l.Wait(index); err != nil {
		return 0, 0, nil, err
	}

	// Lock while we look up the node.
	l.lock()
	defer l.unlock()

	// Look up node.
	n := l.config.NodeByURL(u)
	if n == nil {
		return 0, 0, nil, fmt.Errorf("node not found")
	}

	return n.ID, l.leaderID, l.config.Clone(), nil
}

// RemovePeer removes an existing peer from the cluster by id.
func (l *Log) RemovePeer(id uint64) error {
	l.lock()
	defer l.unlock()

	// TODO(benbjohnson): Apply removePeerCommand.

	return nil
}

// Heartbeat establishes dominance by the current leader.
// Returns the highest written log entry index.
func (l *Log) Heartbeat(term, commitIndex, leaderID uint64) (currentIndex uint64, err error) {
	// Ignore if snapshotting.
	if l.isSnapshotting() {
		return 0, ErrSnapshotting
	}

	// Otherwise obtain lock and process heartbeat.
	l.lock()
	defer l.unlock()

	// Check if log is closed.
	if !l.opened() || l.state == Stopped {
		l.Logger.Printf("recv heartbeat: closed")
		return 0, ErrClosed
	}

	// Ignore if the incoming term is less than the log's term.
	if term < l.term {
		l.Logger.Printf("recv heartbeat: stale term, ignore: %d < %d", term, l.term)
		return l.lastLogIndex, ErrStaleTerm
	}

	// Send heartbeat to channel for the state loop to process.
	select {
	case l.heartbeats <- heartbeat{term: term, commitIndex: commitIndex, leaderID: leaderID}:
	default:
	}

	l.tracef("recv heartbeat: (term=%d, commit=%d, leaderID: %d) (index=%d, term=%d)", term, commitIndex, leaderID, l.lastLogIndex, l.term)
	return l.lastLogIndex, nil
}

// RequestVote requests a vote from the log.
func (l *Log) RequestVote(term, candidateID, lastLogIndex, lastLogTerm uint64) (peerTerm uint64, err error) {
	// Ignore if snapshotting.
	if l.isSnapshotting() {
		return 0, ErrSnapshotting
	}

	// Otherwise obtain lock and process vote.
	l.lock()
	defer l.unlock()

	// Check if log is closed.
	if !l.opened() {
		return l.term, ErrClosed
	}

	defer func() {
		l.Logger.Printf("recv req vote(term=%d, candidateID=%d, lastLogIndex=%d, lastLogTerm=%d) (err=%v)", term, candidateID, lastLogIndex, lastLogTerm, err)
	}()

	// Deny vote if:
	// 1. Candidate is requesting a vote from an earlier term. (§5.1)
	// 2. Already voted for a different candidate in this term. (§5.2)
	// 3. Candidate log is less up-to-date than local log. (§5.4)
	if term < l.term {
		return l.term, ErrStaleTerm
	} else if term == l.term && l.votedFor != 0 && l.votedFor != candidateID {
		return l.term, ErrAlreadyVoted
	}

	// Notify term change.
	l.mustSetTermIfHigher(term)

	// Reject request if log is out of date.
	if lastLogTerm < l.lastLogTerm {
		return l.term, ErrOutOfDateLog
	} else if lastLogTerm == l.lastLogTerm && lastLogIndex < l.lastLogIndex {
		return l.term, ErrOutOfDateLog
	}

	// Vote for candidate.
	l.votedFor = candidateID

	return l.term, nil
}

// WriteEntriesTo attaches a writer to the log from a given index.
// The index specified must be a committed index.
func (l *Log) WriteEntriesTo(w io.Writer, id, term, index uint64) error {
	// Validate and initialize the writer.
	writer, err := l.initWriter(w, id, term, index)
	if err != nil {
		l.Logger.Printf("unable to init writer: %s", err)
		return err
	}

	// Write the snapshot and advance the writer through the log.
	// If an error occurs then remove the writer.
	if err := l.writeTo(writer, id, term, index); err != nil {
		l.lock()
		l.removeWriter(writer)
		l.unlock()
		l.Logger.Printf("unable to write entries: %s", err)
		return err
	}

	// Wait for writer to finish.
	<-writer.done
	return nil
}

// initWriter validates the writer and adds it to the list of writers.
func (l *Log) initWriter(w io.Writer, id, term, index uint64) (*logWriter, error) {
	l.lock()
	defer l.unlock()

	// Check if log is closed.
	if !l.opened() {
		return nil, ErrClosed
	}

	// Do not begin streaming if:
	// 1. Node is not the leader.
	// 2. Term is after current term.
	if l.state != Leader {
		return nil, ErrNotLeader
	} else if term > l.term {
		l.mustSetTermIfHigher(term)
		return nil, ErrNotLeader
	}

	// If the index is past the leader's log then reset and begin from the end.
	// The follower will check the index and trim its log as needed. If the
	// follower cannot trim its log then it needs to retrieve a snapshot.
	if index > l.lastLogIndex {
		index = l.lastLogIndex
	}

	// Encode configuration.
	var buf bytes.Buffer
	err := NewConfigEncoder(&buf).Encode(l.config)
	assert(err == nil, "marshal config error: %s", err)

	// Write configuration.
	if err := NewLogEntryEncoder(w).Encode(&LogEntry{Type: logEntryConfig, Data: buf.Bytes()}); err != nil {
		return nil, err
	}
	flushWriter(w)

	// Wrap writer and append to log to tail.
	writer := &logWriter{
		Writer:        w,
		id:            id,
		snapshotIndex: l.FSM.Index(),
		done:          make(chan struct{}),
	}
	l.writers = append(l.writers, writer)

	return writer, nil
}

func (l *Log) writeTo(writer *logWriter, id, term, index uint64) error {
	// Extract the underlying writer.
	w := writer.Writer

	// Write snapshot marker byte.
	if _, err := w.Write([]byte{logEntrySnapshot}); err != nil {
		return err
	}

	// Begin streaming the snapshot.
	if _, err := l.FSM.WriteTo(w); err != nil {
		return err
	}
	flushWriter(w)

	// Write entries since the snapshot occurred and begin tailing the writer.
	if err := l.advanceWriter(writer); err != nil {
		return err
	}

	return nil
}

// advanceWriter replays entries since the snapshot's index and begins tailing the log.
func (l *Log) advanceWriter(writer *logWriter) error {
	l.lock()
	defer l.unlock()

	// Check if writer has been closed during snapshot.
	select {
	case <-writer.done:
		return errors.New("writer closed during snapshot")
	default:
	}

	// Write pending entries.
	if len(l.entries) > 0 {
		enc := NewLogEntryEncoder(writer.Writer)
		for _, e := range l.entries[writer.snapshotIndex-l.entries[0].Index+1:] {
			if err := enc.Encode(e); err != nil {
				return err
			}
		}
	}

	// Flush data.
	flushWriter(writer.Writer)

	// Clear snapshot index on writer.
	writer.snapshotIndex = 0

	return nil
}

// removeWriter removes a writer from the list of log writers.
func (l *Log) removeWriter(writer *logWriter) {
	l.tracef("removeWriter")
	for i, w := range l.writers {
		if w == writer {
			copy(l.writers[i:], l.writers[i+1:])
			l.writers[len(l.writers)-1] = nil
			l.writers = l.writers[:len(l.writers)-1]
			_ = w.Close()
			l.Logger.Printf("writer removed: %#v", w)
			break
		}
	}
}

// ReadFrom continually reads log entries from a reader.
func (l *Log) ReadFrom(r io.ReadCloser) error {
	l.tracef("ReadFrom")
	if err := l.initReadFrom(r); err == errTransitioning {
		return err
	} else if err != nil {
		return fmt.Errorf("init read from: %s", err)
	}

	// If a nil reader is passed in then exit.
	if r == nil {
		return nil
	}

	l.Logger.Printf("reading from stream")

	// Continually decode entries.
	dec := NewLogEntryDecoder(r)
	for {
		// Decode single entry.
		e := &LogEntry{}
		if err := dec.Decode(e); err == io.EOF {
			return nil
		} else if err != nil {
			return err
		}

		// Apply special config & snapshot entries immediately.
		// All other entries get appended to the log.
		switch e.Type {
		case logEntryConfig:
			l.tracef("ReadFrom: config")
			if err := l.applyConfigLogEntry(e); err != nil {
				l.Logger.Printf("error reading config from stream: %s", err)
				return fmt.Errorf("apply config log entry: %s", err)
			}

		case logEntrySnapshot:
			if err := l.applySnapshotLogEntry(e, r); err != nil {
				l.Logger.Printf("error snapshotting from stream: %s", err)
				return fmt.Errorf("apply snapshot log entry: %s", err)
			}

		default:
			// Append entry to the log.
			if err := func() error {
				l.lock()
				defer l.unlock()
				if err := l.append(e); err != nil {
					return fmt.Errorf("append: %s", err)
				}

				return nil
			}(); err != nil {
				l.Logger.Printf("error appending from stream: %s", err)
				return err
			}
		}
	}
}

// applyConfigLogEntry updates the config for a config log entry.
func (l *Log) applyConfigLogEntry(e *LogEntry) error {
	// Parse configuration from the log entry.
	config := &Config{}
	if err := NewConfigDecoder(bytes.NewReader(e.Data)).Decode(config); err != nil {
		return fmt.Errorf("decode config: %s", err)
	}

	// Write the configuration to disk.
	l.lock()
	defer l.unlock()
	if err := l.writeConfig(config); err != nil {
		return fmt.Errorf("write config: %s", err)
	}
	l.config = config

	return nil
}

// applySnapshotLogEntry restores a snapshot log entry.
func (l *Log) applySnapshotLogEntry(e *LogEntry, r io.Reader) error {
	l.lock()
	defer l.unlock()

	// Flag the log as snapshotting.
	atomic.StoreUint32(&l.snapshotting, 1)
	defer atomic.StoreUint32(&l.snapshotting, 0)

	// Log snapshotting time.
	start := time.Now()
	l.Logger.Printf("applying snapshot: begin")
	defer func() { l.Logger.Printf("applying snapshot: done (%s)", time.Since(start)) }()

	// Let the FSM rebuild its state from the data in r.
	if _, err := l.FSM.ReadFrom(r); err != nil {
		return fmt.Errorf("fsm restore: %s", err)
	}

	// Update the indices & clear the entries.
	index := l.FSM.Index()
	l.lastLogIndex = index
	l.commitIndex = index
	l.entries = nil

	return nil
}

// initReadFrom initializes the ReadFrom() call under a lock and swaps out the readers.
func (l *Log) initReadFrom(r io.ReadCloser) error {
	l.lock()
	defer l.unlock()

	// Check if log is closed.
	if !l.opened() {
		return ErrClosed
	}

	// Close previous reader & set new one.
	if err := l.setReader(r); err == errTransitioning {
		return err
	} else if err != nil {
		return fmt.Errorf("set reader: %s", err)
	}

	return nil
}

// heartbeat represents an incoming heartbeat.
type heartbeat struct {
	term        uint64
	commitIndex uint64
	leaderID    uint64
}

// logWriter wraps writers to provide a channel for close notification.
type logWriter struct {
	io.Writer
	id            uint64        // target's log id
	snapshotIndex uint64        // snapshot index, if zero then ignored.
	done          chan struct{} // close notification
}

// Write writes bytes to the underlying writer.
func (w *logWriter) Write(p []byte) (int, error) {
	// Ignore if the writer is currently snapshotting.
	if w.snapshotIndex != 0 {
		return 0, nil
	}

	// Write to underlying writer.
	n, err := w.Writer.Write(p)
	if err != nil {
		return n, err
	}

	// Flush writer if successful.
	flushWriter(w.Writer)
	return n, err
}

func (w *logWriter) Close() error {
	w.snapshotIndex = 0
	close(w.done)
	return nil
}

// flushWriter flushes data for writers that implement http.Flusher.
func flushWriter(w io.Writer) {
	if w, ok := w.(http.Flusher); ok {
		w.Flush()
	}
}

// LogEntryType serves as an internal marker for log entries.
// Non-command entry types are handled by the library itself.
type LogEntryType uint8

const (
	LogEntryCommand LogEntryType = iota
	LogEntryNop
	LogEntryInitialize
	LogEntryAddPeer
	LogEntryRemovePeer

	// Internal entry types.
	logEntryConfig   = 254
	logEntrySnapshot = 255
)

// LogEntry represents a single command within the log.
type LogEntry struct {
	Type  LogEntryType
	Index uint64
	Term  uint64
	Data  []byte
}

// encodedHeader returns the encoded header for the entry.
func (e *LogEntry) encodedHeader() []byte {
	var b [logEntryHeaderSize]byte
	binary.BigEndian.PutUint64(b[0:8], (uint64(e.Type)<<56)|uint64(len(e.Data)))
	binary.BigEndian.PutUint64(b[8:16], e.Index)
	binary.BigEndian.PutUint64(b[16:24], e.Term)
	return b[:]
}
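
// The header layout packs the entry type into the top 8 bits of the first
// word and the data length into the lower 56 bits, followed by the index and
// term. A decoding sketch (illustrative only; the package's actual decoding
// is done by NewLogEntryDecoder):
//
//	func decodeLogEntryHeader(b [logEntryHeaderSize]byte) (typ LogEntryType, sz, index, term uint64) {
//		word := binary.BigEndian.Uint64(b[0:8])
//		typ = LogEntryType(word >> 56)
//		sz = word & 0x00FFFFFFFFFFFFFF
//		index = binary.BigEndian.Uint64(b[8:16])
//		term = binary.BigEndian.Uint64(b[16:24])
//		return typ, sz, index, term
//	}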

type uint64Slice []uint64

func (p uint64Slice) Len() int           { return len(p) }
func (p uint64Slice) Less(i, j int) bool { return p[i] < p[j] }
func (p uint64Slice) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }

func assert(condition bool, msg string, v ...interface{}) {
	if !condition {
		panic(fmt.Sprintf("assert failed: "+msg, v...))
	}
}

func warn(v ...interface{})              { fmt.Fprintln(os.Stderr, v...) }
func warnf(msg string, v ...interface{}) { fmt.Fprintf(os.Stderr, msg+"\n", v...) }

func printstack() {
	stack := strings.Join(strings.Split(string(debug.Stack()), "\n")[2:], "\n")
	fmt.Fprintln(os.Stderr, stack)
}