517 lines
12 KiB
Go
517 lines
12 KiB
Go
package meta
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"io/ioutil"
|
|
"math/rand"
|
|
"os"
|
|
"path/filepath"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/hashicorp/raft"
|
|
"github.com/hashicorp/raft-boltdb"
|
|
)
|
|
|
|
// raftState abstracts how the meta.Store talks to the raft consensus layer,
// whether that layer runs on this node or on remote nodes. It is a form of the
// state design pattern: the Store can swap implementations at runtime to
// change how it interacts with raft.
type raftState interface {
	// Lifecycle.

	// open prepares the strategy for use.
	open() error
	// initialize performs any first-time cluster setup the strategy needs.
	initialize() error
	// close releases whatever open acquired.
	close() error
	// remove deletes any on-disk state owned by the strategy.
	remove() error

	// Leadership.

	// leader returns a leader address, or an empty string when none is available.
	leader() string
	// isLeader reports whether this node is the raft leader.
	isLeader() bool
	// isLocal reports whether raft runs on this node.
	isLocal() bool

	// Cluster membership.

	// setPeers replaces the peer list with addrs.
	setPeers(addrs []string) error
	// addPeer adds addr to the peer list.
	addPeer(addr string) error
	// removePeer removes addr from the peer list.
	removePeer(addr string) error
	// peers returns the current peer list.
	peers() ([]string, error)

	// Log and meta data.

	// apply applies a serialized command to the raft log.
	apply(b []byte) error
	// snapshot requests a raft snapshot.
	snapshot() error
	// lastIndex returns the last index known to the strategy.
	lastIndex() uint64
	// sync waits for the store to reach index, giving up after timeout.
	sync(index uint64, timeout time.Duration) error
	// invalidate refreshes the store's cached meta data.
	invalidate() error
}
|
|
|
|
// localRaft is a consensus strategy that uses a local raft implementation for
|
|
// consensus operations.
|
|
type localRaft struct {
|
|
wg sync.WaitGroup
|
|
closing chan struct{}
|
|
store *Store
|
|
raft *raft.Raft
|
|
transport *raft.NetworkTransport
|
|
peerStore raft.PeerStore
|
|
raftStore *raftboltdb.BoltStore
|
|
raftLayer *raftLayer
|
|
}
|
|
|
|
func (r *localRaft) remove() error {
|
|
if err := os.RemoveAll(filepath.Join(r.store.path, "raft.db")); err != nil {
|
|
return err
|
|
}
|
|
if err := os.RemoveAll(filepath.Join(r.store.path, "peers.json")); err != nil {
|
|
return err
|
|
}
|
|
if err := os.RemoveAll(filepath.Join(r.store.path, "snapshots")); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (r *localRaft) updateMetaData(ms *Data) {
|
|
if ms == nil {
|
|
return
|
|
}
|
|
|
|
updated := false
|
|
r.store.mu.RLock()
|
|
if ms.Index > r.store.data.Index {
|
|
updated = true
|
|
}
|
|
r.store.mu.RUnlock()
|
|
|
|
if updated {
|
|
r.store.Logger.Printf("Updating metastore to term=%v index=%v", ms.Term, ms.Index)
|
|
r.store.mu.Lock()
|
|
r.store.data = ms
|
|
// Signal any blocked goroutines that the meta store has been updated
|
|
r.store.notifyChanged()
|
|
r.store.mu.Unlock()
|
|
}
|
|
}
|
|
|
|
func (r *localRaft) invalidate() error {
|
|
if r.store.IsLeader() {
|
|
return nil
|
|
}
|
|
|
|
ms, err := r.store.rpc.fetchMetaData(false)
|
|
if err != nil {
|
|
return fmt.Errorf("error fetching meta data: %s", err)
|
|
}
|
|
|
|
r.updateMetaData(ms)
|
|
return nil
|
|
}
|
|
|
|
func (r *localRaft) open() error {
|
|
r.closing = make(chan struct{})
|
|
|
|
s := r.store
|
|
// Setup raft configuration.
|
|
config := raft.DefaultConfig()
|
|
config.LogOutput = ioutil.Discard
|
|
|
|
if s.clusterTracingEnabled {
|
|
config.Logger = s.Logger
|
|
}
|
|
config.HeartbeatTimeout = s.HeartbeatTimeout
|
|
config.ElectionTimeout = s.ElectionTimeout
|
|
config.LeaderLeaseTimeout = s.LeaderLeaseTimeout
|
|
config.CommitTimeout = s.CommitTimeout
|
|
// Since we actually never call `removePeer` this is safe.
|
|
// If in the future we decide to call remove peer we have to re-evaluate how to handle this
|
|
config.ShutdownOnRemove = false
|
|
|
|
// If no peers are set in the config or there is one and we are it, then start as a single server.
|
|
if len(s.peers) <= 1 {
|
|
config.EnableSingleNode = true
|
|
// Ensure we can always become the leader
|
|
config.DisableBootstrapAfterElect = false
|
|
}
|
|
|
|
// Build raft layer to multiplex listener.
|
|
r.raftLayer = newRaftLayer(s.RaftListener, s.RemoteAddr)
|
|
|
|
// Create a transport layer
|
|
r.transport = raft.NewNetworkTransport(r.raftLayer, 3, 10*time.Second, config.LogOutput)
|
|
|
|
// Create peer storage.
|
|
r.peerStore = raft.NewJSONPeers(s.path, r.transport)
|
|
|
|
peers, err := r.peerStore.Peers()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// For single-node clusters, we can update the raft peers before we start the cluster if the hostname
|
|
// has changed.
|
|
if config.EnableSingleNode {
|
|
if err := r.peerStore.SetPeers([]string{s.RemoteAddr.String()}); err != nil {
|
|
return err
|
|
}
|
|
peers = []string{s.RemoteAddr.String()}
|
|
}
|
|
|
|
// If we have multiple nodes in the cluster, make sure our address is in the raft peers or
|
|
// we won't be able to boot into the cluster because the other peers will reject our new hostname. This
|
|
// is difficult to resolve automatically because we need to have all the raft peers agree on the current members
|
|
// of the cluster before we can change them.
|
|
if len(peers) > 0 && !raft.PeerContained(peers, s.RemoteAddr.String()) {
|
|
s.Logger.Printf("%s is not in the list of raft peers. Please update %v/peers.json on all raft nodes to have the same contents.", s.RemoteAddr.String(), s.Path())
|
|
return fmt.Errorf("peers out of sync: %v not in %v", s.RemoteAddr.String(), peers)
|
|
}
|
|
|
|
// Create the log store and stable store.
|
|
store, err := raftboltdb.NewBoltStore(filepath.Join(s.path, "raft.db"))
|
|
if err != nil {
|
|
return fmt.Errorf("new bolt store: %s", err)
|
|
}
|
|
r.raftStore = store
|
|
|
|
// Create the snapshot store.
|
|
snapshots, err := raft.NewFileSnapshotStore(s.path, raftSnapshotsRetained, os.Stderr)
|
|
if err != nil {
|
|
return fmt.Errorf("file snapshot store: %s", err)
|
|
}
|
|
|
|
// Create raft log.
|
|
ra, err := raft.NewRaft(config, (*storeFSM)(s), store, store, snapshots, r.peerStore, r.transport)
|
|
if err != nil {
|
|
return fmt.Errorf("new raft: %s", err)
|
|
}
|
|
r.raft = ra
|
|
|
|
r.wg.Add(1)
|
|
go r.logLeaderChanges()
|
|
|
|
return nil
|
|
}
|
|
|
|
func (r *localRaft) logLeaderChanges() {
|
|
defer r.wg.Done()
|
|
// Logs our current state (Node at 1.2.3.4:8088 [Follower])
|
|
r.store.Logger.Printf(r.raft.String())
|
|
for {
|
|
select {
|
|
case <-r.closing:
|
|
return
|
|
case <-r.raft.LeaderCh():
|
|
peers, err := r.peers()
|
|
if err != nil {
|
|
r.store.Logger.Printf("failed to lookup peers: %v", err)
|
|
}
|
|
r.store.Logger.Printf("%v. peers=%v", r.raft.String(), peers)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (r *localRaft) close() error {
|
|
if r.closing != nil {
|
|
close(r.closing)
|
|
}
|
|
r.wg.Wait()
|
|
|
|
if r.transport != nil {
|
|
r.transport.Close()
|
|
r.transport = nil
|
|
}
|
|
|
|
// Shutdown raft.
|
|
if r.raft != nil {
|
|
if err := r.raft.Shutdown().Error(); err != nil {
|
|
return err
|
|
}
|
|
r.raft = nil
|
|
}
|
|
|
|
if r.raftStore != nil {
|
|
r.raftStore.Close()
|
|
r.raftStore = nil
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (r *localRaft) initialize() error {
|
|
s := r.store
|
|
// If we have committed entries then the store is already in the cluster.
|
|
if index, err := r.raftStore.LastIndex(); err != nil {
|
|
return fmt.Errorf("last index: %s", err)
|
|
} else if index > 0 {
|
|
return nil
|
|
}
|
|
|
|
// Force set peers.
|
|
if err := r.setPeers(s.peers); err != nil {
|
|
return fmt.Errorf("set raft peers: %s", err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// apply applies a serialized command to the raft log.
|
|
func (r *localRaft) apply(b []byte) error {
|
|
// Apply to raft log.
|
|
f := r.raft.Apply(b, 0)
|
|
if err := f.Error(); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Return response if it's an error.
|
|
// No other non-nil objects should be returned.
|
|
resp := f.Response()
|
|
if err, ok := resp.(error); ok {
|
|
return lookupError(err)
|
|
}
|
|
assert(resp == nil, "unexpected response: %#v", resp)
|
|
|
|
return nil
|
|
}
|
|
|
|
func (r *localRaft) lastIndex() uint64 {
|
|
return r.raft.LastIndex()
|
|
}
|
|
|
|
func (r *localRaft) sync(index uint64, timeout time.Duration) error {
|
|
ticker := time.NewTicker(100 * time.Millisecond)
|
|
defer ticker.Stop()
|
|
|
|
timer := time.NewTimer(timeout)
|
|
defer timer.Stop()
|
|
|
|
for {
|
|
// Wait for next tick or timeout.
|
|
select {
|
|
case <-ticker.C:
|
|
case <-timer.C:
|
|
return errors.New("timeout")
|
|
}
|
|
|
|
// Compare index against current metadata.
|
|
r.store.mu.Lock()
|
|
ok := (r.store.data.Index >= index)
|
|
r.store.mu.Unlock()
|
|
|
|
// Exit if we are at least at the given index.
|
|
if ok {
|
|
return nil
|
|
}
|
|
}
|
|
}
|
|
|
|
func (r *localRaft) snapshot() error {
|
|
future := r.raft.Snapshot()
|
|
return future.Error()
|
|
}
|
|
|
|
// addPeer adds addr to the list of peers in the cluster.
|
|
func (r *localRaft) addPeer(addr string) error {
|
|
peers, err := r.peerStore.Peers()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if len(peers) >= 3 {
|
|
return nil
|
|
}
|
|
|
|
if fut := r.raft.AddPeer(addr); fut.Error() != nil {
|
|
return fut.Error()
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// removePeer removes addr from the list of peers in the cluster.
|
|
func (r *localRaft) removePeer(addr string) error {
|
|
// Only do this on the leader
|
|
if !r.isLeader() {
|
|
return errors.New("not the leader")
|
|
}
|
|
if fut := r.raft.RemovePeer(addr); fut.Error() != nil {
|
|
return fut.Error()
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// setPeers sets a list of peers in the cluster.
|
|
func (r *localRaft) setPeers(addrs []string) error {
|
|
return r.raft.SetPeers(addrs).Error()
|
|
}
|
|
|
|
func (r *localRaft) peers() ([]string, error) {
|
|
return r.peerStore.Peers()
|
|
}
|
|
|
|
func (r *localRaft) leader() string {
|
|
if r.raft == nil {
|
|
return ""
|
|
}
|
|
|
|
return r.raft.Leader()
|
|
}
|
|
|
|
func (r *localRaft) isLeader() bool {
|
|
if r.raft == nil {
|
|
return false
|
|
}
|
|
return r.raft.State() == raft.Leader
|
|
}
|
|
|
|
func (r *localRaft) isLocal() bool {
|
|
return true
|
|
}
|
|
|
|
// remoteRaft is a consensus strategy that uses a remote raft cluster for
|
|
// consensus operations.
|
|
type remoteRaft struct {
|
|
store *Store
|
|
}
|
|
|
|
func (r *remoteRaft) remove() error {
|
|
return nil
|
|
}
|
|
|
|
func (r *remoteRaft) updateMetaData(ms *Data) {
|
|
if ms == nil {
|
|
return
|
|
}
|
|
|
|
updated := false
|
|
r.store.mu.RLock()
|
|
if ms.Index > r.store.data.Index {
|
|
updated = true
|
|
}
|
|
r.store.mu.RUnlock()
|
|
|
|
if updated {
|
|
r.store.Logger.Printf("Updating metastore to term=%v index=%v", ms.Term, ms.Index)
|
|
r.store.mu.Lock()
|
|
r.store.data = ms
|
|
// Signal any blocked goroutines that the meta store has been updated
|
|
r.store.notifyChanged()
|
|
r.store.mu.Unlock()
|
|
}
|
|
}
|
|
|
|
func (r *remoteRaft) invalidate() error {
|
|
ms, err := r.store.rpc.fetchMetaData(false)
|
|
if err != nil {
|
|
return fmt.Errorf("error fetching meta data: %s", err)
|
|
}
|
|
|
|
r.updateMetaData(ms)
|
|
return nil
|
|
}
|
|
|
|
func (r *remoteRaft) setPeers(addrs []string) error {
|
|
// Convert to JSON
|
|
var buf bytes.Buffer
|
|
enc := json.NewEncoder(&buf)
|
|
if err := enc.Encode(addrs); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Write out as JSON
|
|
return ioutil.WriteFile(filepath.Join(r.store.path, "peers.json"), buf.Bytes(), 0755)
|
|
}
|
|
|
|
// addPeer adds addr to the list of peers in the cluster.
|
|
func (r *remoteRaft) addPeer(addr string) error {
|
|
return fmt.Errorf("cannot add peer using remote raft")
|
|
}
|
|
|
|
// removePeer does nothing for remoteRaft.
|
|
func (r *remoteRaft) removePeer(addr string) error {
|
|
return nil
|
|
}
|
|
|
|
func (r *remoteRaft) peers() ([]string, error) {
|
|
return readPeersJSON(filepath.Join(r.store.path, "peers.json"))
|
|
}
|
|
|
|
func (r *remoteRaft) open() error {
|
|
if err := r.setPeers(r.store.peers); err != nil {
|
|
return err
|
|
}
|
|
|
|
go func() {
|
|
for {
|
|
select {
|
|
case <-r.store.closing:
|
|
return
|
|
default:
|
|
}
|
|
|
|
ms, err := r.store.rpc.fetchMetaData(true)
|
|
if err != nil {
|
|
r.store.Logger.Printf("fetch metastore: %v", err)
|
|
time.Sleep(time.Second)
|
|
continue
|
|
}
|
|
r.updateMetaData(ms)
|
|
}
|
|
}()
|
|
return nil
|
|
}
|
|
|
|
func (r *remoteRaft) close() error {
|
|
return nil
|
|
}
|
|
|
|
// apply applies a serialized command to the raft log.
|
|
func (r *remoteRaft) apply(b []byte) error {
|
|
return fmt.Errorf("cannot apply log while in remote raft state")
|
|
}
|
|
|
|
func (r *remoteRaft) initialize() error {
|
|
return nil
|
|
}
|
|
|
|
func (r *remoteRaft) leader() string {
|
|
if len(r.store.peers) == 0 {
|
|
return ""
|
|
}
|
|
|
|
return r.store.peers[rand.Intn(len(r.store.peers))]
|
|
}
|
|
|
|
func (r *remoteRaft) isLeader() bool {
|
|
return false
|
|
}
|
|
|
|
func (r *remoteRaft) isLocal() bool {
|
|
return false
|
|
}
|
|
|
|
func (r *remoteRaft) lastIndex() uint64 {
|
|
return r.store.cachedData().Index
|
|
}
|
|
|
|
func (r *remoteRaft) sync(index uint64, timeout time.Duration) error {
|
|
//FIXME: jwilder: check index and timeout
|
|
return r.store.invalidate()
|
|
}
|
|
|
|
func (r *remoteRaft) snapshot() error {
|
|
return fmt.Errorf("cannot snapshot while in remote raft state")
|
|
}
|
|
|
|
// readPeersJSON reads a JSON array of peer addresses from the file at path.
//
// A missing file, an empty file and a file containing only whitespace all mean
// "no peers" and yield (nil, nil). Any other read failure, or content that does
// not decode into a list of strings, is returned as an error.
func readPeersJSON(path string) ([]string, error) {
	// Read the file; a file that does not exist is not an error.
	buf, err := ioutil.ReadFile(path)
	if err != nil && !os.IsNotExist(err) {
		return nil, err
	}

	// Check for no peers. Whitespace is trimmed first so that a file holding
	// just a newline is not reported as a decode failure.
	if len(bytes.TrimSpace(buf)) == 0 {
		return nil, nil
	}

	// Decode the peers
	var peers []string
	dec := json.NewDecoder(bytes.NewReader(buf))
	if err := dec.Decode(&peers); err != nil {
		return nil, err
	}

	return peers, nil
}
|