Created a method to handle node state changes together with the associated
cleanup operations.
Realigned the testing client with the new diagnostic interface.
Signed-off-by: Flavio Crisciani <flavio.crisciani@docker.com>
| ... | ... |
@@ -91,7 +91,7 @@ func (s *Server) EnableDebug(ip string, port int) {
|
| 91 | 91 |
} |
| 92 | 92 |
|
| 93 | 93 |
logrus.Infof("Starting the diagnose server listening on %d for commands", port)
|
| 94 |
- srv := &http.Server{Addr: fmt.Sprintf("127.0.0.1:%d", port), Handler: s}
|
|
| 94 |
+ srv := &http.Server{Addr: fmt.Sprintf("%s:%d", ip, port), Handler: s}
|
|
| 95 | 95 |
s.srv = srv |
| 96 | 96 |
s.enable = 1 |
| 97 | 97 |
go func(n *Server) {
|
| ... | ... |
@@ -101,7 +101,6 @@ func (s *Server) EnableDebug(ip string, port int) {
|
| 101 | 101 |
atomic.SwapInt32(&n.enable, 0) |
| 102 | 102 |
} |
| 103 | 103 |
}(s) |
| 104 |
- |
|
| 105 | 104 |
} |
| 106 | 105 |
|
| 107 | 106 |
// DisableDebug stops the debug server and closes the tcp socket |
| ... | ... |
@@ -16,46 +16,28 @@ func (d *delegate) NodeMeta(limit int) []byte {
|
| 16 | 16 |
return []byte{}
|
| 17 | 17 |
} |
| 18 | 18 |
|
| 19 |
-// getNode searches the node inside the tables |
|
| 20 |
-// returns true if the node was respectively in the active list, explicit node leave list or failed list |
|
| 21 |
-func (nDB *NetworkDB) getNode(nEvent *NodeEvent, extract bool) (bool, bool, bool, *node) {
|
|
| 22 |
- var active bool |
|
| 23 |
- var left bool |
|
| 24 |
- var failed bool |
|
| 25 |
- |
|
| 26 |
- for _, nodes := range []map[string]*node{
|
|
| 27 |
- nDB.failedNodes, |
|
| 28 |
- nDB.leftNodes, |
|
| 29 |
- nDB.nodes, |
|
| 30 |
- } {
|
|
| 31 |
- if n, ok := nodes[nEvent.NodeName]; ok {
|
|
| 32 |
- active = &nodes == &nDB.nodes |
|
| 33 |
- left = &nodes == &nDB.leftNodes |
|
| 34 |
- failed = &nodes == &nDB.failedNodes |
|
| 35 |
- if n.ltime >= nEvent.LTime {
|
|
| 36 |
- return active, left, failed, nil |
|
| 37 |
- } |
|
| 38 |
- if extract {
|
|
| 39 |
- delete(nodes, n.Name) |
|
| 40 |
- } |
|
| 41 |
- return active, left, failed, n |
|
| 42 |
- } |
|
| 43 |
- } |
|
| 44 |
- return active, left, failed, nil |
|
| 45 |
-} |
|
| 46 |
- |
|
| 47 | 19 |
func (nDB *NetworkDB) handleNodeEvent(nEvent *NodeEvent) bool {
|
| 48 | 20 |
// Update our local clock if the received messages has newer |
| 49 | 21 |
// time. |
| 50 | 22 |
nDB.networkClock.Witness(nEvent.LTime) |
| 51 | 23 |
|
| 52 | 24 |
nDB.RLock() |
| 53 |
- active, left, _, n := nDB.getNode(nEvent, false) |
|
| 25 |
+ defer nDB.RUnlock() |
|
| 26 |
+ |
|
| 27 |
+ // check if the node exists |
|
| 28 |
+ n, _, _ := nDB.findNode(nEvent.NodeName) |
|
| 54 | 29 |
if n == nil {
|
| 55 |
- nDB.RUnlock() |
|
| 56 | 30 |
return false |
| 57 | 31 |
} |
| 58 |
- nDB.RUnlock() |
|
| 32 |
+ |
|
| 33 |
+ // check if the event is fresh |
|
| 34 |
+ if n.ltime >= nEvent.LTime {
|
|
| 35 |
+ return false |
|
| 36 |
+ } |
|
| 37 |
+ |
|
| 38 |
+ // If we are here it means that the event is fresher and the node is known. Update the lamport time |
|
| 39 |
+ n.ltime = nEvent.LTime |
|
| 40 |
+ |
|
| 59 | 41 |
// If it is a node leave event for a manager and this is the only manager we |
| 60 | 42 |
// know of we want the reconnect logic to kick in. In a single manager |
| 61 | 43 |
// cluster manager's gossip can't be bootstrapped unless some other node |
| ... | ... |
@@ -63,45 +45,32 @@ func (nDB *NetworkDB) handleNodeEvent(nEvent *NodeEvent) bool {
|
| 63 | 63 |
if len(nDB.bootStrapIP) == 1 && nEvent.Type == NodeEventTypeLeave {
|
| 64 | 64 |
for _, ip := range nDB.bootStrapIP {
|
| 65 | 65 |
if ip.Equal(n.Addr) {
|
| 66 |
- n.ltime = nEvent.LTime |
|
| 67 | 66 |
return true |
| 68 | 67 |
} |
| 69 | 68 |
} |
| 70 | 69 |
} |
| 71 | 70 |
|
| 72 |
- n.ltime = nEvent.LTime |
|
| 73 |
- |
|
| 74 | 71 |
switch nEvent.Type {
|
| 75 | 72 |
case NodeEventTypeJoin: |
| 76 |
- if active {
|
|
| 77 |
- // the node is already marked as active nothing to do |
|
| 73 |
+ moved, err := nDB.changeNodeState(n.Name, nodeActiveState) |
|
| 74 |
+ if err != nil {
|
|
| 75 |
+ logrus.WithError(err).Error("unable to find the node to move")
|
|
| 78 | 76 |
return false |
| 79 | 77 |
} |
| 80 |
- nDB.Lock() |
|
| 81 |
- // Because the lock got released on the previous check we have to do it again and re verify the status of the node |
|
| 82 |
- // All of this is to avoid a big lock on the function |
|
| 83 |
- if active, _, _, n = nDB.getNode(nEvent, true); !active && n != nil {
|
|
| 84 |
- n.reapTime = 0 |
|
| 85 |
- nDB.nodes[n.Name] = n |
|
| 78 |
+ if moved {
|
|
| 86 | 79 |
logrus.Infof("%v(%v): Node join event for %s/%s", nDB.config.Hostname, nDB.config.NodeID, n.Name, n.Addr)
|
| 87 | 80 |
} |
| 88 |
- nDB.Unlock() |
|
| 89 |
- return true |
|
| 81 |
+ return moved |
|
| 90 | 82 |
case NodeEventTypeLeave: |
| 91 |
- if left {
|
|
| 92 |
- // the node is already marked as left nothing to do. |
|
| 83 |
+ moved, err := nDB.changeNodeState(n.Name, nodeLeftState) |
|
| 84 |
+ if err != nil {
|
|
| 85 |
+ logrus.WithError(err).Error("unable to find the node to move")
|
|
| 93 | 86 |
return false |
| 94 | 87 |
} |
| 95 |
- nDB.Lock() |
|
| 96 |
- // Because the lock got released on the previous check we have to do it again and re verify the status of the node |
|
| 97 |
- // All of this is to avoid a big lock on the function |
|
| 98 |
- if _, left, _, n = nDB.getNode(nEvent, true); !left && n != nil {
|
|
| 99 |
- n.reapTime = nodeReapInterval |
|
| 100 |
- nDB.leftNodes[n.Name] = n |
|
| 88 |
+ if moved {
|
|
| 101 | 89 |
logrus.Infof("%v(%v): Node leave event for %s/%s", nDB.config.Hostname, nDB.config.NodeID, n.Name, n.Addr)
|
| 102 | 90 |
} |
| 103 |
- nDB.Unlock() |
|
| 104 |
- return true |
|
| 91 |
+ return moved |
|
| 105 | 92 |
} |
| 106 | 93 |
|
| 107 | 94 |
return false |
| ... | ... |
@@ -21,24 +21,6 @@ func (e *eventDelegate) broadcastNodeEvent(addr net.IP, op opType) {
|
| 21 | 21 |
} |
| 22 | 22 |
} |
| 23 | 23 |
|
| 24 |
-func (e *eventDelegate) purgeReincarnation(mn *memberlist.Node) {
|
|
| 25 |
- for name, node := range e.nDB.failedNodes {
|
|
| 26 |
- if node.Addr.Equal(mn.Addr) {
|
|
| 27 |
- logrus.Infof("Node %s/%s, is the new incarnation of the failed node %s/%s", mn.Name, mn.Addr, name, node.Addr)
|
|
| 28 |
- delete(e.nDB.failedNodes, name) |
|
| 29 |
- return |
|
| 30 |
- } |
|
| 31 |
- } |
|
| 32 |
- |
|
| 33 |
- for name, node := range e.nDB.leftNodes {
|
|
| 34 |
- if node.Addr.Equal(mn.Addr) {
|
|
| 35 |
- logrus.Infof("Node %s/%s, is the new incarnation of the shutdown node %s/%s", mn.Name, mn.Addr, name, node.Addr)
|
|
| 36 |
- delete(e.nDB.leftNodes, name) |
|
| 37 |
- return |
|
| 38 |
- } |
|
| 39 |
- } |
|
| 40 |
-} |
|
| 41 |
- |
|
| 42 | 24 |
func (e *eventDelegate) NotifyJoin(mn *memberlist.Node) {
|
| 43 | 25 |
logrus.Infof("Node %s/%s, joined gossip cluster", mn.Name, mn.Addr)
|
| 44 | 26 |
e.broadcastNodeEvent(mn.Addr, opCreate) |
| ... | ... |
@@ -57,44 +39,35 @@ func (e *eventDelegate) NotifyJoin(mn *memberlist.Node) {
|
| 57 | 57 |
// Every node has a unique ID |
| 58 | 58 |
// Check on the base of the IP address if the new node that joined is actually a new incarnation of a previous |
| 59 | 59 |
// failed or shutdown one |
| 60 |
- e.purgeReincarnation(mn) |
|
| 60 |
+ e.nDB.purgeReincarnation(mn) |
|
| 61 | 61 |
|
| 62 | 62 |
e.nDB.nodes[mn.Name] = &node{Node: *mn}
|
| 63 | 63 |
logrus.Infof("Node %s/%s, added to nodes list", mn.Name, mn.Addr)
|
| 64 | 64 |
} |
| 65 | 65 |
|
| 66 | 66 |
func (e *eventDelegate) NotifyLeave(mn *memberlist.Node) {
|
| 67 |
- var failed bool |
|
| 68 | 67 |
logrus.Infof("Node %s/%s, left gossip cluster", mn.Name, mn.Addr)
|
| 69 | 68 |
e.broadcastNodeEvent(mn.Addr, opDelete) |
| 70 |
- // The node left or failed, delete all the entries created by it. |
|
| 71 |
- // If the node was temporary down, deleting the entries will guarantee that the CREATE events will be accepted |
|
| 72 |
- // If the node instead left because was going down, then it makes sense to just delete all its state |
|
| 69 |
+ |
|
| 73 | 70 |
e.nDB.Lock() |
| 74 | 71 |
defer e.nDB.Unlock() |
| 75 |
- e.nDB.deleteNodeFromNetworks(mn.Name) |
|
| 76 |
- e.nDB.deleteNodeTableEntries(mn.Name) |
|
| 77 |
- if n, ok := e.nDB.nodes[mn.Name]; ok {
|
|
| 78 |
- delete(e.nDB.nodes, mn.Name) |
|
| 79 |
- |
|
| 80 |
- // Check if a new incarnation of the same node already joined |
|
| 81 |
- // In that case this node can simply be removed and no further action are needed |
|
| 82 |
- for name, node := range e.nDB.nodes {
|
|
| 83 |
- if node.Addr.Equal(mn.Addr) {
|
|
| 84 |
- logrus.Infof("Node %s/%s, is the new incarnation of the failed node %s/%s", name, node.Addr, mn.Name, mn.Addr)
|
|
| 85 |
- return |
|
| 86 |
- } |
|
| 87 |
- } |
|
| 88 | 72 |
|
| 89 |
- // In case of node failure, keep retrying to reconnect every retryInterval (1sec) for nodeReapInterval (24h) |
|
| 90 |
- // Explicit leave will have already removed the node from the list of nodes (nDB.nodes) and put it into the leftNodes map |
|
| 91 |
- n.reapTime = nodeReapInterval |
|
| 92 |
- e.nDB.failedNodes[mn.Name] = n |
|
| 93 |
- failed = true |
|
| 73 |
+ n, currState, _ := e.nDB.findNode(mn.Name) |
|
| 74 |
+ if n == nil {
|
|
| 75 |
+ logrus.Errorf("Node %s/%s not found in the node lists", mn.Name, mn.Addr)
|
|
| 76 |
+ return |
|
| 94 | 77 |
} |
| 95 |
- |
|
| 96 |
- if failed {
|
|
| 97 |
- logrus.Infof("Node %s/%s, added to failed nodes list", mn.Name, mn.Addr)
|
|
| 78 |
+ // if the node was active it means that it did not send the leave cluster message, so it probably |
|
| 79 |
+ // failed. Otherwise it would already be in the left list, so nothing else has to be done |
|
| 80 |
+ if currState == nodeActiveState {
|
|
| 81 |
+ moved, err := e.nDB.changeNodeState(mn.Name, nodeFailedState) |
|
| 82 |
+ if err != nil {
|
|
| 83 |
+ logrus.WithError(err).Errorf("impossible condition, node %s/%s not present in the list", mn.Name, mn.Addr)
|
|
| 84 |
+ return |
|
| 85 |
+ } |
|
| 86 |
+ if moved {
|
|
| 87 |
+ logrus.Infof("Node %s/%s, added to failed nodes list", mn.Name, mn.Addr)
|
|
| 88 |
+ } |
|
| 98 | 89 |
} |
| 99 | 90 |
} |
| 100 | 91 |
|
| ... | ... |
@@ -5,6 +5,7 @@ import ( |
| 5 | 5 |
"fmt" |
| 6 | 6 |
"io/ioutil" |
| 7 | 7 |
"log" |
| 8 |
+ "net" |
|
| 8 | 9 |
"os" |
| 9 | 10 |
"sync/atomic" |
| 10 | 11 |
"testing" |
| ... | ... |
@@ -12,6 +13,7 @@ import ( |
| 12 | 12 |
|
| 13 | 13 |
"github.com/docker/docker/pkg/stringid" |
| 14 | 14 |
"github.com/docker/go-events" |
| 15 |
+ "github.com/hashicorp/memberlist" |
|
| 15 | 16 |
"github.com/sirupsen/logrus" |
| 16 | 17 |
"github.com/stretchr/testify/assert" |
| 17 | 18 |
"github.com/stretchr/testify/require" |
| ... | ... |
@@ -580,3 +582,156 @@ func TestNetworkDBGarbageCollection(t *testing.T) {
|
| 580 | 580 |
|
| 581 | 581 |
closeNetworkDBInstances(dbs) |
| 582 | 582 |
} |
| 583 |
+ |
|
| 584 |
+func TestFindNode(t *testing.T) {
|
|
| 585 |
+ dbs := createNetworkDBInstances(t, 1, "node", DefaultConfig()) |
|
| 586 |
+ |
|
| 587 |
+ dbs[0].nodes["active"] = &node{Node: memberlist.Node{Name: "active"}}
|
|
| 588 |
+ dbs[0].failedNodes["failed"] = &node{Node: memberlist.Node{Name: "failed"}}
|
|
| 589 |
+ dbs[0].leftNodes["left"] = &node{Node: memberlist.Node{Name: "left"}}
|
|
| 590 |
+ |
|
| 591 |
+ // active nodes is 2 because the testing node is in the list |
|
| 592 |
+ assert.Equal(t, 2, len(dbs[0].nodes)) |
|
| 593 |
+ assert.Equal(t, 1, len(dbs[0].failedNodes)) |
|
| 594 |
+ assert.Equal(t, 1, len(dbs[0].leftNodes)) |
|
| 595 |
+ |
|
| 596 |
+ n, currState, m := dbs[0].findNode("active")
|
|
| 597 |
+ assert.NotNil(t, n) |
|
| 598 |
+ assert.Equal(t, "active", n.Name) |
|
| 599 |
+ assert.Equal(t, nodeActiveState, currState) |
|
| 600 |
+ assert.NotNil(t, m) |
|
| 601 |
+ // delete the entry manually |
|
| 602 |
+ delete(m, "active") |
|
| 603 |
+ |
|
| 604 |
+ // verify that it can no longer be found |
|
| 605 |
+ n, currState, m = dbs[0].findNode("active")
|
|
| 606 |
+ assert.Nil(t, n) |
|
| 607 |
+ assert.Equal(t, nodeNotFound, currState) |
|
| 608 |
+ assert.Nil(t, m) |
|
| 609 |
+ |
|
| 610 |
+ n, currState, m = dbs[0].findNode("failed")
|
|
| 611 |
+ assert.NotNil(t, n) |
|
| 612 |
+ assert.Equal(t, "failed", n.Name) |
|
| 613 |
+ assert.Equal(t, nodeFailedState, currState) |
|
| 614 |
+ assert.NotNil(t, m) |
|
| 615 |
+ |
|
| 616 |
+ // find and remove |
|
| 617 |
+ n, currState, m = dbs[0].findNode("left")
|
|
| 618 |
+ assert.NotNil(t, n) |
|
| 619 |
+ assert.Equal(t, "left", n.Name) |
|
| 620 |
+ assert.Equal(t, nodeLeftState, currState) |
|
| 621 |
+ assert.NotNil(t, m) |
|
| 622 |
+ delete(m, "left") |
|
| 623 |
+ |
|
| 624 |
+ n, currState, m = dbs[0].findNode("left")
|
|
| 625 |
+ assert.Nil(t, n) |
|
| 626 |
+ assert.Equal(t, nodeNotFound, currState) |
|
| 627 |
+ assert.Nil(t, m) |
|
| 628 |
+ |
|
| 629 |
+ closeNetworkDBInstances(dbs) |
|
| 630 |
+} |
|
| 631 |
+ |
|
| 632 |
+func TestChangeNodeState(t *testing.T) {
|
|
| 633 |
+ dbs := createNetworkDBInstances(t, 1, "node", DefaultConfig()) |
|
| 634 |
+ |
|
| 635 |
+ dbs[0].nodes["node1"] = &node{Node: memberlist.Node{Name: "node1"}}
|
|
| 636 |
+ dbs[0].nodes["node2"] = &node{Node: memberlist.Node{Name: "node2"}}
|
|
| 637 |
+ dbs[0].nodes["node3"] = &node{Node: memberlist.Node{Name: "node3"}}
|
|
| 638 |
+ |
|
| 639 |
+ // active nodes is 4 because the testing node is in the list |
|
| 640 |
+ assert.Equal(t, 4, len(dbs[0].nodes)) |
|
| 641 |
+ |
|
| 642 |
+ n, currState, m := dbs[0].findNode("node1")
|
|
| 643 |
+ assert.NotNil(t, n) |
|
| 644 |
+ assert.Equal(t, nodeActiveState, currState) |
|
| 645 |
+ assert.Equal(t, "node1", n.Name) |
|
| 646 |
+ assert.NotNil(t, m) |
|
| 647 |
+ |
|
| 648 |
+ // node1 to failed |
|
| 649 |
+ dbs[0].changeNodeState("node1", nodeFailedState)
|
|
| 650 |
+ |
|
| 651 |
+ n, currState, m = dbs[0].findNode("node1")
|
|
| 652 |
+ assert.NotNil(t, n) |
|
| 653 |
+ assert.Equal(t, nodeFailedState, currState) |
|
| 654 |
+ assert.Equal(t, "node1", n.Name) |
|
| 655 |
+ assert.NotNil(t, m) |
|
| 656 |
+ assert.NotEqual(t, time.Duration(0), n.reapTime) |
|
| 657 |
+ |
|
| 658 |
+ // node1 back to active |
|
| 659 |
+ dbs[0].changeNodeState("node1", nodeActiveState)
|
|
| 660 |
+ |
|
| 661 |
+ n, currState, m = dbs[0].findNode("node1")
|
|
| 662 |
+ assert.NotNil(t, n) |
|
| 663 |
+ assert.Equal(t, nodeActiveState, currState) |
|
| 664 |
+ assert.Equal(t, "node1", n.Name) |
|
| 665 |
+ assert.NotNil(t, m) |
|
| 666 |
+ assert.Equal(t, time.Duration(0), n.reapTime) |
|
| 667 |
+ |
|
| 668 |
+ // node1 to left |
|
| 669 |
+ dbs[0].changeNodeState("node1", nodeLeftState)
|
|
| 670 |
+ dbs[0].changeNodeState("node2", nodeLeftState)
|
|
| 671 |
+ dbs[0].changeNodeState("node3", nodeLeftState)
|
|
| 672 |
+ |
|
| 673 |
+ n, currState, m = dbs[0].findNode("node1")
|
|
| 674 |
+ assert.NotNil(t, n) |
|
| 675 |
+ assert.Equal(t, nodeLeftState, currState) |
|
| 676 |
+ assert.Equal(t, "node1", n.Name) |
|
| 677 |
+ assert.NotNil(t, m) |
|
| 678 |
+ assert.NotEqual(t, time.Duration(0), n.reapTime) |
|
| 679 |
+ |
|
| 680 |
+ n, currState, m = dbs[0].findNode("node2")
|
|
| 681 |
+ assert.NotNil(t, n) |
|
| 682 |
+ assert.Equal(t, nodeLeftState, currState) |
|
| 683 |
+ assert.Equal(t, "node2", n.Name) |
|
| 684 |
+ assert.NotNil(t, m) |
|
| 685 |
+ assert.NotEqual(t, time.Duration(0), n.reapTime) |
|
| 686 |
+ |
|
| 687 |
+ n, currState, m = dbs[0].findNode("node3")
|
|
| 688 |
+ assert.NotNil(t, n) |
|
| 689 |
+ assert.Equal(t, nodeLeftState, currState) |
|
| 690 |
+ assert.Equal(t, "node3", n.Name) |
|
| 691 |
+ assert.NotNil(t, m) |
|
| 692 |
+ assert.NotEqual(t, time.Duration(0), n.reapTime) |
|
| 693 |
+ |
|
| 694 |
+ // active nodes is 1 because the testing node is in the list |
|
| 695 |
+ assert.Equal(t, 1, len(dbs[0].nodes)) |
|
| 696 |
+ assert.Equal(t, 0, len(dbs[0].failedNodes)) |
|
| 697 |
+ assert.Equal(t, 3, len(dbs[0].leftNodes)) |
|
| 698 |
+ |
|
| 699 |
+ closeNetworkDBInstances(dbs) |
|
| 700 |
+} |
|
| 701 |
+ |
|
| 702 |
+func TestNodeReincarnation(t *testing.T) {
|
|
| 703 |
+ dbs := createNetworkDBInstances(t, 1, "node", DefaultConfig()) |
|
| 704 |
+ |
|
| 705 |
+ dbs[0].nodes["node1"] = &node{Node: memberlist.Node{Name: "node1", Addr: net.ParseIP("192.168.1.1")}}
|
|
| 706 |
+ dbs[0].leftNodes["node2"] = &node{Node: memberlist.Node{Name: "node2", Addr: net.ParseIP("192.168.1.2")}}
|
|
| 707 |
+ dbs[0].failedNodes["node3"] = &node{Node: memberlist.Node{Name: "node3", Addr: net.ParseIP("192.168.1.3")}}
|
|
| 708 |
+ |
|
| 709 |
+ // active nodes is 2 because the testing node is in the list |
|
| 710 |
+ assert.Equal(t, 2, len(dbs[0].nodes)) |
|
| 711 |
+ assert.Equal(t, 1, len(dbs[0].failedNodes)) |
|
| 712 |
+ assert.Equal(t, 1, len(dbs[0].leftNodes)) |
|
| 713 |
+ |
|
| 714 |
+ b := dbs[0].purgeReincarnation(&memberlist.Node{Name: "node4", Addr: net.ParseIP("192.168.1.1")})
|
|
| 715 |
+ assert.True(t, b) |
|
| 716 |
+ dbs[0].nodes["node4"] = &node{Node: memberlist.Node{Name: "node4", Addr: net.ParseIP("192.168.1.1")}}
|
|
| 717 |
+ |
|
| 718 |
+ b = dbs[0].purgeReincarnation(&memberlist.Node{Name: "node5", Addr: net.ParseIP("192.168.1.2")})
|
|
| 719 |
+ assert.True(t, b) |
|
| 720 |
+ dbs[0].nodes["node5"] = &node{Node: memberlist.Node{Name: "node5", Addr: net.ParseIP("192.168.1.1")}}
|
|
| 721 |
+ |
|
| 722 |
+ b = dbs[0].purgeReincarnation(&memberlist.Node{Name: "node6", Addr: net.ParseIP("192.168.1.3")})
|
|
| 723 |
+ assert.True(t, b) |
|
| 724 |
+ dbs[0].nodes["node6"] = &node{Node: memberlist.Node{Name: "node6", Addr: net.ParseIP("192.168.1.1")}}
|
|
| 725 |
+ |
|
| 726 |
+ b = dbs[0].purgeReincarnation(&memberlist.Node{Name: "node6", Addr: net.ParseIP("192.168.1.10")})
|
|
| 727 |
+ assert.False(t, b) |
|
| 728 |
+ |
|
| 729 |
+ // active nodes is 4 because the testing node plus node4, node5 and node6 are in the list |
|
| 730 |
+ assert.Equal(t, 4, len(dbs[0].nodes)) |
|
| 731 |
+ assert.Equal(t, 0, len(dbs[0].failedNodes)) |
|
| 732 |
+ assert.Equal(t, 3, len(dbs[0].leftNodes)) |
|
| 733 |
+ |
|
| 734 |
+ closeNetworkDBInstances(dbs) |
|
| 735 |
+} |
| ... | ... |
@@ -399,6 +399,7 @@ func dbGetTable(ctx interface{}, w http.ResponseWriter, r *http.Request) {
|
| 399 | 399 |
Value: encodedValue, |
| 400 | 400 |
Owner: v.owner, |
| 401 | 401 |
}) |
| 402 |
+ i++ |
|
| 402 | 403 |
} |
| 403 | 404 |
log.WithField("response", fmt.Sprintf("%+v", rsp)).Info("get table done")
|
| 404 | 405 |
diagnose.HTTPReply(w, diagnose.CommandSucceed(rsp), json) |
| 405 | 406 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,120 @@ |
| 0 |
+package networkdb |
|
| 1 |
+ |
|
| 2 |
+import ( |
|
| 3 |
+ "fmt" |
|
| 4 |
+ |
|
| 5 |
+ "github.com/hashicorp/memberlist" |
|
| 6 |
+ "github.com/sirupsen/logrus" |
|
| 7 |
+) |
|
| 8 |
+ |
|
| 9 |
+type nodeState int |
|
| 10 |
+ |
|
| 11 |
+const ( |
|
| 12 |
+ nodeNotFound nodeState = -1 |
|
| 13 |
+ nodeActiveState nodeState = 0 |
|
| 14 |
+ nodeLeftState nodeState = 1 |
|
| 15 |
+ nodeFailedState nodeState = 2 |
|
| 16 |
+) |
|
| 17 |
+ |
|
| 18 |
+var nodeStateName = map[nodeState]string{
|
|
| 19 |
+ -1: "NodeNotFound", |
|
| 20 |
+ 0: "NodeActive", |
|
| 21 |
+ 1: "NodeLeft", |
|
| 22 |
+ 2: "NodeFailed", |
|
| 23 |
+} |
|
| 24 |
+ |
|
| 25 |
+// findNode searches for the node in the 3 node lists and returns the node pointer, its state and the list |
|
| 26 |
+// where it was found |
|
| 27 |
+func (nDB *NetworkDB) findNode(nodeName string) (*node, nodeState, map[string]*node) {
|
|
| 28 |
+ for i, nodes := range []map[string]*node{
|
|
| 29 |
+ nDB.nodes, |
|
| 30 |
+ nDB.leftNodes, |
|
| 31 |
+ nDB.failedNodes, |
|
| 32 |
+ } {
|
|
| 33 |
+ if n, ok := nodes[nodeName]; ok {
|
|
| 34 |
+ return n, nodeState(i), nodes |
|
| 35 |
+ } |
|
| 36 |
+ } |
|
| 37 |
+ return nil, nodeNotFound, nil |
|
| 38 |
+} |
|
| 39 |
+ |
|
| 40 |
+// changeNodeState changes the state of the node specified, returns true if the node was moved, |
|
| 41 |
+// false if there was no need to change the node state. Error will be returned if the node does not |
|
| 42 |
+// exist |
|
| 43 |
+func (nDB *NetworkDB) changeNodeState(nodeName string, newState nodeState) (bool, error) {
|
|
| 44 |
+ n, currState, m := nDB.findNode(nodeName) |
|
| 45 |
+ if n == nil {
|
|
| 46 |
+ return false, fmt.Errorf("node %s not found", nodeName)
|
|
| 47 |
+ } |
|
| 48 |
+ |
|
| 49 |
+ switch newState {
|
|
| 50 |
+ case nodeActiveState: |
|
| 51 |
+ if currState == nodeActiveState {
|
|
| 52 |
+ return false, nil |
|
| 53 |
+ } |
|
| 54 |
+ |
|
| 55 |
+ delete(m, nodeName) |
|
| 56 |
+ // reset the node reap time |
|
| 57 |
+ n.reapTime = 0 |
|
| 58 |
+ nDB.nodes[nodeName] = n |
|
| 59 |
+ case nodeLeftState: |
|
| 60 |
+ if currState == nodeLeftState {
|
|
| 61 |
+ return false, nil |
|
| 62 |
+ } |
|
| 63 |
+ |
|
| 64 |
+ delete(m, nodeName) |
|
| 65 |
+ nDB.leftNodes[nodeName] = n |
|
| 66 |
+ case nodeFailedState: |
|
| 67 |
+ if currState == nodeFailedState {
|
|
| 68 |
+ return false, nil |
|
| 69 |
+ } |
|
| 70 |
+ |
|
| 71 |
+ delete(m, nodeName) |
|
| 72 |
+ nDB.failedNodes[nodeName] = n |
|
| 73 |
+ } |
|
| 74 |
+ |
|
| 75 |
+ logrus.Infof("Node %s change state %s --> %s", nodeName, nodeStateName[currState], nodeStateName[newState])
|
|
| 76 |
+ |
|
| 77 |
+ if newState == nodeLeftState || newState == nodeFailedState {
|
|
| 78 |
+ // set the node reap time, if not already set |
|
| 79 |
+ // It is possible that a node passes from failed to left and the reaptime was already set so keep that value |
|
| 80 |
+ if n.reapTime == 0 {
|
|
| 81 |
+ n.reapTime = nodeReapInterval |
|
| 82 |
+ } |
|
| 83 |
+ // The node leave or fails, delete all the entries created by it. |
|
| 84 |
+ // If the node was temporary down, deleting the entries will guarantee that the CREATE events will be accepted |
|
| 85 |
+ // If the node instead left because was going down, then it makes sense to just delete all its state |
|
| 86 |
+ nDB.deleteNodeFromNetworks(n.Name) |
|
| 87 |
+ nDB.deleteNodeTableEntries(n.Name) |
|
| 88 |
+ } |
|
| 89 |
+ |
|
| 90 |
+ return true, nil |
|
| 91 |
+} |
|
| 92 |
+ |
|
| 93 |
+func (nDB *NetworkDB) purgeReincarnation(mn *memberlist.Node) bool {
|
|
| 94 |
+ for name, node := range nDB.nodes {
|
|
| 95 |
+ if node.Addr.Equal(mn.Addr) && node.Port == mn.Port && mn.Name != name {
|
|
| 96 |
+ logrus.Infof("Node %s/%s, is the new incarnation of the active node %s/%s", mn.Name, mn.Addr, name, node.Addr)
|
|
| 97 |
+ nDB.changeNodeState(name, nodeLeftState) |
|
| 98 |
+ return true |
|
| 99 |
+ } |
|
| 100 |
+ } |
|
| 101 |
+ |
|
| 102 |
+ for name, node := range nDB.failedNodes {
|
|
| 103 |
+ if node.Addr.Equal(mn.Addr) && node.Port == mn.Port && mn.Name != name {
|
|
| 104 |
+ logrus.Infof("Node %s/%s, is the new incarnation of the failed node %s/%s", mn.Name, mn.Addr, name, node.Addr)
|
|
| 105 |
+ nDB.changeNodeState(name, nodeLeftState) |
|
| 106 |
+ return true |
|
| 107 |
+ } |
|
| 108 |
+ } |
|
| 109 |
+ |
|
| 110 |
+ for name, node := range nDB.leftNodes {
|
|
| 111 |
+ if node.Addr.Equal(mn.Addr) && node.Port == mn.Port && mn.Name != name {
|
|
| 112 |
+ logrus.Infof("Node %s/%s, is the new incarnation of the shutdown node %s/%s", mn.Name, mn.Addr, name, node.Addr)
|
|
| 113 |
+ nDB.changeNodeState(name, nodeLeftState) |
|
| 114 |
+ return true |
|
| 115 |
+ } |
|
| 116 |
+ } |
|
| 117 |
+ |
|
| 118 |
+ return false |
|
| 119 |
+} |
| ... | ... |
@@ -74,7 +74,7 @@ func leaveNetwork(ip, port, network string, doneCh chan resultTuple) {
|
| 74 | 74 |
} |
| 75 | 75 |
|
| 76 | 76 |
func writeTableKey(ip, port, networkName, tableName, key string) {
|
| 77 |
- createPath := "/createentry?nid=" + networkName + "&tname=" + tableName + "&value=v&key=" |
|
| 77 |
+ createPath := "/createentry?unsafe&nid=" + networkName + "&tname=" + tableName + "&value=v&key=" |
|
| 78 | 78 |
httpGetFatalError(ip, port, createPath+key) |
| 79 | 79 |
} |
| 80 | 80 |
|
| ... | ... |
@@ -91,7 +91,7 @@ func clusterPeersNumber(ip, port string, doneCh chan resultTuple) {
|
| 91 | 91 |
doneCh <- resultTuple{id: ip, result: -1}
|
| 92 | 92 |
return |
| 93 | 93 |
} |
| 94 |
- peersRegexp := regexp.MustCompile(`Total peers: ([0-9]+)`) |
|
| 94 |
+ peersRegexp := regexp.MustCompile(`total entries: ([0-9]+)`) |
|
| 95 | 95 |
peersNum, _ := strconv.Atoi(peersRegexp.FindStringSubmatch(string(body))[1]) |
| 96 | 96 |
|
| 97 | 97 |
doneCh <- resultTuple{id: ip, result: peersNum}
|
| ... | ... |
@@ -105,7 +105,7 @@ func networkPeersNumber(ip, port, networkName string, doneCh chan resultTuple) {
|
| 105 | 105 |
doneCh <- resultTuple{id: ip, result: -1}
|
| 106 | 106 |
return |
| 107 | 107 |
} |
| 108 |
- peersRegexp := regexp.MustCompile(`Total peers: ([0-9]+)`) |
|
| 108 |
+ peersRegexp := regexp.MustCompile(`total entries: ([0-9]+)`) |
|
| 109 | 109 |
peersNum, _ := strconv.Atoi(peersRegexp.FindStringSubmatch(string(body))[1]) |
| 110 | 110 |
|
| 111 | 111 |
doneCh <- resultTuple{id: ip, result: peersNum}
|
| ... | ... |
@@ -119,7 +119,7 @@ func dbTableEntriesNumber(ip, port, networkName, tableName string, doneCh chan r |
| 119 | 119 |
doneCh <- resultTuple{id: ip, result: -1}
|
| 120 | 120 |
return |
| 121 | 121 |
} |
| 122 |
- elementsRegexp := regexp.MustCompile(`total elements: ([0-9]+)`) |
|
| 122 |
+ elementsRegexp := regexp.MustCompile(`total entries: ([0-9]+)`) |
|
| 123 | 123 |
entriesNum, _ := strconv.Atoi(elementsRegexp.FindStringSubmatch(string(body))[1]) |
| 124 | 124 |
doneCh <- resultTuple{id: ip, result: entriesNum}
|
| 125 | 125 |
} |
| ... | ... |
@@ -66,6 +66,8 @@ func Server(args []string) {
|
| 66 | 66 |
server.RegisterHandler(nil, testerPaths2Func) |
| 67 | 67 |
server.RegisterHandler(nDB, dummyclient.DummyClientPaths2Func) |
| 68 | 68 |
server.EnableDebug("", port)
|
| 69 |
+ // block here |
|
| 70 |
+ select {}
|
|
| 69 | 71 |
} |
| 70 | 72 |
|
| 71 | 73 |
func getIPInterface(name string) (string, error) {
|