Browse code

Vendor swarmkit 8f053c2

Signed-off-by: Aaron Lehmann <aaron.lehmann@docker.com>

Aaron Lehmann authored on 2017/04/27 03:01:01
Showing 29 changed files
... ...
@@ -41,7 +41,7 @@ github.com/vishvananda/netlink 1e86b2bee5b6a7d377e4c02bb7f98209d6a7297c
41 41
 github.com/BurntSushi/toml f706d00e3de6abe700c994cdd545a1a4915af060
42 42
 github.com/samuel/go-zookeeper d0e0d8e11f318e000a8cc434616d69e329edc374
43 43
 github.com/deckarep/golang-set ef32fa3046d9f249d399f98ebaf9be944430fd1d
44
-github.com/coreos/etcd 824277cb3a577a0e8c829ca9ec557b973fe06d20
44
+github.com/coreos/etcd ea5389a79f40206170582c1ea076191b8622cb8e https://github.com/aaronlehmann/etcd # for https://github.com/coreos/etcd/pull/7830
45 45
 github.com/ugorji/go f1f1a805ed361a0e078bb537e4ea78cd37dcf065
46 46
 github.com/hashicorp/consul v0.5.2
47 47
 github.com/boltdb/bolt fff57c100f4dea1905678da7e90d92429dff2904
... ...
@@ -108,7 +108,7 @@ github.com/docker/containerd 9048e5e50717ea4497b757314bad98ea3763c145
108 108
 github.com/tonistiigi/fifo 1405643975692217d6720f8b54aeee1bf2cd5cf4
109 109
 
110 110
 # cluster
111
-github.com/docker/swarmkit 61a92e8ec074df5769decda985df4a3ab43c77eb
111
+github.com/docker/swarmkit 8f053c2030ebfc90f19f241fb7880e95b9761b7a
112 112
 github.com/gogo/protobuf 8d70fb3182befc465c4a1eac8ad4d38ff49778e2
113 113
 github.com/cloudflare/cfssl 7fb22c8cba7ecaf98e4082d22d65800cf45e042a
114 114
 github.com/google/certificate-transparency d90e65c3a07988180c5b1ece71791c0b6506826e
... ...
@@ -1154,6 +1154,10 @@ func (r *raft) addNode(id uint64) {
1154 1154
 	}
1155 1155
 
1156 1156
 	r.setProgress(id, 0, r.raftLog.lastIndex()+1)
1157
+	// When a node is first added, we should mark it as recently active.
1158
+	// Otherwise, CheckQuorum may cause us to step down if it is invoked
1159
+	// before the added node has a chance to communicate with us.
1160
+	r.prs[id].RecentActive = true
1157 1161
 }
1158 1162
 
1159 1163
 func (r *raft) removeNode(id uint64) {
... ...
@@ -1,6 +1,6 @@
1 1
 # [SwarmKit](https://github.com/docker/swarmkit)
2 2
 
3
-[![GoDoc](https://godoc.org/github.com/docker/swarmkit?status.png)](https://godoc.org/github.com/docker/swarmkit)
3
+[![GoDoc](https://godoc.org/github.com/docker/swarmkit?status.svg)](https://godoc.org/github.com/docker/swarmkit)
4 4
 [![Circle CI](https://circleci.com/gh/docker/swarmkit.svg?style=shield&circle-token=a7bf494e28963703a59de71cf19b73ad546058a7)](https://circleci.com/gh/docker/swarmkit)
5 5
 [![codecov.io](https://codecov.io/github/docker/swarmkit/coverage.svg?branch=master&token=LqD1dzTjsN)](https://codecov.io/github/docker/swarmkit?branch=master)
6 6
 [![Badge Badge](http://doyouevenbadge.com/github.com/docker/swarmkit)](http://doyouevenbadge.com/report/github.com/docker/swarmkit)
... ...
@@ -83,7 +83,7 @@ Requirements:
83 83
 
84 84
 -   Go 1.6 or higher
85 85
 -   A [working golang](https://golang.org/doc/code.html) environment
86
--   [Protobuf 3.x or higher] (https://developers.google.com/protocol-buffers/docs/downloads) to regenerate protocol buffer files (e.g. using `make generate`)
86
+-   [Protobuf 3.x or higher](https://developers.google.com/protocol-buffers/docs/downloads) to regenerate protocol buffer files (e.g. using `make generate`)
87 87
 
88 88
 *SwarmKit* is built in Go and leverages a standard project structure to work well with Go tooling.
89 89
 If you are new to Go, please see [BUILDING.md](BUILDING.md) for a more detailed guide.
... ...
@@ -426,14 +426,19 @@ func (w *worker) Listen(ctx context.Context, reporter StatusReporter) {
426 426
 }
427 427
 
428 428
 func (w *worker) startTask(ctx context.Context, tx *bolt.Tx, task *api.Task) error {
429
-	w.taskevents.Publish(task.Copy())
430 429
 	_, err := w.taskManager(ctx, tx, task) // side-effect taskManager creation.
431 430
 
432 431
 	if err != nil {
433 432
 		log.G(ctx).WithError(err).Error("failed to start taskManager")
433
+		// we ignore this error: it gets reported in the taskStatus within
434
+		// `newTaskManager`. We log it here and move on. If there is an
435
+		// attempted restart, the lack of taskManager will have this retry
436
+		// again.
437
+		return nil
434 438
 	}
435 439
 
436
-	// TODO(stevvooe): Add start method for taskmanager
440
+	// only publish if controller resolution was successful.
441
+	w.taskevents.Publish(task.Copy())
437 442
 	return nil
438 443
 }
439 444
 
... ...
@@ -464,7 +469,7 @@ func (w *worker) newTaskManager(ctx context.Context, tx *bolt.Tx, task *api.Task
464 464
 	}
465 465
 
466 466
 	if err != nil {
467
-		log.G(ctx).Error("controller resolution failed")
467
+		log.G(ctx).WithError(err).Error("controller resolution failed")
468 468
 		return nil, err
469 469
 	}
470 470
 
... ...
@@ -568,9 +573,14 @@ func (w *worker) Subscribe(ctx context.Context, subscription *api.SubscriptionMe
568 568
 		case v := <-ch:
569 569
 			task := v.(*api.Task)
570 570
 			if match(task) {
571
-				w.mu.Lock()
572
-				go w.taskManagers[task.ID].Logs(ctx, *subscription.Options, publisher)
573
-				w.mu.Unlock()
571
+				w.mu.RLock()
572
+				tm, ok := w.taskManagers[task.ID]
573
+				w.mu.RUnlock()
574
+				if !ok {
575
+					continue
576
+				}
577
+
578
+				go tm.Logs(ctx, *subscription.Options, publisher)
574 579
 			}
575 580
 		case <-ctx.Done():
576 581
 			return ctx.Err()
... ...
@@ -14,7 +14,6 @@ import (
14 14
 
15 15
 	"github.com/Sirupsen/logrus"
16 16
 	cfconfig "github.com/cloudflare/cfssl/config"
17
-	events "github.com/docker/go-events"
18 17
 	"github.com/docker/swarmkit/api"
19 18
 	"github.com/docker/swarmkit/connectionbroker"
20 19
 	"github.com/docker/swarmkit/identity"
... ...
@@ -51,13 +50,6 @@ const (
51 51
 	base36DigestLen = 50
52 52
 )
53 53
 
54
-// RenewTLSExponentialBackoff sets the exponential backoff when trying to renew TLS certificates that have expired
55
-var RenewTLSExponentialBackoff = events.ExponentialBackoffConfig{
56
-	Base:   time.Second * 5,
57
-	Factor: time.Second * 5,
58
-	Max:    1 * time.Hour,
59
-}
60
-
61 54
 // SecurityConfig is used to represent a node's security configuration. It includes information about
62 55
 // the RootCA and ServerTLSCreds/ClientTLSCreds transport authenticators to be used for MTLS
63 56
 type SecurityConfig struct {
... ...
@@ -468,96 +460,6 @@ func RenewTLSConfigNow(ctx context.Context, s *SecurityConfig, connBroker *conne
468 468
 	return s.updateTLSCredentials(tlsKeyPair, issuerInfo)
469 469
 }
470 470
 
471
-// RenewTLSConfig will continuously monitor for the necessity of renewing the local certificates, either by
472
-// issuing them locally if key-material is available, or requesting them from a remote CA.
473
-func RenewTLSConfig(ctx context.Context, s *SecurityConfig, connBroker *connectionbroker.Broker, renew <-chan struct{}) <-chan CertificateUpdate {
474
-	updates := make(chan CertificateUpdate)
475
-
476
-	go func() {
477
-		var (
478
-			retry      time.Duration
479
-			forceRetry bool
480
-		)
481
-		expBackoff := events.NewExponentialBackoff(RenewTLSExponentialBackoff)
482
-		defer close(updates)
483
-		for {
484
-			ctx = log.WithModule(ctx, "tls")
485
-			log := log.G(ctx).WithFields(logrus.Fields{
486
-				"node.id":   s.ClientTLSCreds.NodeID(),
487
-				"node.role": s.ClientTLSCreds.Role(),
488
-			})
489
-			// Our starting default will be 5 minutes
490
-			retry = 5 * time.Minute
491
-
492
-			// Since the expiration of the certificate is managed remotely we should update our
493
-			// retry timer on every iteration of this loop.
494
-			// Retrieve the current certificate expiration information.
495
-			validFrom, validUntil, err := readCertValidity(s.KeyReader())
496
-			if err != nil {
497
-				// We failed to read the expiration, let's stick with the starting default
498
-				log.Errorf("failed to read the expiration of the TLS certificate in: %s", s.KeyReader().Target())
499
-
500
-				select {
501
-				case updates <- CertificateUpdate{Err: errors.New("failed to read certificate expiration")}:
502
-				case <-ctx.Done():
503
-					log.Info("shutting down certificate renewal routine")
504
-					return
505
-				}
506
-			} else {
507
-				// If we have an expired certificate, try to renew immediately: the hope that this is a temporary clock skew, or
508
-				// we can issue our own TLS certs.
509
-				if validUntil.Before(time.Now()) {
510
-					log.Warn("the current TLS certificate is expired, so an attempt to renew it will be made immediately")
511
-					// retry immediately(ish) with exponential backoff
512
-					retry = expBackoff.Proceed(nil)
513
-				} else if forceRetry {
514
-					// A forced renewal was requested, but did not succeed yet.
515
-					// retry immediately(ish) with exponential backoff
516
-					retry = expBackoff.Proceed(nil)
517
-				} else {
518
-					// Random retry time between 50% and 80% of the total time to expiration
519
-					retry = calculateRandomExpiry(validFrom, validUntil)
520
-				}
521
-			}
522
-
523
-			log.WithFields(logrus.Fields{
524
-				"time": time.Now().Add(retry),
525
-			}).Debugf("next certificate renewal scheduled for %v from now", retry)
526
-
527
-			select {
528
-			case <-time.After(retry):
529
-				log.Info("renewing certificate")
530
-			case <-renew:
531
-				forceRetry = true
532
-				log.Info("forced certificate renewal")
533
-			case <-ctx.Done():
534
-				log.Info("shutting down certificate renewal routine")
535
-				return
536
-			}
537
-
538
-			// ignore errors - it will just try again later
539
-			var certUpdate CertificateUpdate
540
-			if err := RenewTLSConfigNow(ctx, s, connBroker); err != nil {
541
-				certUpdate.Err = err
542
-				expBackoff.Failure(nil, nil)
543
-			} else {
544
-				certUpdate.Role = s.ClientTLSCreds.Role()
545
-				expBackoff = events.NewExponentialBackoff(RenewTLSExponentialBackoff)
546
-				forceRetry = false
547
-			}
548
-
549
-			select {
550
-			case updates <- certUpdate:
551
-			case <-ctx.Done():
552
-				log.Info("shutting down certificate renewal routine")
553
-				return
554
-			}
555
-		}
556
-	}()
557
-
558
-	return updates
559
-}
560
-
561 471
 // calculateRandomExpiry returns a random duration between 50% and 80% of the
562 472
 // original validity period
563 473
 func calculateRandomExpiry(validFrom, validUntil time.Time) time.Duration {
... ...
@@ -241,7 +241,7 @@ func (r *rootRotationReconciler) batchUpdateNodes(toUpdate []*api.Node) error {
241 241
 	if len(toUpdate) == 0 {
242 242
 		return nil
243 243
 	}
244
-	_, err := r.store.Batch(func(batch *store.Batch) error {
244
+	err := r.store.Batch(func(batch *store.Batch) error {
245 245
 		// Directly update the nodes rather than get + update, and ignore version errors.  Since
246 246
 		// `rootRotationReconciler` should be hooked up to all node update/delete/create events, we should have
247 247
 		// close to the latest versions of all the nodes.  If not, the node will be updated later and the
248 248
new file mode 100644
... ...
@@ -0,0 +1,166 @@
0
+package ca
1
+
2
+import (
3
+	"sync"
4
+	"time"
5
+
6
+	"github.com/Sirupsen/logrus"
7
+	"github.com/docker/go-events"
8
+	"github.com/docker/swarmkit/connectionbroker"
9
+	"github.com/docker/swarmkit/log"
10
+	"github.com/pkg/errors"
11
+	"golang.org/x/net/context"
12
+)
13
+
14
+// RenewTLSExponentialBackoff sets the exponential backoff when trying to renew TLS certificates that have expired
15
+var RenewTLSExponentialBackoff = events.ExponentialBackoffConfig{
16
+	Base:   time.Second * 5,
17
+	Factor: time.Second * 5,
18
+	Max:    1 * time.Hour,
19
+}
20
+
21
+// TLSRenewer handles renewing TLS certificates, either automatically or upon
22
+// request.
23
+type TLSRenewer struct {
24
+	mu           sync.Mutex
25
+	s            *SecurityConfig
26
+	connBroker   *connectionbroker.Broker
27
+	renew        chan struct{}
28
+	expectedRole string
29
+}
30
+
31
+// NewTLSRenewer creates a new TLS renewer. It must be started with Start.
32
+func NewTLSRenewer(s *SecurityConfig, connBroker *connectionbroker.Broker) *TLSRenewer {
33
+	return &TLSRenewer{
34
+		s:          s,
35
+		connBroker: connBroker,
36
+		renew:      make(chan struct{}, 1),
37
+	}
38
+}
39
+
40
+// SetExpectedRole sets the expected role. If a renewal is forced, and the role
41
+// doesn't match this expectation, renewal will be retried with exponential
42
+// backoff until it does match.
43
+func (t *TLSRenewer) SetExpectedRole(role string) {
44
+	t.mu.Lock()
45
+	t.expectedRole = role
46
+	t.mu.Unlock()
47
+}
48
+
49
+// Renew causes the TLSRenewer to renew the certificate (nearly) right away,
50
+// instead of waiting for the next automatic renewal.
51
+func (t *TLSRenewer) Renew() {
52
+	select {
53
+	case t.renew <- struct{}{}:
54
+	default:
55
+	}
56
+}
57
+
58
+// Start will continuously monitor for the necessity of renewing the local certificates, either by
59
+// issuing them locally if key-material is available, or requesting them from a remote CA.
60
+func (t *TLSRenewer) Start(ctx context.Context) <-chan CertificateUpdate {
61
+	updates := make(chan CertificateUpdate)
62
+
63
+	go func() {
64
+		var (
65
+			retry      time.Duration
66
+			forceRetry bool
67
+		)
68
+		expBackoff := events.NewExponentialBackoff(RenewTLSExponentialBackoff)
69
+		defer close(updates)
70
+		for {
71
+			ctx = log.WithModule(ctx, "tls")
72
+			log := log.G(ctx).WithFields(logrus.Fields{
73
+				"node.id":   t.s.ClientTLSCreds.NodeID(),
74
+				"node.role": t.s.ClientTLSCreds.Role(),
75
+			})
76
+			// Our starting default will be 5 minutes
77
+			retry = 5 * time.Minute
78
+
79
+			// Since the expiration of the certificate is managed remotely we should update our
80
+			// retry timer on every iteration of this loop.
81
+			// Retrieve the current certificate expiration information.
82
+			validFrom, validUntil, err := readCertValidity(t.s.KeyReader())
83
+			if err != nil {
84
+				// We failed to read the expiration, let's stick with the starting default
85
+				log.Errorf("failed to read the expiration of the TLS certificate in: %s", t.s.KeyReader().Target())
86
+
87
+				select {
88
+				case updates <- CertificateUpdate{Err: errors.New("failed to read certificate expiration")}:
89
+				case <-ctx.Done():
90
+					log.Info("shutting down certificate renewal routine")
91
+					return
92
+				}
93
+			} else {
94
+				// If we have an expired certificate, try to renew immediately: the hope is that this is a temporary clock skew, or
95
+				// we can issue our own TLS certs.
96
+				if validUntil.Before(time.Now()) {
97
+					log.Warn("the current TLS certificate is expired, so an attempt to renew it will be made immediately")
98
+					// retry immediately(ish) with exponential backoff
99
+					retry = expBackoff.Proceed(nil)
100
+				} else if forceRetry {
101
+					// A forced renewal was requested, but did not succeed yet.
102
+					// retry immediately(ish) with exponential backoff
103
+					retry = expBackoff.Proceed(nil)
104
+				} else {
105
+					// Random retry time between 50% and 80% of the total time to expiration
106
+					retry = calculateRandomExpiry(validFrom, validUntil)
107
+				}
108
+			}
109
+
110
+			log.WithFields(logrus.Fields{
111
+				"time": time.Now().Add(retry),
112
+			}).Debugf("next certificate renewal scheduled for %v from now", retry)
113
+
114
+			select {
115
+			case <-time.After(retry):
116
+				log.Info("renewing certificate")
117
+			case <-t.renew:
118
+				forceRetry = true
119
+				log.Info("forced certificate renewal")
120
+
121
+				// Pause briefly before attempting the renewal,
122
+				// to give the CA a chance to reconcile the
123
+				// desired role.
124
+				select {
125
+				case <-time.After(500 * time.Millisecond):
126
+				case <-ctx.Done():
127
+					log.Info("shutting down certificate renewal routine")
128
+					return
129
+				}
130
+			case <-ctx.Done():
131
+				log.Info("shutting down certificate renewal routine")
132
+				return
133
+			}
134
+
135
+			// ignore errors - it will just try again later
136
+			var certUpdate CertificateUpdate
137
+			if err := RenewTLSConfigNow(ctx, t.s, t.connBroker); err != nil {
138
+				certUpdate.Err = err
139
+				expBackoff.Failure(nil, nil)
140
+			} else {
141
+				newRole := t.s.ClientTLSCreds.Role()
142
+				t.mu.Lock()
143
+				expectedRole := t.expectedRole
144
+				t.mu.Unlock()
145
+				if expectedRole != "" && expectedRole != newRole {
146
+					expBackoff.Failure(nil, nil)
147
+					continue
148
+				}
149
+
150
+				certUpdate.Role = newRole
151
+				expBackoff.Success(nil)
152
+				forceRetry = false
153
+			}
154
+
155
+			select {
156
+			case updates <- certUpdate:
157
+			case <-ctx.Done():
158
+				log.Info("shutting down certificate renewal routine")
159
+				return
160
+			}
161
+		}
162
+	}()
163
+
164
+	return updates
165
+}
... ...
@@ -580,6 +580,7 @@ func (s *Server) UpdateRootCA(ctx context.Context, cluster *api.Cluster) error {
580 580
 
581 581
 	s.secConfigMu.Lock()
582 582
 	defer s.secConfigMu.Unlock()
583
+	firstSeenCluster := s.lastSeenClusterRootCA == nil && s.lastSeenExternalCAs == nil
583 584
 	rootCAChanged := len(rCA.CACert) != 0 && !equality.RootCAEqualStable(s.lastSeenClusterRootCA, rCA)
584 585
 	externalCAChanged := !equality.ExternalCAsEqualStable(s.lastSeenExternalCAs, cluster.Spec.CAConfig.ExternalCAs)
585 586
 	logger := log.G(ctx).WithFields(logrus.Fields{
... ...
@@ -588,7 +589,11 @@ func (s *Server) UpdateRootCA(ctx context.Context, cluster *api.Cluster) error {
588 588
 	})
589 589
 
590 590
 	if rootCAChanged {
591
-		logger.Debug("Updating security config due to change in cluster Root CA")
591
+		setOrUpdate := "set"
592
+		if !firstSeenCluster {
593
+			logger.Debug("Updating security config due to change in cluster Root CA")
594
+			setOrUpdate = "updated"
595
+		}
592 596
 		expiry := DefaultNodeCertExpiration
593 597
 		if cluster.Spec.CAConfig.NodeCertExpiry != nil {
594 598
 			// NodeCertExpiry exists, let's try to parse the duration out of it
... ...
@@ -636,14 +641,16 @@ func (s *Server) UpdateRootCA(ctx context.Context, cluster *api.Cluster) error {
636 636
 			return errors.Wrap(err, "updating Root CA failed")
637 637
 		}
638 638
 		// only update the server cache if we've successfully updated the root CA
639
-		logger.Debug("Root CA updated successfully")
639
+		logger.Debugf("Root CA %s successfully", setOrUpdate)
640 640
 		s.lastSeenClusterRootCA = rCA
641 641
 	}
642 642
 
643 643
 	// we want to update if the external CA changed, or if the root CA changed because the root CA could affect what
644 644
 	// certificate for external CAs we want to filter by
645 645
 	if rootCAChanged || externalCAChanged {
646
-		logger.Debug("Updating security config due to change in cluster Root CA or cluster spec")
646
+		if !firstSeenCluster {
647
+			logger.Debug("Updating security config external CA URLs due to change in cluster Root CA or cluster spec")
648
+		}
647 649
 		wantedExternalCACert := rCA.CACert // we want to only add external CA URLs that use this cert
648 650
 		if rCA.RootRotation != nil {
649 651
 			// we're rotating to a new root, so we only want external CAs with the new root cert
... ...
@@ -1,6 +1,6 @@
1
-// Package identity provides functionality for generating and manager
2
-// identifiers within swarm. This includes entity identification, such as that
3
-// of Service, Task and Network but also cryptographically-secure Node identity.
1
+// Package identity provides functionality for generating and managing
2
+// identifiers within a swarm. This includes entity identification, such as for
3
+// Services, Tasks and Networks but also cryptographically-secure Node identities.
4 4
 //
5 5
 // Random Identifiers
6 6
 //
... ...
@@ -8,10 +8,9 @@
8 8
 // 128 bit numbers encoded in Base36. This method is preferred over UUID4 since
9 9
 // it requires less storage and leverages the full 128 bits of entropy.
10 10
 //
11
-// Generating an identifier is simple. Simply call the `NewID` function, check
12
-// the error and proceed:
11
+// Generating an identifier is simple. Simply call the `NewID` function:
13 12
 //
14
-// 	id, err := NewID()
15
-// 	if err != nil { /* ... handle it, please ... */ }
13
+// 	id := NewID()
16 14
 //
15
+// If an error occurs while generating the ID, it will panic.
17 16
 package identity
... ...
@@ -3,16 +3,16 @@
3 3
 // manages a set of independent allocator processes which can mostly
4 4
 // execute concurrently with only a minimal need for coordination.
5 5
 //
6
-// One of the instances where it needs coordination is when to move a
7
-// task to ALLOCATED state. Since a task can move to ALLOCATED state
8
-// only when all task allocators have completed their service of
9
-// allocation, they all have to agree on that. The way this achieved
10
-// in `allocator` is by creating a `taskBallot` to which all task
11
-// allocators register themselves as mandatory voters. For each task
12
-// that needs allocation, each allocator independently votes to indicate
13
-// the completion of their allocation. Once all registered voters have
14
-// voted then the task is moved to ALLOCATED state.
6
+// One of the instances where it needs coordination is when deciding to
7
+// move a task to the PENDING state. Since a task can move to the
8
+// PENDING state only when all the task allocators have completed,
9
+// they must cooperate. The way `allocator` achieves this is by creating
10
+// a `taskBallot` to which all task allocators register themselves as
11
+// mandatory voters. For each task that needs allocation, each allocator
12
+// independently votes to indicate the completion of their allocation.
13
+// Once all registered voters have voted then the task is moved to the
14
+// PENDING state.
15 15
 //
16
-// Other than the coordination needed for task ALLOCATED state, all
16
+// Other than the coordination needed for task PENDING state, all
17 17
 // the allocators function fairly independently.
18 18
 package allocator
... ...
@@ -95,7 +95,7 @@ func (a *Allocator) doNetworkInit(ctx context.Context) (err error) {
95 95
 		if !na.IsAllocated(nc.ingressNetwork) {
96 96
 			if err := a.allocateNetwork(ctx, nc.ingressNetwork); err != nil {
97 97
 				log.G(ctx).WithError(err).Error("failed allocating ingress network during init")
98
-			} else if _, err := a.store.Batch(func(batch *store.Batch) error {
98
+			} else if err := a.store.Batch(func(batch *store.Batch) error {
99 99
 				if err := a.commitAllocatedNetwork(ctx, batch, nc.ingressNetwork); err != nil {
100 100
 					log.G(ctx).WithError(err).Error("failed committing allocation of ingress network during init")
101 101
 				}
... ...
@@ -134,7 +134,7 @@ func (a *Allocator) doNetworkInit(ctx context.Context) (err error) {
134 134
 		allocatedNetworks = append(allocatedNetworks, n)
135 135
 	}
136 136
 
137
-	if _, err := a.store.Batch(func(batch *store.Batch) error {
137
+	if err := a.store.Batch(func(batch *store.Batch) error {
138 138
 		for _, n := range allocatedNetworks {
139 139
 			if err := a.commitAllocatedNetwork(ctx, batch, n); err != nil {
140 140
 				log.G(ctx).WithError(err).Errorf("failed committing allocation of network %s during init", n.ID)
... ...
@@ -164,7 +164,7 @@ func (a *Allocator) doNetworkInit(ctx context.Context) (err error) {
164 164
 
165 165
 	var allocatedServices []*api.Service
166 166
 	for _, s := range services {
167
-		if nc.nwkAllocator.IsServiceAllocated(s, networkallocator.OnInit) {
167
+		if !nc.nwkAllocator.ServiceNeedsAllocation(s, networkallocator.OnInit) {
168 168
 			continue
169 169
 		}
170 170
 
... ...
@@ -175,7 +175,7 @@ func (a *Allocator) doNetworkInit(ctx context.Context) (err error) {
175 175
 		allocatedServices = append(allocatedServices, s)
176 176
 	}
177 177
 
178
-	if _, err := a.store.Batch(func(batch *store.Batch) error {
178
+	if err := a.store.Batch(func(batch *store.Batch) error {
179 179
 		for _, s := range allocatedServices {
180 180
 			if err := a.commitAllocatedService(ctx, batch, s); err != nil {
181 181
 				log.G(ctx).WithError(err).Errorf("failed committing allocation of service %s during init", s.ID)
... ...
@@ -239,7 +239,7 @@ func (a *Allocator) doNetworkInit(ctx context.Context) (err error) {
239 239
 		}
240 240
 	}
241 241
 
242
-	if _, err := a.store.Batch(func(batch *store.Batch) error {
242
+	if err := a.store.Batch(func(batch *store.Batch) error {
243 243
 		for _, t := range allocatedTasks {
244 244
 			if err := a.commitAllocatedTask(ctx, batch, t); err != nil {
245 245
 				log.G(ctx).WithError(err).Errorf("failed committing allocation of task %s during init", t.ID)
... ...
@@ -275,7 +275,7 @@ func (a *Allocator) doNetworkAlloc(ctx context.Context, ev events.Event) {
275 275
 			break
276 276
 		}
277 277
 
278
-		if _, err := a.store.Batch(func(batch *store.Batch) error {
278
+		if err := a.store.Batch(func(batch *store.Batch) error {
279 279
 			return a.commitAllocatedNetwork(ctx, batch, n)
280 280
 		}); err != nil {
281 281
 			log.G(ctx).WithError(err).Errorf("Failed to commit allocation for network %s", n.ID)
... ...
@@ -317,7 +317,7 @@ func (a *Allocator) doNetworkAlloc(ctx context.Context, ev events.Event) {
317 317
 			break
318 318
 		}
319 319
 
320
-		if nc.nwkAllocator.IsServiceAllocated(s) {
320
+		if !nc.nwkAllocator.ServiceNeedsAllocation(s) {
321 321
 			break
322 322
 		}
323 323
 
... ...
@@ -326,7 +326,7 @@ func (a *Allocator) doNetworkAlloc(ctx context.Context, ev events.Event) {
326 326
 			break
327 327
 		}
328 328
 
329
-		if _, err := a.store.Batch(func(batch *store.Batch) error {
329
+		if err := a.store.Batch(func(batch *store.Batch) error {
330 330
 			return a.commitAllocatedService(ctx, batch, s)
331 331
 		}); err != nil {
332 332
 			log.G(ctx).WithError(err).Errorf("Failed to commit allocation for service %s", s.ID)
... ...
@@ -345,8 +345,8 @@ func (a *Allocator) doNetworkAlloc(ctx context.Context, ev events.Event) {
345 345
 			break
346 346
 		}
347 347
 
348
-		if nc.nwkAllocator.IsServiceAllocated(s) {
349
-			if nc.nwkAllocator.PortsAllocatedInHostPublishMode(s) {
348
+		if !nc.nwkAllocator.ServiceNeedsAllocation(s) {
349
+			if !nc.nwkAllocator.HostPublishPortsNeedUpdate(s) {
350 350
 				break
351 351
 			}
352 352
 			updatePortsInHostPublishMode(s)
... ...
@@ -357,7 +357,7 @@ func (a *Allocator) doNetworkAlloc(ctx context.Context, ev events.Event) {
357 357
 			}
358 358
 		}
359 359
 
360
-		if _, err := a.store.Batch(func(batch *store.Batch) error {
360
+		if err := a.store.Batch(func(batch *store.Batch) error {
361 361
 			return a.commitAllocatedService(ctx, batch, s)
362 362
 		}); err != nil {
363 363
 			log.G(ctx).WithError(err).Errorf("Failed to commit allocation during update for service %s", s.ID)
... ...
@@ -447,7 +447,7 @@ func (a *Allocator) doNodeAlloc(ctx context.Context, ev events.Event) {
447 447
 			return
448 448
 		}
449 449
 
450
-		if _, err := a.store.Batch(func(batch *store.Batch) error {
450
+		if err := a.store.Batch(func(batch *store.Batch) error {
451 451
 			return a.commitAllocatedNode(ctx, batch, node)
452 452
 		}); err != nil {
453 453
 			log.G(ctx).WithError(err).Errorf("Failed to commit allocation of network resources for node %s", node.ID)
... ...
@@ -489,7 +489,7 @@ func (a *Allocator) allocateNodes(ctx context.Context) error {
489 489
 		allocatedNodes = append(allocatedNodes, node)
490 490
 	}
491 491
 
492
-	if _, err := a.store.Batch(func(batch *store.Batch) error {
492
+	if err := a.store.Batch(func(batch *store.Batch) error {
493 493
 		for _, node := range allocatedNodes {
494 494
 			if err := a.commitAllocatedNode(ctx, batch, node); err != nil {
495 495
 				log.G(ctx).WithError(err).Errorf("Failed to commit allocation of network resources for node %s", node.ID)
... ...
@@ -523,7 +523,7 @@ func (a *Allocator) deallocateNodes(ctx context.Context) error {
523 523
 				log.G(ctx).WithError(err).Errorf("Failed freeing network resources for node %s", node.ID)
524 524
 			}
525 525
 			node.Attachment = nil
526
-			if _, err := a.store.Batch(func(batch *store.Batch) error {
526
+			if err := a.store.Batch(func(batch *store.Batch) error {
527 527
 				return a.commitAllocatedNode(ctx, batch, node)
528 528
 			}); err != nil {
529 529
 				log.G(ctx).WithError(err).Errorf("Failed to commit deallocation of network resources for node %s", node.ID)
... ...
@@ -544,7 +544,7 @@ func taskReadyForNetworkVote(t *api.Task, s *api.Service, nc *networkContext) bo
544 544
 	// network configured or service endpoints have been
545 545
 	// allocated.
546 546
 	return (len(t.Networks) == 0 || nc.nwkAllocator.IsTaskAllocated(t)) &&
547
-		(s == nil || nc.nwkAllocator.IsServiceAllocated(s))
547
+		(s == nil || !nc.nwkAllocator.ServiceNeedsAllocation(s))
548 548
 }
549 549
 
550 550
 func taskUpdateNetworks(t *api.Task, networks []*api.NetworkAttachment) {
... ...
@@ -732,28 +732,29 @@ func (a *Allocator) commitAllocatedNode(ctx context.Context, batch *store.Batch,
732 732
 // so that the service allocation invoked on this new service object will trigger the deallocation
733 733
 // of any old publish mode port and allocation of any new one.
734 734
 func updatePortsInHostPublishMode(s *api.Service) {
735
+	// First, remove all host-mode ports from s.Endpoint.Ports
735 736
 	if s.Endpoint != nil {
736 737
 		var portConfigs []*api.PortConfig
737 738
 		for _, portConfig := range s.Endpoint.Ports {
738
-			if portConfig.PublishMode == api.PublishModeIngress {
739
+			if portConfig.PublishMode != api.PublishModeHost {
739 740
 				portConfigs = append(portConfigs, portConfig)
740 741
 			}
741 742
 		}
742 743
 		s.Endpoint.Ports = portConfigs
743 744
 	}
744 745
 
746
+	// Add back all host-mode ports
745 747
 	if s.Spec.Endpoint != nil {
746 748
 		if s.Endpoint == nil {
747 749
 			s.Endpoint = &api.Endpoint{}
748 750
 		}
749 751
 		for _, portConfig := range s.Spec.Endpoint.Ports {
750
-			if portConfig.PublishMode == api.PublishModeIngress {
751
-				continue
752
+			if portConfig.PublishMode == api.PublishModeHost {
753
+				s.Endpoint.Ports = append(s.Endpoint.Ports, portConfig.Copy())
752 754
 			}
753
-			s.Endpoint.Ports = append(s.Endpoint.Ports, portConfig.Copy())
754 755
 		}
755
-		s.Endpoint.Spec = s.Spec.Endpoint.Copy()
756 756
 	}
757
+	s.Endpoint.Spec = s.Spec.Endpoint.Copy()
757 758
 }
758 759
 
759 760
 func (a *Allocator) allocateService(ctx context.Context, s *api.Service) error {
... ...
@@ -886,7 +887,7 @@ func (a *Allocator) allocateTask(ctx context.Context, t *api.Task) (err error) {
886 886
 					return
887 887
 				}
888 888
 
889
-				if !nc.nwkAllocator.IsServiceAllocated(s) {
889
+				if nc.nwkAllocator.ServiceNeedsAllocation(s) {
890 890
 					err = fmt.Errorf("service %s to which this task %s belongs has pending allocations", s.ID, t.ID)
891 891
 					return
892 892
 				}
... ...
@@ -977,22 +978,25 @@ func (a *Allocator) procUnallocatedNetworks(ctx context.Context) {
977 977
 		return
978 978
 	}
979 979
 
980
-	committed, err := a.store.Batch(func(batch *store.Batch) error {
980
+	err := a.store.Batch(func(batch *store.Batch) error {
981 981
 		for _, n := range allocatedNetworks {
982 982
 			if err := a.commitAllocatedNetwork(ctx, batch, n); err != nil {
983 983
 				log.G(ctx).WithError(err).Debugf("Failed to commit allocation of unallocated network %s", n.ID)
984 984
 				continue
985 985
 			}
986
+			delete(nc.unallocatedNetworks, n.ID)
986 987
 		}
987 988
 		return nil
988 989
 	})
989 990
 
990 991
 	if err != nil {
991 992
 		log.G(ctx).WithError(err).Error("Failed to commit allocation of unallocated networks")
992
-	}
993
-
994
-	for _, n := range allocatedNetworks[:committed] {
995
-		delete(nc.unallocatedNetworks, n.ID)
993
+		// We optimistically removed these from nc.unallocatedNetworks
994
+		// above in anticipation of successfully committing the batch,
995
+		// but since the transaction has failed, we requeue them here.
996
+		for _, n := range allocatedNetworks {
997
+			nc.unallocatedNetworks[n.ID] = n
998
+		}
996 999
 	}
997 1000
 }
998 1001
 
... ...
@@ -1000,7 +1004,7 @@ func (a *Allocator) procUnallocatedServices(ctx context.Context) {
1000 1000
 	nc := a.netCtx
1001 1001
 	var allocatedServices []*api.Service
1002 1002
 	for _, s := range nc.unallocatedServices {
1003
-		if !nc.nwkAllocator.IsServiceAllocated(s) {
1003
+		if nc.nwkAllocator.ServiceNeedsAllocation(s) {
1004 1004
 			if err := a.allocateService(ctx, s); err != nil {
1005 1005
 				log.G(ctx).WithError(err).Debugf("Failed allocation of unallocated service %s", s.ID)
1006 1006
 				continue
... ...
@@ -1013,22 +1017,25 @@ func (a *Allocator) procUnallocatedServices(ctx context.Context) {
1013 1013
 		return
1014 1014
 	}
1015 1015
 
1016
-	committed, err := a.store.Batch(func(batch *store.Batch) error {
1016
+	err := a.store.Batch(func(batch *store.Batch) error {
1017 1017
 		for _, s := range allocatedServices {
1018 1018
 			if err := a.commitAllocatedService(ctx, batch, s); err != nil {
1019 1019
 				log.G(ctx).WithError(err).Debugf("Failed to commit allocation of unallocated service %s", s.ID)
1020 1020
 				continue
1021 1021
 			}
1022
+			delete(nc.unallocatedServices, s.ID)
1022 1023
 		}
1023 1024
 		return nil
1024 1025
 	})
1025 1026
 
1026 1027
 	if err != nil {
1027 1028
 		log.G(ctx).WithError(err).Error("Failed to commit allocation of unallocated services")
1028
-	}
1029
-
1030
-	for _, s := range allocatedServices[:committed] {
1031
-		delete(nc.unallocatedServices, s.ID)
1029
+		// We optimistically removed these from nc.unallocatedServices
1030
+		// above in anticipation of successfully committing the batch,
1031
+		// but since the transaction has failed, we requeue them here.
1032
+		for _, s := range allocatedServices {
1033
+			nc.unallocatedServices[s.ID] = s
1034
+		}
1032 1035
 	}
1033 1036
 }
1034 1037
 
... ...
@@ -1058,14 +1065,14 @@ func (a *Allocator) procTasksNetwork(ctx context.Context, onRetry bool) {
1058 1058
 		return
1059 1059
 	}
1060 1060
 
1061
-	committed, err := a.store.Batch(func(batch *store.Batch) error {
1061
+	err := a.store.Batch(func(batch *store.Batch) error {
1062 1062
 		for _, t := range allocatedTasks {
1063 1063
 			err := a.commitAllocatedTask(ctx, batch, t)
1064
-
1065 1064
 			if err != nil {
1066 1065
 				log.G(ctx).WithError(err).Error("task allocation commit failure")
1067 1066
 				continue
1068 1067
 			}
1068
+			delete(toAllocate, t.ID)
1069 1069
 		}
1070 1070
 
1071 1071
 		return nil
... ...
@@ -1073,10 +1080,12 @@ func (a *Allocator) procTasksNetwork(ctx context.Context, onRetry bool) {
1073 1073
 
1074 1074
 	if err != nil {
1075 1075
 		log.G(ctx).WithError(err).Error("failed a store batch operation while processing tasks")
1076
-	}
1077
-
1078
-	for _, t := range allocatedTasks[:committed] {
1079
-		delete(toAllocate, t.ID)
1076
+		// We optimistically removed these from toAllocate above in
1077
+		// anticipation of successfully committing the batch, but since
1078
+		// the transaction has failed, we requeue them here.
1079
+		for _, t := range allocatedTasks {
1080
+			toAllocate[t.ID] = t
1081
+		}
1080 1082
 	}
1081 1083
 }
1082 1084
 
... ...
@@ -1089,12 +1098,7 @@ func updateTaskStatus(t *api.Task, newStatus api.TaskState, message string) {
1089 1089
 
1090 1090
 // IsIngressNetwork returns whether the passed network is an ingress network.
1091 1091
 func IsIngressNetwork(nw *api.Network) bool {
1092
-	if nw.Spec.Ingress {
1093
-		return true
1094
-	}
1095
-	// Check if legacy defined ingress network
1096
-	_, ok := nw.Spec.Annotations.Labels["com.docker.swarm.internal"]
1097
-	return ok && nw.Spec.Annotations.Name == "ingress"
1092
+	return networkallocator.IsIngressNetwork(nw)
1098 1093
 }
1099 1094
 
1100 1095
 // GetIngressNetwork fetches the ingress network from store.
... ...
@@ -153,7 +153,7 @@ func (na *NetworkAllocator) Deallocate(n *api.Network) error {
153 153
 // IP and ports needed by the service.
154 154
 func (na *NetworkAllocator) ServiceAllocate(s *api.Service) (err error) {
155 155
 	if err = na.portAllocator.serviceAllocatePorts(s); err != nil {
156
-		return
156
+		return err
157 157
 	}
158 158
 	defer func() {
159 159
 		if err != nil {
... ...
@@ -169,54 +169,74 @@ func (na *NetworkAllocator) ServiceAllocate(s *api.Service) (err error) {
169 169
 	// If ResolutionMode is DNSRR do not try allocating VIPs, but
170 170
 	// free any VIP from previous state.
171 171
 	if s.Spec.Endpoint != nil && s.Spec.Endpoint.Mode == api.ResolutionModeDNSRoundRobin {
172
-		if s.Endpoint != nil {
173
-			for _, vip := range s.Endpoint.VirtualIPs {
174
-				if err := na.deallocateVIP(vip); err != nil {
175
-					// don't bail here, deallocate as many as possible.
176
-					log.L.WithError(err).
177
-						WithField("vip.network", vip.NetworkID).
178
-						WithField("vip.addr", vip.Addr).Error("error deallocating vip")
179
-				}
172
+		for _, vip := range s.Endpoint.VirtualIPs {
173
+			if err := na.deallocateVIP(vip); err != nil {
174
+				// don't bail here, deallocate as many as possible.
175
+				log.L.WithError(err).
176
+					WithField("vip.network", vip.NetworkID).
177
+					WithField("vip.addr", vip.Addr).Error("error deallocating vip")
180 178
 			}
181
-
182
-			s.Endpoint.VirtualIPs = nil
183 179
 		}
184 180
 
181
+		s.Endpoint.VirtualIPs = nil
182
+
185 183
 		delete(na.services, s.ID)
186
-		return
184
+		return nil
187 185
 	}
188 186
 
189
-	// First allocate VIPs for all the pre-populated endpoint attachments
187
+	specNetworks := serviceNetworks(s)
188
+
189
+	// Allocate VIPs for all the pre-populated endpoint attachments
190
+	eVIPs := s.Endpoint.VirtualIPs[:0]
191
+
192
+vipLoop:
190 193
 	for _, eAttach := range s.Endpoint.VirtualIPs {
191
-		if err = na.allocateVIP(eAttach); err != nil {
192
-			return
193
-		}
194
-	}
194
+		if na.IsVIPOnIngressNetwork(eAttach) {
195
+			if err = na.allocateVIP(eAttach); err != nil {
196
+				return err
197
+			}
198
+			eVIPs = append(eVIPs, eAttach)
199
+			continue vipLoop
195 200
 
196
-	// Always prefer NetworkAttachmentConfig in the TaskSpec
197
-	specNetworks := s.Spec.Task.Networks
198
-	if len(specNetworks) == 0 && s != nil && len(s.Spec.Networks) != 0 {
199
-		specNetworks = s.Spec.Networks
201
+		}
202
+		for _, nAttach := range specNetworks {
203
+			if nAttach.Target == eAttach.NetworkID {
204
+				if err = na.allocateVIP(eAttach); err != nil {
205
+					return err
206
+				}
207
+				eVIPs = append(eVIPs, eAttach)
208
+				continue vipLoop
209
+			}
210
+		}
211
+		// If the network of the VIP is not part of the service spec,
212
+		// deallocate the vip
213
+		na.deallocateVIP(eAttach)
200 214
 	}
201 215
 
202
-outer:
216
+networkLoop:
203 217
 	for _, nAttach := range specNetworks {
204 218
 		for _, vip := range s.Endpoint.VirtualIPs {
205 219
 			if vip.NetworkID == nAttach.Target {
206
-				continue outer
220
+				continue networkLoop
207 221
 			}
208 222
 		}
209 223
 
210 224
 		vip := &api.Endpoint_VirtualIP{NetworkID: nAttach.Target}
211 225
 		if err = na.allocateVIP(vip); err != nil {
212
-			return
226
+			return err
213 227
 		}
214 228
 
215
-		s.Endpoint.VirtualIPs = append(s.Endpoint.VirtualIPs, vip)
229
+		eVIPs = append(eVIPs, vip)
230
+	}
231
+
232
+	if len(eVIPs) > 0 {
233
+		na.services[s.ID] = struct{}{}
234
+	} else {
235
+		delete(na.services, s.ID)
216 236
 	}
217 237
 
218
-	na.services[s.ID] = struct{}{}
219
-	return
238
+	s.Endpoint.VirtualIPs = eVIPs
239
+	return nil
220 240
 }
221 241
 
222 242
 // ServiceDeallocate de-allocates all the network resources such as
... ...
@@ -234,6 +254,7 @@ func (na *NetworkAllocator) ServiceDeallocate(s *api.Service) error {
234 234
 				WithField("vip.addr", vip.Addr).Error("error deallocating vip")
235 235
 		}
236 236
 	}
237
+	s.Endpoint.VirtualIPs = nil
237 238
 
238 239
 	na.portAllocator.serviceDeallocatePorts(s)
239 240
 	delete(na.services, s.ID)
... ...
@@ -284,10 +305,10 @@ func (na *NetworkAllocator) IsTaskAllocated(t *api.Task) bool {
284 284
 	return true
285 285
 }
286 286
 
287
-// PortsAllocatedInHostPublishMode returns if the passed service has its published ports in
288
-// host (non ingress) mode allocated
289
-func (na *NetworkAllocator) PortsAllocatedInHostPublishMode(s *api.Service) bool {
290
-	return na.portAllocator.portsAllocatedInHostPublishMode(s)
287
+// HostPublishPortsNeedUpdate returns true if the passed service needs
288
+// allocations for its published ports in host (non ingress) mode
289
+func (na *NetworkAllocator) HostPublishPortsNeedUpdate(s *api.Service) bool {
290
+	return na.portAllocator.hostPublishPortsNeedUpdate(s)
291 291
 }
292 292
 
293 293
 // ServiceAllocationOpts is struct used for functional options in IsServiceAllocated
... ...
@@ -300,41 +321,74 @@ func OnInit(options *ServiceAllocationOpts) {
300 300
 	options.OnInit = true
301 301
 }
302 302
 
303
-// IsServiceAllocated returns if the passed service has its network resources allocated or not.
304
-// init bool indicates if the func is called during allocator initialization stage.
305
-func (na *NetworkAllocator) IsServiceAllocated(s *api.Service, flags ...func(*ServiceAllocationOpts)) bool {
303
+// ServiceNeedsAllocation returns true if the passed service needs to have network resources allocated/updated.
304
+func (na *NetworkAllocator) ServiceNeedsAllocation(s *api.Service, flags ...func(*ServiceAllocationOpts)) bool {
306 305
 	var options ServiceAllocationOpts
307
-
308 306
 	for _, flag := range flags {
309 307
 		flag(&options)
310 308
 	}
311 309
 
310
+	specNetworks := serviceNetworks(s)
311
+
312 312
 	// If endpoint mode is VIP and allocator does not have the
313
-	// service in VIP allocated set then it is not allocated.
314
-	if (len(s.Spec.Task.Networks) != 0 || len(s.Spec.Networks) != 0) &&
313
+	// service in VIP allocated set then it needs to be allocated.
314
+	if len(specNetworks) != 0 &&
315 315
 		(s.Spec.Endpoint == nil ||
316 316
 			s.Spec.Endpoint.Mode == api.ResolutionModeVirtualIP) {
317
+
317 318
 		if _, ok := na.services[s.ID]; !ok {
318
-			return false
319
+			return true
320
+		}
321
+
322
+		if s.Endpoint == nil || len(s.Endpoint.VirtualIPs) == 0 {
323
+			return true
324
+		}
325
+
326
+		// If the spec has networks which don't have a corresponding VIP,
327
+		// the service needs to be allocated.
328
+	networkLoop:
329
+		for _, net := range specNetworks {
330
+			for _, vip := range s.Endpoint.VirtualIPs {
331
+				if vip.NetworkID == net.Target {
332
+					continue networkLoop
333
+				}
334
+			}
335
+			return true
336
+		}
337
+	}
338
+
339
+	// If the spec no longer has networks attached and has a vip allocated
340
+	// from previous spec the service needs to be allocated.
341
+	if s.Endpoint != nil {
342
+	vipLoop:
343
+		for _, vip := range s.Endpoint.VirtualIPs {
344
+			if na.IsVIPOnIngressNetwork(vip) {
345
+				continue vipLoop
346
+			}
347
+			for _, net := range specNetworks {
348
+				if vip.NetworkID == net.Target {
349
+					continue vipLoop
350
+				}
351
+			}
352
+			return true
319 353
 		}
320 354
 	}
321 355
 
322 356
 	// If the endpoint mode is DNSRR and allocator has the service
323
-	// in VIP allocated set then we return not allocated to make
357
+	// in VIP allocated set then we report that it needs allocation to make
324 358
 	// sure the allocator triggers networkallocator to free up the
325 359
 	// resources if any.
326 360
 	if s.Spec.Endpoint != nil && s.Spec.Endpoint.Mode == api.ResolutionModeDNSRoundRobin {
327 361
 		if _, ok := na.services[s.ID]; ok {
328
-			return false
362
+			return true
329 363
 		}
330 364
 	}
331 365
 
332 366
 	if (s.Spec.Endpoint != nil && len(s.Spec.Endpoint.Ports) != 0) ||
333 367
 		(s.Endpoint != nil && len(s.Endpoint.Ports) != 0) {
334
-		return na.portAllocator.isPortsAllocatedOnInit(s, options.OnInit)
368
+		return !na.portAllocator.isPortsAllocatedOnInit(s, options.OnInit)
335 369
 	}
336
-
337
-	return true
370
+	return false
338 371
 }
339 372
 
340 373
 // IsNodeAllocated returns if the passed node has its network resources allocated or not.
... ...
@@ -828,3 +882,34 @@ func initializeDrivers(reg *drvregistry.DrvRegistry) error {
828 828
 	}
829 829
 	return nil
830 830
 }
831
+
832
+func serviceNetworks(s *api.Service) []*api.NetworkAttachmentConfig {
833
+	// Always prefer NetworkAttachmentConfig in the TaskSpec
834
+	if len(s.Spec.Task.Networks) == 0 && len(s.Spec.Networks) != 0 {
835
+		return s.Spec.Networks
836
+	}
837
+	return s.Spec.Task.Networks
838
+}
839
+
840
+// IsVIPOnIngressNetwork checks whether the vip is on an ingress network
841
+func (na *NetworkAllocator) IsVIPOnIngressNetwork(vip *api.Endpoint_VirtualIP) bool {
842
+	if vip == nil {
843
+		return false
844
+	}
845
+
846
+	localNet := na.getNetwork(vip.NetworkID)
847
+	if localNet != nil && localNet.nw != nil {
848
+		return IsIngressNetwork(localNet.nw)
849
+	}
850
+	return false
851
+}
852
+
853
+// IsIngressNetwork check if the network is an ingress network
854
+func IsIngressNetwork(nw *api.Network) bool {
855
+	if nw.Spec.Ingress {
856
+		return true
857
+	}
858
+	// Check if legacy defined ingress network
859
+	_, ok := nw.Spec.Annotations.Labels["com.docker.swarm.internal"]
860
+	return ok && nw.Spec.Annotations.Name == "ingress"
861
+}
... ...
@@ -269,9 +269,9 @@ func (pa *portAllocator) serviceDeallocatePorts(s *api.Service) {
269 269
 	s.Endpoint.Ports = nil
270 270
 }
271 271
 
272
-func (pa *portAllocator) portsAllocatedInHostPublishMode(s *api.Service) bool {
272
+func (pa *portAllocator) hostPublishPortsNeedUpdate(s *api.Service) bool {
273 273
 	if s.Endpoint == nil && s.Spec.Endpoint == nil {
274
-		return true
274
+		return false
275 275
 	}
276 276
 
277 277
 	portStates := allocatedPorts{}
... ...
@@ -288,13 +288,13 @@ func (pa *portAllocator) portsAllocatedInHostPublishMode(s *api.Service) bool {
288 288
 			if portConfig.PublishMode == api.PublishModeHost &&
289 289
 				portConfig.PublishedPort != 0 {
290 290
 				if portStates.delState(portConfig) == nil {
291
-					return false
291
+					return true
292 292
 				}
293 293
 			}
294 294
 		}
295 295
 	}
296 296
 
297
-	return true
297
+	return false
298 298
 }
299 299
 
300 300
 func (pa *portAllocator) isPortsAllocated(s *api.Service) bool {
... ...
@@ -2,7 +2,6 @@ package controlapi
2 2
 
3 3
 import (
4 4
 	"errors"
5
-	"path/filepath"
6 5
 	"reflect"
7 6
 	"strconv"
8 7
 	"strings"
... ...
@@ -30,6 +29,8 @@ var (
30 30
 	errModeChangeNotAllowed      = errors.New("service mode change is not allowed")
31 31
 )
32 32
 
33
+const minimumDuration = 1 * time.Millisecond
34
+
33 35
 func validateResources(r *api.Resources) error {
34 36
 	if r == nil {
35 37
 		return nil
... ...
@@ -143,16 +144,37 @@ func validateContainerSpec(taskSpec api.TaskSpec) error {
143 143
 		return grpc.Errorf(codes.InvalidArgument, err.Error())
144 144
 	}
145 145
 
146
-	if container.Image == "" {
146
+	if err := validateImage(container.Image); err != nil {
147
+		return err
148
+	}
149
+
150
+	if err := validateMounts(container.Mounts); err != nil {
151
+		return err
152
+	}
153
+
154
+	if err := validateHealthCheck(container.Healthcheck); err != nil {
155
+		return err
156
+	}
157
+
158
+	return nil
159
+}
160
+
161
+// validateImage validates image name in containerSpec
162
+func validateImage(image string) error {
163
+	if image == "" {
147 164
 		return grpc.Errorf(codes.InvalidArgument, "ContainerSpec: image reference must be provided")
148 165
 	}
149 166
 
150
-	if _, err := reference.ParseNormalizedNamed(container.Image); err != nil {
151
-		return grpc.Errorf(codes.InvalidArgument, "ContainerSpec: %q is not a valid repository/tag", container.Image)
167
+	if _, err := reference.ParseNormalizedNamed(image); err != nil {
168
+		return grpc.Errorf(codes.InvalidArgument, "ContainerSpec: %q is not a valid repository/tag", image)
152 169
 	}
170
+	return nil
171
+}
153 172
 
173
+// validateMounts validates if there are duplicate mounts in containerSpec
174
+func validateMounts(mounts []api.Mount) error {
154 175
 	mountMap := make(map[string]bool)
155
-	for _, mount := range container.Mounts {
176
+	for _, mount := range mounts {
156 177
 		if _, exists := mountMap[mount.Target]; exists {
157 178
 			return grpc.Errorf(codes.InvalidArgument, "ContainerSpec: duplicate mount point: %s", mount.Target)
158 179
 		}
... ...
@@ -162,6 +184,49 @@ func validateContainerSpec(taskSpec api.TaskSpec) error {
162 162
 	return nil
163 163
 }
164 164
 
165
+// validateHealthCheck validates configs about container's health check
166
+func validateHealthCheck(hc *api.HealthConfig) error {
167
+	if hc == nil {
168
+		return nil
169
+	}
170
+
171
+	if hc.Interval != nil {
172
+		interval, err := gogotypes.DurationFromProto(hc.Interval)
173
+		if err != nil {
174
+			return err
175
+		}
176
+		if interval != 0 && interval < time.Duration(minimumDuration) {
177
+			return grpc.Errorf(codes.InvalidArgument, "ContainerSpec: Interval in HealthConfig cannot be less than %s", minimumDuration)
178
+		}
179
+	}
180
+
181
+	if hc.Timeout != nil {
182
+		timeout, err := gogotypes.DurationFromProto(hc.Timeout)
183
+		if err != nil {
184
+			return err
185
+		}
186
+		if timeout != 0 && timeout < time.Duration(minimumDuration) {
187
+			return grpc.Errorf(codes.InvalidArgument, "ContainerSpec: Timeout in HealthConfig cannot be less than %s", minimumDuration)
188
+		}
189
+	}
190
+
191
+	if hc.StartPeriod != nil {
192
+		sp, err := gogotypes.DurationFromProto(hc.StartPeriod)
193
+		if err != nil {
194
+			return err
195
+		}
196
+		if sp != 0 && sp < time.Duration(minimumDuration) {
197
+			return grpc.Errorf(codes.InvalidArgument, "ContainerSpec: StartPeriod in HealthConfig cannot be less than %s", minimumDuration)
198
+		}
199
+	}
200
+
201
+	if hc.Retries < 0 {
202
+		return grpc.Errorf(codes.InvalidArgument, "ContainerSpec: Retries in HealthConfig cannot be negative")
203
+	}
204
+
205
+	return nil
206
+}
207
+
165 208
 func validateGenericRuntimeSpec(taskSpec api.TaskSpec) error {
166 209
 	generic := taskSpec.GetGeneric()
167 210
 
... ...
@@ -302,11 +367,9 @@ func validateSecretRefsSpec(spec api.TaskSpec) error {
302 302
 		// If this is a file target, we will ensure filename uniqueness
303 303
 		if secretRef.GetFile() != nil {
304 304
 			fileName := secretRef.GetFile().Name
305
-			// Validate the file name
306
-			if fileName == "" || fileName != filepath.Base(filepath.Clean(fileName)) {
305
+			if fileName == "" {
307 306
 				return grpc.Errorf(codes.InvalidArgument, "malformed file secret reference, invalid target file name provided")
308 307
 			}
309
-
310 308
 			// If this target is already in use, we have conflicting targets
311 309
 			if prevSecretName, ok := existingTargets[fileName]; ok {
312 310
 				return grpc.Errorf(codes.InvalidArgument, "secret references '%s' and '%s' have a conflicting target: '%s'", prevSecretName, secretRef.SecretName, fileName)
... ...
@@ -333,7 +333,7 @@ func (d *Dispatcher) markNodesUnknown(ctx context.Context) error {
333 333
 	if err != nil {
334 334
 		return errors.Wrap(err, "failed to get list of nodes")
335 335
 	}
336
-	_, err = d.store.Batch(func(batch *store.Batch) error {
336
+	err = d.store.Batch(func(batch *store.Batch) error {
337 337
 		for _, n := range nodes {
338 338
 			err := batch.Update(func(tx store.Tx) error {
339 339
 				// check if node is still here
... ...
@@ -600,7 +600,7 @@ func (d *Dispatcher) processUpdates(ctx context.Context) {
600 600
 		"method": "(*Dispatcher).processUpdates",
601 601
 	})
602 602
 
603
-	_, err := d.store.Batch(func(batch *store.Batch) error {
603
+	err := d.store.Batch(func(batch *store.Batch) error {
604 604
 		for taskID, status := range taskUpdates {
605 605
 			err := batch.Update(func(tx store.Tx) error {
606 606
 				logger := log.WithField("task.id", taskID)
... ...
@@ -951,7 +951,7 @@ func (d *Dispatcher) Assignments(r *api.AssignmentsRequest, stream api.Dispatche
951 951
 }
952 952
 
953 953
 func (d *Dispatcher) moveTasksToOrphaned(nodeID string) error {
954
-	_, err := d.store.Batch(func(batch *store.Batch) error {
954
+	err := d.store.Batch(func(batch *store.Batch) error {
955 955
 		var (
956 956
 			tasks []*api.Task
957 957
 			err   error
... ...
@@ -1151,6 +1151,9 @@ func (d *Dispatcher) Session(r *api.SessionRequest, stream api.Dispatcher_Sessio
1151 1151
 		return err
1152 1152
 	}
1153 1153
 
1154
+	clusterUpdatesCh, clusterCancel := d.clusterUpdateQueue.Watch()
1155
+	defer clusterCancel()
1156
+
1154 1157
 	if err := stream.Send(&api.SessionMessage{
1155 1158
 		SessionID:            sessionID,
1156 1159
 		Node:                 nodeObj,
... ...
@@ -1161,9 +1164,6 @@ func (d *Dispatcher) Session(r *api.SessionRequest, stream api.Dispatcher_Sessio
1161 1161
 		return err
1162 1162
 	}
1163 1163
 
1164
-	clusterUpdatesCh, clusterCancel := d.clusterUpdateQueue.Watch()
1165
-	defer clusterCancel()
1166
-
1167 1164
 	// disconnectNode is a helper to forcibly shut down the connection
1168 1165
 	disconnectNode := func() error {
1169 1166
 		// force disconnect by shutting down the stream.
... ...
@@ -129,7 +129,7 @@ func (ce *ConstraintEnforcer) rejectNoncompliantTasks(node *api.Node) {
129 129
 	}
130 130
 
131 131
 	if len(removeTasks) != 0 {
132
-		_, err := ce.store.Batch(func(batch *store.Batch) error {
132
+		err := ce.store.Batch(func(batch *store.Batch) error {
133 133
 			for _, t := range removeTasks {
134 134
 				err := batch.Update(func(tx store.Tx) error {
135 135
 					t = store.GetTask(tx, t.ID)
... ...
@@ -249,7 +249,7 @@ func (g *Orchestrator) removeTasksFromNode(ctx context.Context, node *api.Node)
249 249
 		return
250 250
 	}
251 251
 
252
-	_, err = g.store.Batch(func(batch *store.Batch) error {
252
+	err = g.store.Batch(func(batch *store.Batch) error {
253 253
 		for _, t := range tasks {
254 254
 			// Global orchestrator only removes tasks from globalServices
255 255
 			if _, exists := g.globalServices[t.ServiceID]; exists {
... ...
@@ -296,7 +296,7 @@ func (g *Orchestrator) reconcileServices(ctx context.Context, serviceIDs []strin
296 296
 
297 297
 	updates := make(map[*api.Service][]orchestrator.Slot)
298 298
 
299
-	_, err := g.store.Batch(func(batch *store.Batch) error {
299
+	err := g.store.Batch(func(batch *store.Batch) error {
300 300
 		for _, serviceID := range serviceIDs {
301 301
 			var updateTasks []orchestrator.Slot
302 302
 
... ...
@@ -433,7 +433,7 @@ func (g *Orchestrator) reconcileServicesOneNode(ctx context.Context, serviceIDs
433 433
 		}
434 434
 	}
435 435
 
436
-	_, err = g.store.Batch(func(batch *store.Batch) error {
436
+	err = g.store.Batch(func(batch *store.Batch) error {
437 437
 		for _, serviceID := range serviceIDs {
438 438
 			service, exists := g.globalServices[serviceID]
439 439
 			if !exists {
... ...
@@ -505,7 +505,7 @@ func (g *Orchestrator) tickTasks(ctx context.Context) {
505 505
 	if len(g.restartTasks) == 0 {
506 506
 		return
507 507
 	}
508
-	_, err := g.store.Batch(func(batch *store.Batch) error {
508
+	err := g.store.Batch(func(batch *store.Batch) error {
509 509
 		for taskID := range g.restartTasks {
510 510
 			err := batch.Update(func(tx store.Tx) error {
511 511
 				t := store.GetTask(tx, taskID)
... ...
@@ -108,7 +108,7 @@ func (r *Orchestrator) reconcile(ctx context.Context, service *api.Service) {
108 108
 		log.G(ctx).Debugf("Service %s was scaled up from %d to %d instances", service.ID, numSlots, specifiedSlots)
109 109
 		// Update all current tasks then add missing tasks
110 110
 		r.updater.Update(ctx, r.cluster, service, slotsSlice)
111
-		_, err = r.store.Batch(func(batch *store.Batch) error {
111
+		err = r.store.Batch(func(batch *store.Batch) error {
112 112
 			r.addTasks(ctx, batch, service, runningSlots, deadSlots, specifiedSlots-uint64(numSlots))
113 113
 			r.deleteTasksMap(ctx, batch, deadSlots)
114 114
 			return nil
... ...
@@ -155,7 +155,7 @@ func (r *Orchestrator) reconcile(ctx context.Context, service *api.Service) {
155 155
 		}
156 156
 
157 157
 		r.updater.Update(ctx, r.cluster, service, sortedSlots[:specifiedSlots])
158
-		_, err = r.store.Batch(func(batch *store.Batch) error {
158
+		err = r.store.Batch(func(batch *store.Batch) error {
159 159
 			r.deleteTasksMap(ctx, batch, deadSlots)
160 160
 			r.deleteTasks(ctx, batch, sortedSlots[specifiedSlots:])
161 161
 			return nil
... ...
@@ -165,7 +165,7 @@ func (r *Orchestrator) reconcile(ctx context.Context, service *api.Service) {
165 165
 		}
166 166
 
167 167
 	case specifiedSlots == uint64(numSlots):
168
-		_, err = r.store.Batch(func(batch *store.Batch) error {
168
+		err = r.store.Batch(func(batch *store.Batch) error {
169 169
 			r.deleteTasksMap(ctx, batch, deadSlots)
170 170
 			return nil
171 171
 		})
... ...
@@ -45,7 +45,7 @@ func (r *Orchestrator) handleTaskEvent(ctx context.Context, event events.Event)
45 45
 
46 46
 func (r *Orchestrator) tickTasks(ctx context.Context) {
47 47
 	if len(r.restartTasks) > 0 {
48
-		_, err := r.store.Batch(func(batch *store.Batch) error {
48
+		err := r.store.Batch(func(batch *store.Batch) error {
49 49
 			for taskID := range r.restartTasks {
50 50
 				err := batch.Update(func(tx store.Tx) error {
51 51
 					// TODO(aaronl): optimistic update?
... ...
@@ -41,7 +41,7 @@ func DeleteServiceTasks(ctx context.Context, s *store.MemoryStore, service *api.
41 41
 		return
42 42
 	}
43 43
 
44
-	_, err = s.Batch(func(batch *store.Batch) error {
44
+	err = s.Batch(func(batch *store.Batch) error {
45 45
 		for _, t := range tasks {
46 46
 			err := batch.Update(func(tx store.Tx) error {
47 47
 				if err := store.DeleteTask(tx, t.ID); err != nil {
... ...
@@ -21,7 +21,7 @@ type InitHandler interface {
21 21
 // CheckTasks fixes tasks in the store before orchestrator runs. The previous leader might
22 22
 // not have finished processing their updates and left them in an inconsistent state.
23 23
 func CheckTasks(ctx context.Context, s *store.MemoryStore, readTx store.ReadTx, initHandler InitHandler, startSupervisor *restart.Supervisor) error {
24
-	_, err := s.Batch(func(batch *store.Batch) error {
24
+	err := s.Batch(func(batch *store.Batch) error {
25 25
 		tasks, err := store.FindTasks(readTx, store.All)
26 26
 		if err != nil {
27 27
 			return err
... ...
@@ -378,7 +378,7 @@ func (u *Updater) updateTask(ctx context.Context, slot orchestrator.Slot, update
378 378
 	startThenStop := false
379 379
 	var delayStartCh <-chan struct{}
380 380
 	// Atomically create the updated task and bring down the old one.
381
-	_, err := u.store.Batch(func(batch *store.Batch) error {
381
+	err := u.store.Batch(func(batch *store.Batch) error {
382 382
 		err := batch.Update(func(tx store.Tx) error {
383 383
 			if store.GetService(tx, updated.ServiceID) == nil {
384 384
 				return errors.New("service was deleted")
... ...
@@ -431,7 +431,7 @@ func (u *Updater) updateTask(ctx context.Context, slot orchestrator.Slot, update
431 431
 				u.updatedTasksMu.Unlock()
432 432
 
433 433
 				if startThenStop {
434
-					_, err := u.store.Batch(func(batch *store.Batch) error {
434
+					err := u.store.Batch(func(batch *store.Batch) error {
435 435
 						_, err := u.removeOldTasks(ctx, batch, slot)
436 436
 						if err != nil {
437 437
 							log.G(ctx).WithError(err).WithField("task.id", updated.ID).Warning("failed to remove old task after starting replacement")
... ...
@@ -457,7 +457,7 @@ func (u *Updater) useExistingTask(ctx context.Context, slot orchestrator.Slot, e
457 457
 	}
458 458
 	if len(removeTasks) != 0 || existing.DesiredState != api.TaskStateRunning {
459 459
 		var delayStartCh <-chan struct{}
460
-		_, err := u.store.Batch(func(batch *store.Batch) error {
460
+		err := u.store.Batch(func(batch *store.Batch) error {
461 461
 			var oldTask *api.Task
462 462
 			if len(removeTasks) != 0 {
463 463
 				var err error
... ...
@@ -394,7 +394,7 @@ func (s *Scheduler) applySchedulingDecisions(ctx context.Context, schedulingDeci
394 394
 	successful = make([]schedulingDecision, 0, len(schedulingDecisions))
395 395
 
396 396
 	// Apply changes to master store
397
-	applied, err := s.store.Batch(func(batch *store.Batch) error {
397
+	err := s.store.Batch(func(batch *store.Batch) error {
398 398
 		for len(schedulingDecisions) > 0 {
399 399
 			err := batch.Update(func(tx store.Tx) error {
400 400
 				// Update exactly one task inside this Update
... ...
@@ -452,8 +452,8 @@ func (s *Scheduler) applySchedulingDecisions(ctx context.Context, schedulingDeci
452 452
 
453 453
 	if err != nil {
454 454
 		log.G(ctx).WithError(err).Error("scheduler tick transaction failed")
455
-		failed = append(failed, successful[applied:]...)
456
-		successful = successful[:applied]
455
+		failed = append(failed, successful...)
456
+		successful = nil
457 457
 	}
458 458
 	return
459 459
 }
460 460
deleted file mode 100644
... ...
@@ -1,32 +0,0 @@
1
-// Package state provides interfaces to work with swarm cluster state.
2
-//
3
-// The primary interface is Store, which abstracts storage of this cluster
4
-// state. Store exposes a transactional interface for both reads and writes.
5
-// To perform a read transaction, View accepts a callback function that it
6
-// will invoke with a ReadTx object that gives it a consistent view of the
7
-// state. Similarly, Update accepts a callback function that it will invoke with
8
-// a Tx object that allows reads and writes to happen without interference from
9
-// other transactions.
10
-//
11
-// This is an example of making an update to a Store:
12
-//
13
-//	err := store.Update(func(tx state.Tx) {
14
-//		if err := tx.Nodes().Update(newNode); err != nil {
15
-//			return err
16
-//		}
17
-//		return nil
18
-//	})
19
-//	if err != nil {
20
-//		return fmt.Errorf("transaction failed: %v", err)
21
-//	}
22
-//
23
-// WatchableStore is a version of Store that exposes watch functionality.
24
-// These expose a publish/subscribe queue where code can subscribe to
25
-// changes of interest. This can be combined with the ViewAndWatch function to
26
-// "fork" a store, by making a snapshot and then applying future changes
27
-// to keep the copy in sync. This approach lets consumers of the data
28
-// use their own data structures and implement their own concurrency
29
-// strategies. It can lead to more efficient code because data consumers
30
-// don't necessarily have to lock the main data store if they are
31
-// maintaining their own copies of the state.
32
-package state
... ...
@@ -58,6 +58,10 @@ var (
58 58
 	// ErrMemberUnknown is sent in response to a message from an
59 59
 	// unrecognized peer.
60 60
 	ErrMemberUnknown = errors.New("raft: member unknown")
61
+
62
+	// work around lint
63
+	lostQuorumMessage = "The swarm does not have a leader. It's possible that too few managers are online. Make sure more than half of the managers are online."
64
+	errLostQuorum     = errors.New(lostQuorumMessage)
61 65
 )
62 66
 
63 67
 // LeadershipState indicates whether the node is a leader or follower.
... ...
@@ -68,6 +72,10 @@ const (
68 68
 	IsLeader LeadershipState = iota
69 69
 	// IsFollower indicates that the node is a raft follower.
70 70
 	IsFollower
71
+
72
+	// lostQuorumTimeout is the number of ticks that can elapse with no
73
+	// leader before LeaderConn starts returning an error right away.
74
+	lostQuorumTimeout = 10
71 75
 )
72 76
 
73 77
 // EncryptionKeys are the current and, if necessary, pending DEKs with which to
... ...
@@ -143,6 +151,7 @@ type Node struct {
143 143
 	rotationQueued      bool
144 144
 	clearData           bool
145 145
 	waitForAppliedIndex uint64
146
+	ticksWithNoLeader   uint32
146 147
 }
147 148
 
148 149
 // NodeOptions provides node-level options.
... ...
@@ -207,6 +216,7 @@ func NewNode(opts NodeOptions) *Node {
207 207
 			MaxSizePerMsg:   cfg.MaxSizePerMsg,
208 208
 			MaxInflightMsgs: cfg.MaxInflightMsgs,
209 209
 			Logger:          cfg.Logger,
210
+			CheckQuorum:     cfg.CheckQuorum,
210 211
 		},
211 212
 		doneCh:              make(chan struct{}),
212 213
 		RemovedFromRaft:     make(chan struct{}),
... ...
@@ -528,6 +538,12 @@ func (n *Node) Run(ctx context.Context) error {
528 528
 		select {
529 529
 		case <-n.ticker.C():
530 530
 			n.raftNode.Tick()
531
+
532
+			if n.leader() == raft.None {
533
+				atomic.AddUint32(&n.ticksWithNoLeader, 1)
534
+			} else {
535
+				atomic.StoreUint32(&n.ticksWithNoLeader, 0)
536
+			}
531 537
 		case rd := <-n.raftNode.Ready():
532 538
 			raftConfig := n.getCurrentRaftConfig()
533 539
 
... ...
@@ -698,9 +714,7 @@ func (n *Node) restoreFromSnapshot(ctx context.Context, data []byte) error {
698 698
 
699 699
 	for _, removedMember := range snapCluster.Removed {
700 700
 		n.cluster.RemoveMember(removedMember)
701
-		if err := n.transport.RemovePeer(removedMember); err != nil {
702
-			log.G(ctx).WithError(err).Errorf("failed to remove peer %x from transport", removedMember)
703
-		}
701
+		n.transport.RemovePeer(removedMember)
704 702
 		delete(oldMembers, removedMember)
705 703
 	}
706 704
 
... ...
@@ -1356,6 +1370,10 @@ func (n *Node) getLeaderConn() (*grpc.ClientConn, error) {
1356 1356
 // LeaderConn returns current connection to cluster leader or raftselector.ErrIsLeader
1357 1357
 // if current machine is leader.
1358 1358
 func (n *Node) LeaderConn(ctx context.Context) (*grpc.ClientConn, error) {
1359
+	if atomic.LoadUint32(&n.ticksWithNoLeader) > lostQuorumTimeout {
1360
+		return nil, errLostQuorum
1361
+	}
1362
+
1359 1363
 	cc, err := n.getLeaderConn()
1360 1364
 	if err == nil {
1361 1365
 		return cc, nil
1362 1366
new file mode 100644
... ...
@@ -0,0 +1,32 @@
0
+// Package store provides interfaces to work with swarm cluster state.
1
+//
2
+// The primary interface is MemoryStore, which abstracts storage of this cluster
3
+// state. MemoryStore exposes a transactional interface for both reads and writes.
4
+// To perform a read transaction, View accepts a callback function that it
5
+// will invoke with a ReadTx object that gives it a consistent view of the
6
+// state. Similarly, Update accepts a callback function that it will invoke with
7
+// a Tx object that allows reads and writes to happen without interference from
8
+// other transactions.
9
+//
10
+// This is an example of making an update to a MemoryStore:
11
+//
12
+//	err := store.Update(func(tx store.Tx) error {
13
+//		if err := tx.Nodes().Update(newNode); err != nil {
14
+//			return err
15
+//		}
16
+//		return nil
17
+//	})
18
+//	if err != nil {
19
+//		return fmt.Errorf("transaction failed: %v", err)
20
+//	}
21
+//
22
+// MemoryStore exposes watch functionality.
23
+// It exposes a publish/subscribe queue where code can subscribe to
24
+// changes of interest. This can be combined with the ViewAndWatch function to
25
+// "fork" a store, by making a snapshot and then applying future changes
26
+// to keep the copy in sync. This approach lets consumers of the data
27
+// use their own data structures and implement their own concurrency
28
+// strategies. It can lead to more efficient code because data consumers
29
+// don't necessarily have to lock the main data store if they are
30
+// maintaining their own copies of the state.
31
+package store
... ...
@@ -348,9 +348,6 @@ type Batch struct {
348 348
 	store *MemoryStore
349 349
 	// applied counts the times Update has run successfully
350 350
 	applied int
351
-	// committed is the number of times Update had run successfully as of
352
-	// the time pending changes were committed.
353
-	committed int
354 351
 	// transactionSizeEstimate is the running count of the size of the
355 352
 	// current transaction.
356 353
 	transactionSizeEstimate int
... ...
@@ -434,8 +431,6 @@ func (batch *Batch) commit() error {
434 434
 		return batch.err
435 435
 	}
436 436
 
437
-	batch.committed = batch.applied
438
-
439 437
 	for _, c := range batch.tx.changelist {
440 438
 		batch.store.queue.Publish(c)
441 439
 	}
... ...
@@ -461,9 +456,9 @@ func (batch *Batch) commit() error {
461 461
 // excessive time, or producing a transaction that exceeds the maximum
462 462
 // size.
463 463
 //
464
-// Batch returns the number of calls to batch.Update whose changes were
465
-// successfully committed to the store.
466
-func (s *MemoryStore) Batch(cb func(*Batch) error) (int, error) {
464
+// If Batch returns an error, no guarantees are made about how many updates
465
+// were committed successfully.
466
+func (s *MemoryStore) Batch(cb func(*Batch) error) error {
467 467
 	s.updateLock.Lock()
468 468
 
469 469
 	batch := Batch{
... ...
@@ -474,12 +469,12 @@ func (s *MemoryStore) Batch(cb func(*Batch) error) (int, error) {
474 474
 	if err := cb(&batch); err != nil {
475 475
 		batch.tx.memDBTx.Abort()
476 476
 		s.updateLock.Unlock()
477
-		return batch.committed, err
477
+		return err
478 478
 	}
479 479
 
480 480
 	err := batch.commit()
481 481
 	s.updateLock.Unlock()
482
-	return batch.committed, err
482
+	return err
483 483
 }
484 484
 
485 485
 func (tx *tx) init(memDBTx *memdb.Txn, curVersion *api.Version) {
... ...
@@ -133,29 +133,17 @@ type Node struct {
133 133
 	manager          *manager.Manager
134 134
 	notifyNodeChange chan *agent.NodeChanges // used by the agent to relay node updates from the dispatcher Session stream to (*Node).run
135 135
 	unlockKey        []byte
136
-
137
-	// lastNodeRole is the last-seen value of Node.Role, used to make role
138
-	// changes "edge triggered" and avoid renewal loops.
139
-	lastNodeRole lastSeenRole
140
-	// lastNodeDesiredRole is the last-seen value of Node.Spec.DesiredRole,
141
-	// used to make role changes "edge triggered" and avoid renewal loops.
142
-	// This exists in addition to lastNodeRole to support older CAs that
143
-	// only fill in the DesiredRole field.
144
-	lastNodeDesiredRole lastSeenRole
145 136
 }
146 137
 
147 138
 type lastSeenRole struct {
148
-	role *api.NodeRole
139
+	role api.NodeRole
149 140
 }
150 141
 
151 142
 // observe notes the latest value of this node role, and returns true if it
152 143
 // is the first seen value, or is different from the most recently seen value.
153 144
 func (l *lastSeenRole) observe(newRole api.NodeRole) bool {
154
-	changed := l.role == nil || *l.role != newRole
155
-	if l.role == nil {
156
-		l.role = new(api.NodeRole)
157
-	}
158
-	*l.role = newRole
145
+	changed := l.role != newRole
146
+	l.role = newRole
159 147
 	return changed
160 148
 }
161 149
 
... ...
@@ -244,6 +232,16 @@ func (n *Node) Start(ctx context.Context) error {
244 244
 	return err
245 245
 }
246 246
 
247
+func (n *Node) currentRole() api.NodeRole {
248
+	n.Lock()
249
+	currentRole := api.NodeRoleWorker
250
+	if n.role == ca.ManagerRole {
251
+		currentRole = api.NodeRoleManager
252
+	}
253
+	n.Unlock()
254
+	return currentRole
255
+}
256
+
247 257
 func (n *Node) run(ctx context.Context) (err error) {
248 258
 	defer func() {
249 259
 		n.err = err
... ...
@@ -267,9 +265,11 @@ func (n *Node) run(ctx context.Context) (err error) {
267 267
 		return err
268 268
 	}
269 269
 
270
+	renewer := ca.NewTLSRenewer(securityConfig, n.connBroker)
271
+
270 272
 	ctx = log.WithLogger(ctx, log.G(ctx).WithField("node.id", n.NodeID()))
271 273
 
272
-	taskDBPath := filepath.Join(n.config.StateDir, "worker/tasks.db")
274
+	taskDBPath := filepath.Join(n.config.StateDir, "worker", "tasks.db")
273 275
 	if err := os.MkdirAll(filepath.Dir(taskDBPath), 0777); err != nil {
274 276
 		return err
275 277
 	}
... ...
@@ -282,57 +282,39 @@ func (n *Node) run(ctx context.Context) (err error) {
282 282
 
283 283
 	agentDone := make(chan struct{})
284 284
 
285
-	forceCertRenewal := make(chan struct{})
286
-	renewCert := func() {
287
-		for {
288
-			select {
289
-			case forceCertRenewal <- struct{}{}:
290
-				return
291
-			case <-agentDone:
292
-				return
293
-			case <-n.notifyNodeChange:
294
-				// consume from the channel to avoid blocking the writer
295
-			}
296
-		}
297
-	}
298
-
299 285
 	go func() {
286
+		// lastNodeDesiredRole is the last-seen value of Node.Spec.DesiredRole,
287
+		// used to make role changes "edge triggered" and avoid renewal loops.
288
+		lastNodeDesiredRole := lastSeenRole{role: n.currentRole()}
289
+
300 290
 		for {
301 291
 			select {
302 292
 			case <-agentDone:
303 293
 				return
304 294
 			case nodeChanges := <-n.notifyNodeChange:
305
-				n.Lock()
306
-				currentRole := api.NodeRoleWorker
307
-				if n.role == ca.ManagerRole {
308
-					currentRole = api.NodeRoleManager
309
-				}
310
-				n.Unlock()
295
+				currentRole := n.currentRole()
311 296
 
312 297
 				if nodeChanges.Node != nil {
313 298
 					// This is a bit complex to be backward compatible with older CAs that
314 299
 					// don't support the Node.Role field. They only use what's presently
315 300
 					// called DesiredRole.
316
-					// 1) If we haven't seen the node object before, and the desired role
317
-					//    is different from our current role, renew the cert. This covers
318
-					//    the case of starting up after a role change.
319
-					// 2) If we have seen the node before, the desired role is
320
-					//    different from our current role, and either the actual role or
321
-					//    desired role has changed relative to the last values we saw in
322
-					//    those fields, renew the cert. This covers the case of the role
323
-					//    changing while this node is running, but prevents getting into a
324
-					//    rotation loop if Node.Role isn't what we expect (because it's
325
-					//    unset). We may renew the certificate an extra time (first when
326
-					//    DesiredRole changes, and then again when Role changes).
327
-					// 3) If the server is sending us IssuanceStateRotate, renew the cert as
301
+					// 1) If DesiredRole changes, kick off a certificate renewal. The renewal
302
+					//    is delayed slightly to give Role time to change as well if this is
303
+					//    a newer CA. If the certificate we get back doesn't have the expected
304
+					//    role, we continue renewing with exponential backoff.
305
+					// 2) If the server is sending us IssuanceStateRotate, renew the cert as
328 306
 					//    requested by the CA.
329
-					roleChanged := n.lastNodeRole.observe(nodeChanges.Node.Role)
330
-					desiredRoleChanged := n.lastNodeDesiredRole.observe(nodeChanges.Node.Spec.DesiredRole)
331
-					if (currentRole != nodeChanges.Node.Spec.DesiredRole &&
332
-						((roleChanged && currentRole != nodeChanges.Node.Role) ||
333
-							desiredRoleChanged)) ||
334
-						nodeChanges.Node.Certificate.Status.State == api.IssuanceStateRotate {
335
-						renewCert()
307
+					desiredRoleChanged := lastNodeDesiredRole.observe(nodeChanges.Node.Spec.DesiredRole)
308
+					if desiredRoleChanged {
309
+						switch nodeChanges.Node.Spec.DesiredRole {
310
+						case api.NodeRoleManager:
311
+							renewer.SetExpectedRole(ca.ManagerRole)
312
+						case api.NodeRoleWorker:
313
+							renewer.SetExpectedRole(ca.WorkerRole)
314
+						}
315
+					}
316
+					if desiredRoleChanged || nodeChanges.Node.Certificate.Status.State == api.IssuanceStateRotate {
317
+						renewer.Renew()
336 318
 					}
337 319
 				}
338 320
 
... ...
@@ -364,7 +346,7 @@ func (n *Node) run(ctx context.Context) (err error) {
364 364
 	var wg sync.WaitGroup
365 365
 	wg.Add(3)
366 366
 
367
-	updates := ca.RenewTLSConfig(ctx, securityConfig, n.connBroker, forceCertRenewal)
367
+	updates := renewer.Start(ctx)
368 368
 	go func() {
369 369
 		for certUpdate := range updates {
370 370
 			if certUpdate.Err != nil {
... ...
@@ -387,7 +369,7 @@ func (n *Node) run(ctx context.Context) (err error) {
387 387
 	var managerErr error
388 388
 	var agentErr error
389 389
 	go func() {
390
-		managerErr = n.superviseManager(ctx, securityConfig, paths.RootCA, managerReady, forceCertRenewal) // store err and loop
390
+		managerErr = n.superviseManager(ctx, securityConfig, paths.RootCA, managerReady, renewer) // store err and loop
391 391
 		wg.Done()
392 392
 		cancel()
393 393
 	}()
... ...
@@ -869,7 +851,7 @@ func (n *Node) runManager(ctx context.Context, securityConfig *ca.SecurityConfig
869 869
 	return clearData, nil
870 870
 }
871 871
 
872
-func (n *Node) superviseManager(ctx context.Context, securityConfig *ca.SecurityConfig, rootPaths ca.CertPaths, ready chan struct{}, forceCertRenewal chan struct{}) error {
872
+func (n *Node) superviseManager(ctx context.Context, securityConfig *ca.SecurityConfig, rootPaths ca.CertPaths, ready chan struct{}, renewer *ca.TLSRenewer) error {
873 873
 	for {
874 874
 		if err := n.waitRole(ctx, ca.ManagerRole); err != nil {
875 875
 			return err
... ...
@@ -924,14 +906,7 @@ func (n *Node) superviseManager(ctx context.Context, securityConfig *ca.Security
924 924
 			log.G(ctx).Warn("failed to get worker role after manager stop, forcing certificate renewal")
925 925
 			timer.Reset(roleChangeTimeout)
926 926
 
927
-			select {
928
-			case forceCertRenewal <- struct{}{}:
929
-			case <-timer.C:
930
-				log.G(ctx).Warn("failed to trigger certificate renewal after manager stop, restarting manager")
931
-				return nil
932
-			case <-ctx.Done():
933
-				return ctx.Err()
934
-			}
927
+			renewer.Renew()
935 928
 
936 929
 			// Now that the renewal request has been sent to the
937 930
 			// renewal goroutine, wait for a change in role.
... ...
@@ -8,7 +8,7 @@ github.com/matttproud/golang_protobuf_extensions v1.0.0
8 8
 github.com/grpc-ecosystem/go-grpc-prometheus 6b7015e65d366bf3f19b2b2a000a831940f0f7e0
9 9
 
10 10
 # etcd/raft
11
-github.com/coreos/etcd 824277cb3a577a0e8c829ca9ec557b973fe06d20
11
+github.com/coreos/etcd ea5389a79f40206170582c1ea076191b8622cb8e https://github.com/aaronlehmann/etcd # for https://github.com/coreos/etcd/pull/7830
12 12
 github.com/coreos/go-systemd v12
13 13
 github.com/coreos/pkg v3
14 14
 github.com/prometheus/client_golang 52437c81da6b127a9925d17eb3a382a2e5fd395e