Browse code

cli: Add options for Raft snapshotting

Add the following options to "swarm init" and "swarm update":

- --max-snapshots: Retain this many old Raft snapshots in addition
to the latest one

- --snapshot-interval: Number of log entries between Raft snapshots

These options already existed in SwarmKit and the Docker API but were
never exposed in the CLI. I'm adding them here to fix this oversight.

--max-snapshots may be useful for debugging purposes, and for more
conservative users who want to store rolling backups of old versions of
the Raft state.

--snapshot-interval is most useful for performance tuning. The default
value of 10000 may not be ideal for some setups.

There is also a LogEntriesForSlowFollowers option that is not exposed. I
decided not to expose it along with these others because I don't think
it's generally useful (and I'm not sure what I would call the CLI flag).
But if people want, I can expose it for the sake of completeness.

Signed-off-by: Aaron Lehmann <aaron.lehmann@docker.com>

Aaron Lehmann authored on 2016/11/03 04:29:51
Showing 9 changed files
... ...
@@ -60,7 +60,7 @@ type RaftConfig struct {
60 60
 
61 61
 	// KeepOldSnapshots is the number of snapshots to keep beyond the
62 62
 	// current snapshot.
63
-	KeepOldSnapshots uint64 `json:",omitempty"`
63
+	KeepOldSnapshots *uint64 `json:",omitempty"`
64 64
 
65 65
 	// LogEntriesForSlowFollowers is the number of log entries to keep
66 66
 	// around to sync up slow followers after a snapshot is created.
... ...
@@ -24,6 +24,8 @@ const (
24 24
 	flagToken               = "token"
25 25
 	flagTaskHistoryLimit    = "task-history-limit"
26 26
 	flagExternalCA          = "external-ca"
27
+	flagMaxSnapshots        = "max-snapshots"
28
+	flagSnapshotInterval    = "snapshot-interval"
27 29
 )
28 30
 
29 31
 type swarmOptions struct {
... ...
@@ -31,6 +33,8 @@ type swarmOptions struct {
31 31
 	dispatcherHeartbeat time.Duration
32 32
 	nodeCertExpiry      time.Duration
33 33
 	externalCA          ExternalCAOption
34
+	maxSnapshots        uint64
35
+	snapshotInterval    uint64
34 36
 }
35 37
 
36 38
 // NodeAddrOption is a pflag.Value for listening addresses
... ...
@@ -167,11 +171,11 @@ func addSwarmFlags(flags *pflag.FlagSet, opts *swarmOptions) {
167 167
 	flags.DurationVar(&opts.dispatcherHeartbeat, flagDispatcherHeartbeat, time.Duration(5*time.Second), "Dispatcher heartbeat period")
168 168
 	flags.DurationVar(&opts.nodeCertExpiry, flagCertExpiry, time.Duration(90*24*time.Hour), "Validity period for node certificates")
169 169
 	flags.Var(&opts.externalCA, flagExternalCA, "Specifications of one or more certificate signing endpoints")
170
+	flags.Uint64Var(&opts.maxSnapshots, flagMaxSnapshots, 0, "Number of additional Raft snapshots to retain")
171
+	flags.Uint64Var(&opts.snapshotInterval, flagSnapshotInterval, 10000, "Number of log entries between Raft snapshots")
170 172
 }
171 173
 
172
-func (opts *swarmOptions) ToSpec(flags *pflag.FlagSet) swarm.Spec {
173
-	spec := swarm.Spec{}
174
-
174
+func (opts *swarmOptions) mergeSwarmSpec(spec *swarm.Spec, flags *pflag.FlagSet) {
175 175
 	if flags.Changed(flagTaskHistoryLimit) {
176 176
 		spec.Orchestration.TaskHistoryRetentionLimit = &opts.taskHistoryLimit
177 177
 	}
... ...
@@ -184,5 +188,16 @@ func (opts *swarmOptions) ToSpec(flags *pflag.FlagSet) swarm.Spec {
184 184
 	if flags.Changed(flagExternalCA) {
185 185
 		spec.CAConfig.ExternalCAs = opts.externalCA.Value()
186 186
 	}
187
+	if flags.Changed(flagMaxSnapshots) {
188
+		spec.Raft.KeepOldSnapshots = &opts.maxSnapshots
189
+	}
190
+	if flags.Changed(flagSnapshotInterval) {
191
+		spec.Raft.SnapshotInterval = opts.snapshotInterval
192
+	}
193
+}
194
+
195
+func (opts *swarmOptions) ToSpec(flags *pflag.FlagSet) swarm.Spec {
196
+	var spec swarm.Spec
197
+	opts.mergeSwarmSpec(&spec, flags)
187 198
 	return spec
188 199
 }
... ...
@@ -39,10 +39,7 @@ func runUpdate(dockerCli *command.DockerCli, flags *pflag.FlagSet, opts swarmOpt
39 39
 		return err
40 40
 	}
41 41
 
42
-	err = mergeSwarm(&swarm, flags)
43
-	if err != nil {
44
-		return err
45
-	}
42
+	opts.mergeSwarmSpec(&swarm.Spec, flags)
46 43
 
47 44
 	err = client.SwarmUpdate(ctx, swarm.Version, swarm.Spec, updateFlags)
48 45
 	if err != nil {
... ...
@@ -53,31 +50,3 @@ func runUpdate(dockerCli *command.DockerCli, flags *pflag.FlagSet, opts swarmOpt
53 53
 
54 54
 	return nil
55 55
 }
56
-
57
-func mergeSwarm(swarm *swarm.Swarm, flags *pflag.FlagSet) error {
58
-	spec := &swarm.Spec
59
-
60
-	if flags.Changed(flagTaskHistoryLimit) {
61
-		taskHistoryRetentionLimit, _ := flags.GetInt64(flagTaskHistoryLimit)
62
-		spec.Orchestration.TaskHistoryRetentionLimit = &taskHistoryRetentionLimit
63
-	}
64
-
65
-	if flags.Changed(flagDispatcherHeartbeat) {
66
-		if v, err := flags.GetDuration(flagDispatcherHeartbeat); err == nil {
67
-			spec.Dispatcher.HeartbeatPeriod = v
68
-		}
69
-	}
70
-
71
-	if flags.Changed(flagCertExpiry) {
72
-		if v, err := flags.GetDuration(flagCertExpiry); err == nil {
73
-			spec.CAConfig.NodeCertExpiry = v
74
-		}
75
-	}
76
-
77
-	if flags.Changed(flagExternalCA) {
78
-		value := flags.Lookup(flagExternalCA).Value.(*ExternalCAOption)
79
-		spec.CAConfig.ExternalCAs = value.Value()
80
-	}
81
-
82
-	return nil
83
-}
... ...
@@ -114,6 +114,9 @@ func prettyPrintInfo(dockerCli *command.DockerCli, info types.Info) error {
114 114
 			fmt.Fprintf(dockerCli.Out(), "  Task History Retention Limit: %d\n", taskHistoryRetentionLimit)
115 115
 			fmt.Fprintf(dockerCli.Out(), " Raft:\n")
116 116
 			fmt.Fprintf(dockerCli.Out(), "  Snapshot Interval: %d\n", info.Swarm.Cluster.Spec.Raft.SnapshotInterval)
117
+			if info.Swarm.Cluster.Spec.Raft.KeepOldSnapshots != nil {
118
+				fmt.Fprintf(dockerCli.Out(), "  Number of Old Snapshots to Retain: %d\n", *info.Swarm.Cluster.Spec.Raft.KeepOldSnapshots)
119
+			}
117 120
 			fmt.Fprintf(dockerCli.Out(), "  Heartbeat Tick: %d\n", info.Swarm.Cluster.Spec.Raft.HeartbeatTick)
118 121
 			fmt.Fprintf(dockerCli.Out(), "  Election Tick: %d\n", info.Swarm.Cluster.Spec.Raft.ElectionTick)
119 122
 			fmt.Fprintf(dockerCli.Out(), " Dispatcher:\n")
... ...
@@ -2841,14 +2841,14 @@ _docker_swarm_leave() {
2841 2841
 
2842 2842
 _docker_swarm_update() {
2843 2843
 	case "$prev" in
2844
-		--cert-expiry|--dispatcher-heartbeat|--task-history-limit)
2844
+		--cert-expiry|--dispatcher-heartbeat|--max-snapshots|--snapshot-interval|--task-history-limit)
2845 2845
 			return
2846 2846
 			;;
2847 2847
 	esac
2848 2848
 
2849 2849
 	case "$cur" in
2850 2850
 		-*)
2851
-			COMPREPLY=( $( compgen -W "--cert-expiry --dispatcher-heartbeat --help --task-history-limit" -- "$cur" ) )
2851
+			COMPREPLY=( $( compgen -W "--cert-expiry --dispatcher-heartbeat --help --max-snapshots --snapshot-interval --task-history-limit" -- "$cur" ) )
2852 2852
 			;;
2853 2853
 	esac
2854 2854
 }
... ...
@@ -1630,7 +1630,10 @@ __docker_swarm_subcommand() {
1630 1630
                 "($help)--advertise-addr[Advertised address]:ip\:port: " \
1631 1631
                 "($help)*--external-ca=[Specifications of one or more certificate signing endpoints]:endpoint: " \
1632 1632
                 "($help)--force-new-cluster[Force create a new cluster from current state]" \
1633
-                "($help)--listen-addr=[Listen address]:ip\:port: " && ret=0
1633
+                "($help)--listen-addr=[Listen address]:ip\:port: " \
1634
+                "($help)--max-snapshots[Number of additional Raft snapshots to retain]" \
1635
+                "($help)--snapshot-interval[Number of log entries between Raft snapshots]" \
1636
+                "($help)--task-history-limit=[Task history retention limit]:limit: " && ret=0
1634 1637
             ;;
1635 1638
         (join)
1636 1639
             _arguments $(__docker_arguments) \
... ...
@@ -1655,7 +1658,10 @@ __docker_swarm_subcommand() {
1655 1655
             _arguments $(__docker_arguments) \
1656 1656
                 $opts_help \
1657 1657
                 "($help)--cert-expiry=[Validity period for node certificates]:duration: " \
1658
+                "($help)*--external-ca=[Specifications of one or more certificate signing endpoints]:endpoint: " \
1658 1659
                 "($help)--dispatcher-heartbeat=[Dispatcher heartbeat period]:duration: " \
1660
+                "($help)--max-snapshots[Number of additional Raft snapshots to retain]" \
1661
+                "($help)--snapshot-interval[Number of log entries between Raft snapshots]" \
1659 1662
                 "($help)--task-history-limit=[Task history retention limit]:limit: " && ret=0
1660 1663
             ;;
1661 1664
         (help)
... ...
@@ -21,7 +21,7 @@ func SwarmFromGRPC(c swarmapi.Cluster) types.Swarm {
21 21
 				},
22 22
 				Raft: types.RaftConfig{
23 23
 					SnapshotInterval:           c.Spec.Raft.SnapshotInterval,
24
-					KeepOldSnapshots:           c.Spec.Raft.KeepOldSnapshots,
24
+					KeepOldSnapshots:           &c.Spec.Raft.KeepOldSnapshots,
25 25
 					LogEntriesForSlowFollowers: c.Spec.Raft.LogEntriesForSlowFollowers,
26 26
 					HeartbeatTick:              int(c.Spec.Raft.HeartbeatTick),
27 27
 					ElectionTick:               int(c.Spec.Raft.ElectionTick),
... ...
@@ -82,8 +82,8 @@ func MergeSwarmSpecToGRPC(s types.Spec, spec swarmapi.ClusterSpec) (swarmapi.Clu
82 82
 	if s.Raft.SnapshotInterval != 0 {
83 83
 		spec.Raft.SnapshotInterval = s.Raft.SnapshotInterval
84 84
 	}
85
-	if s.Raft.KeepOldSnapshots != 0 {
86
-		spec.Raft.KeepOldSnapshots = s.Raft.KeepOldSnapshots
85
+	if s.Raft.KeepOldSnapshots != nil {
86
+		spec.Raft.KeepOldSnapshots = *s.Raft.KeepOldSnapshots
87 87
 	}
88 88
 	if s.Raft.LogEntriesForSlowFollowers != 0 {
89 89
 		spec.Raft.LogEntriesForSlowFollowers = s.Raft.LogEntriesForSlowFollowers
... ...
@@ -28,6 +28,8 @@ Options:
28 28
       --force-new-cluster               Force create a new cluster from current state
29 29
       --help                            Print usage
30 30
       --listen-addr value               Listen address (format: <ip|interface>[:port])
31
+      --max-snapshots uint              Number of additional Raft snapshots to retain
32
+      --snapshot-interval uint          Number of log entries between Raft snapshots (default 10000)
31 33
       --task-history-limit int          Task history retention limit (default 5)
32 34
 ```
33 35
 
... ...
@@ -64,7 +66,7 @@ This flag sets the validity period for node certificates.
64 64
 This flag sets the period that nodes are told to use for reporting
65 65
 their health.
66 66
 
67
-### `--external-ca value`
67
+### `--external-ca`
68 68
 
69 69
 This flag sets up the swarm to use an external CA to issue node certificates. The value takes
70 70
 the form `protocol=X,url=Y`. The value for `protocol` specifies what protocol should be used
... ...
@@ -75,7 +77,7 @@ The URL specifies the endpoint where signing requests should be submitted.
75 75
 
76 76
 This flag forces an existing node that was part of a quorum that was lost to restart as a single node Manager without losing its data.
77 77
 
78
-### `--listen-addr value`
78
+### `--listen-addr`
79 79
 
80 80
 The node listens for inbound swarm manager traffic on this address. The default is to listen on
81 81
 0.0.0.0:2377. It is also possible to specify a network interface to listen on that interface's
... ...
@@ -84,7 +86,7 @@ address; for example `--listen-addr eth0:2377`.
84 84
 Specifying a port is optional. If the value is a bare IP address or interface
85 85
 name, the default port 2377 will be used.
86 86
 
87
-### `--advertise-addr value`
87
+### `--advertise-addr`
88 88
 
89 89
 This flag specifies the address that will be advertised to other members of the
90 90
 swarm for API access and overlay networking. If unspecified, Docker will check
... ...
@@ -103,6 +105,21 @@ name, the default port 2377 will be used.
103 103
 
104 104
 This flag sets up task history retention limit.
105 105
 
106
+### `--max-snapshots`
107
+
108
+This flag sets the number of old Raft snapshots to retain in addition to the
109
+current Raft snapshot. By default, no old snapshots are retained. This option
110
+may be used for debugging, or to store old snapshots of the swarm state for
111
+disaster recovery purposes.
112
+
113
+### `--snapshot-interval`
114
+
115
+This flag specifies how many log entries to allow in between Raft snapshots.
116
+Setting this to a higher number will trigger snapshots less frequently.
117
+Snapshots compact the Raft log and allow for more efficient transfer of the
118
+state to new managers. However, there is a performance cost to taking snapshots
119
+frequently.
120
+
106 121
 ## Related information
107 122
 
108 123
 * [swarm join](swarm_join.md)
... ...
@@ -25,6 +25,8 @@ Options:
25 25
       --dispatcher-heartbeat duration   Dispatcher heartbeat period (default 5s)
26 26
       --external-ca value               Specifications of one or more certificate signing endpoints
27 27
       --help                            Print usage
28
+      --max-snapshots uint              Number of additional Raft snapshots to retain
29
+      --snapshot-interval uint          Number of log entries between Raft snapshots (default 10000)
28 30
       --task-history-limit int          Task history retention limit (default 5)
29 31
 ```
30 32