Browse code

Fail build after 30 minutes of errors.

Maciej Szulik authored on 2015/05/14 00:27:44
Showing 15 changed files
... ...
@@ -49,10 +49,6 @@ func (bc *BuildController) HandleBuild(build *buildapi.Build) error {
49 49
 	}
50 50
 
51 51
 	if err := bc.nextBuildStatus(build); err != nil {
52
-		// TODO: all build errors should be retried, and build error should not be a permanent status change.
53
-		// Instead, we should requeue this build request using the same backoff logic as the scheduler.
54
-		//build.Status = buildapi.BuildStatusError
55
-		//build.Message = err.Error()
56 52
 		return fmt.Errorf("Build failed with error %s/%s: %v", build.Namespace, build.Name, err)
57 53
 	}
58 54
 
... ...
@@ -187,12 +183,12 @@ func (bc *BuildPodController) HandlePod(pod *kapi.Pod) error {
187 187
 		glog.V(4).Infof("Updating build %s status %s -> %s", build.Name, build.Status, nextStatus)
188 188
 		build.Status = nextStatus
189 189
 		if build.Status == buildapi.BuildStatusComplete || build.Status == buildapi.BuildStatusFailed || build.Status == buildapi.BuildStatusCancelled {
190
-			dummy := util.Now()
191
-			build.CompletionTimestamp = &dummy
190
+			now := util.Now()
191
+			build.CompletionTimestamp = &now
192 192
 		}
193 193
 		if build.Status == buildapi.BuildStatusRunning {
194
-			dummy := util.Now()
195
-			build.StartTimestamp = &dummy
194
+			now := util.Now()
195
+			build.StartTimestamp = &now
196 196
 		}
197 197
 		if err := bc.BuildUpdater.Update(build.Namespace, build); err != nil {
198 198
 			return fmt.Errorf("Failed to update build %s: %v", build.Name, err)
... ...
@@ -214,8 +210,8 @@ func (bc *BuildPodController) CancelBuild(build *buildapi.Build, pod *kapi.Pod)
214 214
 	}
215 215
 
216 216
 	build.Status = buildapi.BuildStatusCancelled
217
-	dummy := util.Now()
218
-	build.CompletionTimestamp = &dummy
217
+	now := util.Now()
218
+	build.CompletionTimestamp = &now
219 219
 	if err := bc.BuildUpdater.Update(build.Namespace, build); err != nil {
220 220
 		return err
221 221
 	}
... ...
@@ -7,6 +7,7 @@ import (
7 7
 	"github.com/golang/glog"
8 8
 
9 9
 	kapi "github.com/GoogleCloudPlatform/kubernetes/pkg/api"
10
+	kerrors "github.com/GoogleCloudPlatform/kubernetes/pkg/api/errors"
10 11
 	kclient "github.com/GoogleCloudPlatform/kubernetes/pkg/client"
11 12
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/client/cache"
12 13
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/client/record"
... ...
@@ -26,11 +27,27 @@ import (
26 26
 	imageapi "github.com/openshift/origin/pkg/image/api"
27 27
 )
28 28
 
29
-// limitedLogAndRetry stops retrying after 60 attempts.  Given the throttler configuration,
30
-// this means this event has been retried over a period of at least 50 seconds.
31
-func limitedLogAndRetry(obj interface{}, err error, count int) bool {
32
-	kutil.HandleError(err)
33
-	return count < 60
29
+const maxRetries = 60
30
+
31
+// limitedLogAndRetry stops retrying after maxTimeout, failing the build.
32
+func limitedLogAndRetry(buildupdater buildclient.BuildUpdater, maxTimeout time.Duration) controller.RetryFunc {
33
+	return func(obj interface{}, err error, retries controller.Retry) bool {
34
+		kutil.HandleError(err)
35
+		if time.Since(retries.StartTimestamp.Time) < maxTimeout {
36
+			return true
37
+		}
38
+		build := obj.(*buildapi.Build)
39
+		build.Status = buildapi.BuildStatusFailed
40
+		build.Message = err.Error()
41
+		now := kutil.Now()
42
+		build.CompletionTimestamp = &now
43
+		glog.V(3).Infof("Giving up retrying build %s/%s: %v", build.Namespace, build.Name, err)
44
+		if err := buildupdater.Update(build.Namespace, build); err != nil {
45
+			// retry update, but only on error other than NotFound
46
+			return !kerrors.IsNotFound(err)
47
+		}
48
+		return false
49
+	}
34 50
 }
35 51
 
36 52
 // BuildControllerFactory constructs BuildController objects
... ...
@@ -67,8 +84,12 @@ func (factory *BuildControllerFactory) Create() controller.RunnableController {
67 67
 	}
68 68
 
69 69
 	return &controller.RetryController{
70
-		Queue:        queue,
71
-		RetryManager: controller.NewQueueRetryManager(queue, cache.MetaNamespaceKeyFunc, limitedLogAndRetry, kutil.NewTokenBucketRateLimiter(1, 10)),
70
+		Queue: queue,
71
+		RetryManager: controller.NewQueueRetryManager(
72
+			queue,
73
+			cache.MetaNamespaceKeyFunc,
74
+			limitedLogAndRetry(factory.BuildUpdater, 30*time.Minute),
75
+			kutil.NewTokenBucketRateLimiter(1, 10)),
72 76
 		Handle: func(obj interface{}) error {
73 77
 			build := obj.(*buildapi.Build)
74 78
 			return buildController.HandleBuild(build)
... ...
@@ -111,8 +132,15 @@ func (factory *BuildPodControllerFactory) Create() controller.RunnableController
111 111
 	}
112 112
 
113 113
 	return &controller.RetryController{
114
-		Queue:        queue,
115
-		RetryManager: controller.NewQueueRetryManager(queue, cache.MetaNamespaceKeyFunc, limitedLogAndRetry, kutil.NewTokenBucketRateLimiter(1, 10)),
114
+		Queue: queue,
115
+		RetryManager: controller.NewQueueRetryManager(
116
+			queue,
117
+			cache.MetaNamespaceKeyFunc,
118
+			func(obj interface{}, err error, retries controller.Retry) bool {
119
+				kutil.HandleError(err)
120
+				return retries.Count < maxRetries
121
+			},
122
+			kutil.NewTokenBucketRateLimiter(1, 10)),
116 123
 		Handle: func(obj interface{}) error {
117 124
 			pod := obj.(*kapi.Pod)
118 125
 			return buildPodController.HandlePod(pod)
... ...
@@ -151,15 +179,12 @@ func (factory *ImageChangeControllerFactory) Create() controller.RunnableControl
151 151
 		RetryManager: controller.NewQueueRetryManager(
152 152
 			queue,
153 153
 			cache.MetaNamespaceKeyFunc,
154
-			func(obj interface{}, err error, count int) bool {
154
+			func(obj interface{}, err error, retries controller.Retry) bool {
155 155
 				kutil.HandleError(err)
156 156
 				if _, isFatal := err.(buildcontroller.ImageChangeControllerFatalError); isFatal {
157 157
 					return false
158 158
 				}
159
-				if count > 60 {
160
-					return false
161
-				}
162
-				return true
159
+				return retries.Count < maxRetries
163 160
 			},
164 161
 			kutil.NewTokenBucketRateLimiter(1, 10),
165 162
 		),
166 163
new file mode 100644
... ...
@@ -0,0 +1,58 @@
0
+package factory
1
+
2
+import (
3
+	"errors"
4
+	"strings"
5
+	"testing"
6
+	"time"
7
+
8
+	kutil "github.com/GoogleCloudPlatform/kubernetes/pkg/util"
9
+	buildapi "github.com/openshift/origin/pkg/build/api"
10
+	controller "github.com/openshift/origin/pkg/controller"
11
+)
12
+
13
+type buildUpdater struct {
14
+	Build *buildapi.Build
15
+}
16
+
17
+func (b *buildUpdater) Update(namespace string, build *buildapi.Build) error {
18
+	b.Build = build
19
+	return nil
20
+}
21
+
22
+func TestLimitedLogAndRetryFinish(t *testing.T) {
23
+	updater := &buildUpdater{}
24
+	err := errors.New("funky error")
25
+
26
+	now := kutil.Now()
27
+	retry := controller.Retry{0, kutil.Date(now.Year(), now.Month(), now.Day(), now.Hour(), now.Minute()-31, now.Second(), now.Nanosecond(), now.Location())}
28
+	if limitedLogAndRetry(updater, 30*time.Minute)(&buildapi.Build{Status: buildapi.BuildStatusNew}, err, retry) {
29
+		t.Error("Expected no more retries after reaching timeout!")
30
+	}
31
+	if updater.Build == nil {
32
+		t.Fatal("BuildUpdater wasn't called!")
33
+	}
34
+	if updater.Build.Status != buildapi.BuildStatusFailed {
35
+		t.Errorf("Expected status %s, got %s!", buildapi.BuildStatusFailed, updater.Build.Status)
36
+	}
37
+	if !strings.Contains(updater.Build.Message, err.Error()) {
38
+		t.Errorf("Expected message to contain %v, got %s!", err, updater.Build.Status)
39
+	}
40
+	if updater.Build.CompletionTimestamp == nil {
41
+		t.Error("Expected CompletionTimestamp to be set!")
42
+	}
43
+}
44
+
45
+func TestLimitedLogAndRetryProcessing(t *testing.T) {
46
+	updater := &buildUpdater{}
47
+	err := errors.New("funky error")
48
+
49
+	now := kutil.Now()
50
+	retry := controller.Retry{0, kutil.Date(now.Year(), now.Month(), now.Day(), now.Hour(), now.Minute()-10, now.Second(), now.Nanosecond(), now.Location())}
51
+	if !limitedLogAndRetry(updater, 30*time.Minute)(&buildapi.Build{Status: buildapi.BuildStatusNew}, err, retry) {
52
+		t.Error("Expected more retries!")
53
+	}
54
+	if updater.Build != nil {
55
+		t.Fatal("BuildUpdater shouldn't be called!")
56
+	}
57
+}
... ...
@@ -113,7 +113,11 @@ func (d *BuildDescriber) Describe(namespace, name string) (string, error) {
113 113
 		if build.Config != nil {
114 114
 			formatString(out, "Build Config", build.Config.Name)
115 115
 		}
116
-		formatString(out, "Status", bold(build.Status))
116
+		status := bold(build.Status)
117
+		if build.Status == buildapi.BuildStatusFailed {
118
+			status += " (" + build.Message + ")"
119
+		}
120
+		formatString(out, "Status", status)
117 121
 		if build.StartTimestamp != nil {
118 122
 			formatString(out, "Started", build.StartTimestamp.Time)
119 123
 		}
... ...
@@ -135,7 +139,10 @@ func (d *BuildDescriber) Describe(namespace, name string) (string, error) {
135 135
 
136 136
 func describeBuildDuration(build *buildapi.Build) string {
137 137
 	t := util.Now().Rfc3339Copy()
138
-	if build.StartTimestamp == nil && build.Status == buildapi.BuildStatusCancelled {
138
+	if build.StartTimestamp == nil &&
139
+		(build.Status == buildapi.BuildStatusCancelled ||
140
+			build.Status == buildapi.BuildStatusFailed ||
141
+			build.Status == buildapi.BuildStatusError) {
139 142
 		// time a build waited for its pod before ultimately being canceled before that pod was created
140 143
 		return fmt.Sprintf("waited for %s", build.CompletionTimestamp.Rfc3339Copy().Time.Sub(build.CreationTimestamp.Rfc3339Copy().Time))
141 144
 	} else if build.StartTimestamp == nil && build.Status != buildapi.BuildStatusCancelled {
... ...
@@ -222,7 +222,7 @@ func TestDescribeBuildDuration(t *testing.T) {
222 222
 			},
223 223
 			"1m0s",
224 224
 		},
225
-		{ // 5 - build cancelled before running, start time wasn't set yet
225
+		{ // 6 - build cancelled before running, start time wasn't set yet
226 226
 			&buildapi.Build{
227 227
 				ObjectMeta:          kapi.ObjectMeta{CreationTimestamp: creation},
228 228
 				CompletionTimestamp: &completion,
... ...
@@ -231,7 +231,7 @@ func TestDescribeBuildDuration(t *testing.T) {
231 231
 			},
232 232
 			"waited for 2m0s",
233 233
 		},
234
-		{ // 5 - build cancelled while running, start time is set already
234
+		{ // 7 - build cancelled while running, start time is set already
235 235
 			&buildapi.Build{
236 236
 				ObjectMeta:          kapi.ObjectMeta{CreationTimestamp: creation},
237 237
 				StartTimestamp:      &start,
... ...
@@ -241,6 +241,24 @@ func TestDescribeBuildDuration(t *testing.T) {
241 241
 			},
242 242
 			"1m0s",
243 243
 		},
244
+		{ // 8 - build failed before running, start time wasn't set yet
245
+			&buildapi.Build{
246
+				ObjectMeta:          kapi.ObjectMeta{CreationTimestamp: creation},
247
+				CompletionTimestamp: &completion,
248
+				Status:              buildapi.BuildStatusFailed,
249
+				Duration:            duration,
250
+			},
251
+			"waited for 2m0s",
252
+		},
253
+		{ // 9 - build error before running, start time wasn't set yet
254
+			&buildapi.Build{
255
+				ObjectMeta:          kapi.ObjectMeta{CreationTimestamp: creation},
256
+				CompletionTimestamp: &completion,
257
+				Status:              buildapi.BuildStatusError,
258
+				Duration:            duration,
259
+			},
260
+			"waited for 2m0s",
261
+		},
244 262
 	}
245 263
 
246 264
 	for i, tc := range tests {
... ...
@@ -69,7 +69,7 @@ type RetryManager interface {
69 69
 
70 70
 // RetryFunc should return true if the given object and error should be retried after
71 71
 // the provided number of times.
72
-type RetryFunc func(obj interface{}, err error, retries int) bool
72
+type RetryFunc func(obj interface{}, err error, retries Retry) bool
73 73
 
74 74
 // QueueRetryManager retries a resource by re-queueing it into a ReQueue as long as
75 75
 // retryFunc returns true.
... ...
@@ -83,14 +83,23 @@ type QueueRetryManager struct {
83 83
 	// retryFunc returns true if the resource and error returned should be retried.
84 84
 	retryFunc RetryFunc
85 85
 
86
-	// retries maps resources to their current retry count.
87
-	retries map[string]int
86
+	// retries maps resources to their current retry
87
+	retries map[string]Retry
88 88
 
89 89
 	// limits how fast retries can be enqueued to ensure you can't tight
90 90
 	// loop on retries.
91 91
 	limiter kutil.RateLimiter
92 92
 }
93 93
 
94
+// Retry describes provides additional information regarding retries.
95
+type Retry struct {
96
+	// Count is the number of retries
97
+	Count int
98
+
99
+	// StartTimestamp is retry start timestamp
100
+	StartTimestamp kutil.Time
101
+}
102
+
94 103
 // ReQueue is a queue that allows an object to be requeued
95 104
 type ReQueue interface {
96 105
 	Queue
... ...
@@ -103,7 +112,7 @@ func NewQueueRetryManager(queue ReQueue, keyFn kcache.KeyFunc, retryFn RetryFunc
103 103
 		queue:     queue,
104 104
 		keyFunc:   keyFn,
105 105
 		retryFunc: retryFn,
106
-		retries:   make(map[string]int),
106
+		retries:   make(map[string]Retry),
107 107
 		limiter:   limiter,
108 108
 	}
109 109
 }
... ...
@@ -115,7 +124,7 @@ func (r *QueueRetryManager) Retry(resource interface{}, err error) {
115 115
 	id, _ := r.keyFunc(resource)
116 116
 
117 117
 	if _, exists := r.retries[id]; !exists {
118
-		r.retries[id] = 0
118
+		r.retries[id] = Retry{0, kutil.Now()}
119 119
 	}
120 120
 	tries := r.retries[id]
121 121
 
... ...
@@ -124,7 +133,8 @@ func (r *QueueRetryManager) Retry(resource interface{}, err error) {
124 124
 		// It's important to use AddIfNotPresent to prevent overwriting newer
125 125
 		// state in the queue which may have arrived asynchronously.
126 126
 		r.queue.AddIfNotPresent(resource)
127
-		r.retries[id] = tries + 1
127
+		tries.Count = tries.Count + 1
128
+		r.retries[id] = tries
128 129
 	} else {
129 130
 		r.Forget(resource)
130 131
 	}
... ...
@@ -80,13 +80,10 @@ func TestQueueRetryManager_retries(t *testing.T) {
80 80
 		keyFunc: func(obj interface{}) (string, error) {
81 81
 			return obj.(testObj).id, nil
82 82
 		},
83
-		retryFunc: func(obj interface{}, err error, count int) bool {
84
-			if count > 4 {
85
-				return false
86
-			}
87
-			return true
83
+		retryFunc: func(obj interface{}, err error, r Retry) bool {
84
+			return r.Count < 5 && !r.StartTimestamp.IsZero()
88 85
 		},
89
-		retries: make(map[string]int),
86
+		retries: make(map[string]Retry),
90 87
 		limiter: kutil.NewTokenBucketRateLimiter(1000, 1000),
91 88
 	}
92 89
 
... ...
@@ -131,7 +128,7 @@ func TestRetryController_realFifoEventOrdering(t *testing.T) {
131 131
 
132 132
 	controller := &RetryController{
133 133
 		Queue:        fifo,
134
-		RetryManager: NewQueueRetryManager(fifo, keyFunc, func(_ interface{}, _ error, _ int) bool { return true }, kutil.NewTokenBucketRateLimiter(1000, 10)),
134
+		RetryManager: NewQueueRetryManager(fifo, keyFunc, func(_ interface{}, _ error, _ Retry) bool { return true }, kutil.NewTokenBucketRateLimiter(1000, 10)),
135 135
 		Handle: func(obj interface{}) error {
136 136
 			if e, a := 1, obj.(testObj).value; e != a {
137 137
 				t.Fatalf("expected to handle test value %d, got %d", e, a)
... ...
@@ -170,11 +167,8 @@ func TestRetryController_ratelimit(t *testing.T) {
170 170
 	limiter := &mockLimiter{}
171 171
 	retryManager := NewQueueRetryManager(fifo,
172 172
 		keyFunc,
173
-		func(_ interface{}, _ error, c int) bool {
174
-			if c < 15 {
175
-				return true
176
-			}
177
-			return false
173
+		func(_ interface{}, _ error, r Retry) bool {
174
+			return r.Count < 15
178 175
 		},
179 176
 		limiter,
180 177
 	)
... ...
@@ -64,12 +64,12 @@ func (factory *DeploymentConfigChangeControllerFactory) Create() controller.Runn
64 64
 		RetryManager: controller.NewQueueRetryManager(
65 65
 			queue,
66 66
 			cache.MetaNamespaceKeyFunc,
67
-			func(obj interface{}, err error, count int) bool {
67
+			func(obj interface{}, err error, retries controller.Retry) bool {
68 68
 				kutil.HandleError(err)
69 69
 				if _, isFatal := err.(fatalError); isFatal {
70 70
 					return false
71 71
 				}
72
-				if count > 0 {
72
+				if retries.Count > 0 {
73 73
 					return false
74 74
 				}
75 75
 				return true
... ...
@@ -72,7 +72,7 @@ func (factory *DeployerPodControllerFactory) Create() controller.RunnableControl
72 72
 		RetryManager: controller.NewQueueRetryManager(
73 73
 			podQueue,
74 74
 			cache.MetaNamespaceKeyFunc,
75
-			func(obj interface{}, err error, count int) bool { return count < 1 },
75
+			func(obj interface{}, err error, retries controller.Retry) bool { return retries.Count < 1 },
76 76
 			kutil.NewTokenBucketRateLimiter(1, 10),
77 77
 		),
78 78
 		Handle: func(obj interface{}) error {
... ...
@@ -84,12 +84,12 @@ func (factory *DeploymentControllerFactory) Create() controller.RunnableControll
84 84
 		RetryManager: controller.NewQueueRetryManager(
85 85
 			deploymentQueue,
86 86
 			cache.MetaNamespaceKeyFunc,
87
-			func(obj interface{}, err error, count int) bool {
87
+			func(obj interface{}, err error, retries controller.Retry) bool {
88 88
 				if _, isFatal := err.(fatalError); isFatal {
89 89
 					kutil.HandleError(err)
90 90
 					return false
91 91
 				}
92
-				if count > 1 {
92
+				if retries.Count > 1 {
93 93
 					return false
94 94
 				}
95 95
 				return true
... ...
@@ -61,7 +61,7 @@ func (factory *DeploymentCancellationControllerFactory) Create() controller.Runn
61 61
 		RetryManager: controller.NewQueueRetryManager(
62 62
 			deploymentQueue,
63 63
 			cache.MetaNamespaceKeyFunc,
64
-			func(obj interface{}, err error, count int) bool { return count < 1 },
64
+			func(obj interface{}, err error, retries controller.Retry) bool { return retries.Count < 1 },
65 65
 			kutil.NewTokenBucketRateLimiter(1, 10),
66 66
 		),
67 67
 		Handle: func(obj interface{}) error {
... ...
@@ -74,7 +74,7 @@ func (factory *DeploymentConfigControllerFactory) Create() controller.RunnableCo
74 74
 		RetryManager: controller.NewQueueRetryManager(
75 75
 			queue,
76 76
 			cache.MetaNamespaceKeyFunc,
77
-			func(obj interface{}, err error, count int) bool {
77
+			func(obj interface{}, err error, retries controller.Retry) bool {
78 78
 				kutil.HandleError(err)
79 79
 				// no retries for a fatal error
80 80
 				if _, isFatal := err.(fatalError); isFatal {
... ...
@@ -85,7 +85,7 @@ func (factory *DeploymentConfigControllerFactory) Create() controller.RunnableCo
85 85
 					return true
86 86
 				}
87 87
 				// no retries for anything else
88
-				if count > 0 {
88
+				if retries.Count > 0 {
89 89
 					return false
90 90
 				}
91 91
 				return true
... ...
@@ -73,12 +73,12 @@ func (factory *ImageChangeControllerFactory) Create() controller.RunnableControl
73 73
 		RetryManager: controller.NewQueueRetryManager(
74 74
 			queue,
75 75
 			cache.MetaNamespaceKeyFunc,
76
-			func(obj interface{}, err error, count int) bool {
76
+			func(obj interface{}, err error, retries controller.Retry) bool {
77 77
 				kutil.HandleError(err)
78 78
 				if _, isFatal := err.(fatalError); isFatal {
79 79
 					return false
80 80
 				}
81
-				if count > 0 {
81
+				if retries.Count > 0 {
82 82
 					return false
83 83
 				}
84 84
 				return true
... ...
@@ -47,9 +47,9 @@ func (f *ImportControllerFactory) Create() controller.RunnableController {
47 47
 		RetryManager: controller.NewQueueRetryManager(
48 48
 			q,
49 49
 			cache.MetaNamespaceKeyFunc,
50
-			func(obj interface{}, err error, count int) bool {
50
+			func(obj interface{}, err error, retries controller.Retry) bool {
51 51
 				util.HandleError(err)
52
-				return count < 5
52
+				return retries.Count < 5
53 53
 			},
54 54
 			kutil.NewTokenBucketRateLimiter(1, 10),
55 55
 		),
... ...
@@ -46,12 +46,12 @@ func (factory *NamespaceControllerFactory) Create() controller.RunnableControlle
46 46
 		RetryManager: controller.NewQueueRetryManager(
47 47
 			queue,
48 48
 			cache.MetaNamespaceKeyFunc,
49
-			func(obj interface{}, err error, count int) bool {
49
+			func(obj interface{}, err error, retries controller.Retry) bool {
50 50
 				kutil.HandleError(err)
51 51
 				if _, isFatal := err.(fatalError); isFatal {
52 52
 					return false
53 53
 				}
54
-				if count > 0 {
54
+				if retries.Count > 0 {
55 55
 					return false
56 56
 				}
57 57
 				return true