Browse code

Scan for SELinux write error in registry diagnostic.

In the cluster admin diagnostic which scans the registry pod logs, add an
explicit check for an error which indicates the registry cannot write to disk,
which is a strong indicator of an SELinux problem. The fix from the Origin
troubleshooting page is then suggested.

We also watch for a successful write log entry, and if found *after* the error,
we know the problem has since been fixed and the diagnostic error can be safely
ignored.

Devan Goodwin authored on 2015/10/06 20:54:35
Showing 1 changed file
... ...
@@ -4,6 +4,7 @@ import (
4 4
 	"bufio"
5 5
 	"fmt"
6 6
 	"reflect"
7
+	"regexp"
7 8
 	"strings"
8 9
 
9 10
 	kapi "k8s.io/kubernetes/pkg/api"
... ...
@@ -92,6 +93,21 @@ Please examine the log entries to determine if there might be
92 92
 any related problems:
93 93
 %s`
94 94
 
95
+	clRegSelinuxErr = `
96
+The pod logs for the "%s" pod belonging to
97
+the "%s" service indicated the registry is unable to write to disk.
98
+This may indicate an SELinux denial, or problems with volume
99
+ownership/permissions.
100
+
101
+For volume permission problems please consult the Persistent Storage section
102
+of the Administrator's Guide.
103
+
104
+In the case of SELinux this may be resolved on the node by running:
105
+
106
+    sudo chcon -R -t svirt_sandbox_file_t [PATH_TO]/openshift.local.volumes
107
+
108
+%s`
109
+
95 110
 	clRegNoEP = `
96 111
 The "%[1]s" service exists with %d associated pod(s), but there
97 112
 are %d endpoints in the "%[1]s" service.
... ...
@@ -221,8 +237,15 @@ func (d *ClusterRegistry) checkRegistryLogs(pod *kapi.Pod, r types.DiagnosticRes
221 221
 	}
222 222
 	defer readCloser.Close()
223 223
 
224
+	// Indicator that selinux is blocking the registry from writing to disk:
225
+	selinuxErrorRegex, _ := regexp.Compile(".*level=error.*mkdir.*permission denied.*")
226
+	// If seen after the above error regex, we know the problem has since been fixed:
227
+	selinuxSuccessRegex, _ := regexp.Compile(".*level=info.*response completed.*http.request.method=PUT.*")
228
+
224 229
 	clientError := ""
225 230
 	registryError := ""
231
+	selinuxError := ""
232
+
226 233
 	scanner := bufio.NewScanner(readCloser)
227 234
 	for scanner.Scan() {
228 235
 		logLine := scanner.Text()
... ...
@@ -230,6 +253,12 @@ func (d *ClusterRegistry) checkRegistryLogs(pod *kapi.Pod, r types.DiagnosticRes
230 230
 		// https://github.com/kubernetes/kubernetes/issues/12447
231 231
 		if strings.Contains(logLine, `level=error msg="client error:`) {
232 232
 			clientError = logLine // end up showing only the most recent client error
233
+		} else if selinuxErrorRegex.MatchString(logLine) {
234
+			selinuxError = logLine
235
+		} else if selinuxSuccessRegex.MatchString(logLine) {
236
+			// Check for a successful registry push, if this occurs after a selinux error
237
+			// we can safely clear it, the problem has already been fixed.
238
+			selinuxError = ""
233 239
 		} else if strings.Contains(logLine, "level=error msg=") {
234 240
 			registryError += "\n" + logLine // gather generic errors
235 241
 		}
... ...
@@ -237,10 +266,12 @@ func (d *ClusterRegistry) checkRegistryLogs(pod *kapi.Pod, r types.DiagnosticRes
237 237
 	if clientError != "" {
238 238
 		r.Error("DClu1011", nil, fmt.Sprintf(clRegPodConn, pod.ObjectMeta.Name, registryName, clientError))
239 239
 	}
240
+	if selinuxError != "" {
241
+		r.Error("DClu1020", nil, fmt.Sprintf(clRegSelinuxErr, pod.ObjectMeta.Name, registryName, selinuxError))
242
+	}
240 243
 	if registryError != "" {
241 244
 		r.Warn("DClu1012", nil, fmt.Sprintf(clRegPodErr, pod.ObjectMeta.Name, registryName, registryError))
242 245
 	}
243
-
244 246
 }
245 247
 
246 248
 func (d *ClusterRegistry) checkRegistryEndpoints(pods []*kapi.Pod, r types.DiagnosticResult) bool {