fix SubjectAccessReview usage
fix 401/403 errors, cluster-context abort, controller origin off-by-one
fix readme
config_contexts - fix wording, remove cruft
simplify adminCan()
better registry error reporting
fix crufty journald log matches
remove meaning from message IDs
@@ -24,7 +24,7 @@ func (o DiagnosticsOptions) buildClientDiagnostics(rawConfig *clientcmdapi.Confi
     // osClient, kubeClient, clientErr := o.Factory.Clients() // use with a diagnostic that needs OpenShift/Kube client
     _, _, clientErr := o.Factory.Clients()
     if clientErr != nil {
-        o.Logger.Notice("clLoadDefaultFailed", "Failed creating client from config; client diagnostics will be limited to config testing")
+        o.Logger.Notice("CED0001", "Failed creating client from config; client diagnostics will be limited to config testing")
         available = util.NewStringSet(clientdiags.ConfigContextsName)
     }
@@ -2,6 +2,7 @@ package diagnostics
 import (
     "fmt"
+    "regexp"
     "strings"

     kclient "k8s.io/kubernetes/pkg/client"
@@ -38,8 +39,8 @@ func (o DiagnosticsOptions) buildClusterDiagnostics(rawConfig *clientcmdapi.Conf
     clusterClient, kclusterClient, found, err := o.findClusterClients(rawConfig)
     if !found {
-        o.Logger.Notice("noClustCtx", "No cluster-admin client config found; skipping cluster diagnostics.")
-        return nil, false, err
+        o.Logger.Notice("CED1002", "No cluster-admin client config found; skipping cluster diagnostics.")
+        return nil, true, err
     }
     diagnostics := []types.Diagnostic{}
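
Note on the `return nil, true, err` change above: the middle return value is what the caller checks to decide whether to keep running at all, so returning false when cluster diagnostics are merely skipped aborted the whole run (the "cluster-context abort" in the commit message). A minimal sketch of that convention, assuming a caller shaped like RunDiagnostics but not copying its actual code:

    package main

    import "fmt"

    // buildArea mimics the (diagnostics, ok, err) convention assumed by the
    // caller: ok=false means "abort the whole diagnostics run", not merely
    // "this area was skipped" - hence returning true when skipping.
    func buildArea(found bool) ([]string, bool, error) {
        if !found {
            return nil, true, nil // skip cluster diagnostics but keep going
        }
        return []string{"ClusterRegistry", "ClusterRouter"}, true, nil
    }

    func main() {
        diags, ok, err := buildArea(false)
        fmt.Println(diags, ok, err) // [] true <nil>: the run continues
    }
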
@@ -64,7 +65,7 @@ func (o DiagnosticsOptions) findClusterClients(rawConfig *clientcmdapi.Config) (
     if o.ClientClusterContext != "" { // user has specified cluster context to use
         if context, exists := rawConfig.Contexts[o.ClientClusterContext]; exists {
             configErr := fmt.Errorf("Specified '%s' as cluster-admin context, but it was not found in your client configuration.", o.ClientClusterContext)
-            o.Logger.Error("discClustCtx", configErr.Error())
+            o.Logger.Error("CED1003", configErr.Error())
             return nil, nil, false, configErr
         } else if os, kube, found, err := o.makeClusterClients(rawConfig, o.ClientClusterContext, context); found {
             return os, kube, true, err
@@ -75,7 +76,7 @@ func (o DiagnosticsOptions) findClusterClients(rawConfig *clientcmdapi.Config) (
     currentContext, exists := rawConfig.Contexts[rawConfig.CurrentContext]
     if !exists { // config specified cluster admin context that doesn't exist; complain and quit
         configErr := fmt.Errorf("Current context '%s' not found in client configuration; will not attempt cluster diagnostics.", rawConfig.CurrentContext)
-        o.Logger.Errorf("discClustCtx", configErr.Error())
+        o.Logger.Errorf("CED1004", configErr.Error())
         return nil, nil, false, configErr
     }
     // check if current context is already cluster admin
@@ -100,24 +101,29 @@ func (o DiagnosticsOptions) makeClusterClients(rawConfig *clientcmdapi.Config, c
     overrides := &clientcmd.ConfigOverrides{Context: *context}
     clientConfig := clientcmd.NewDefaultClientConfig(*rawConfig, overrides)
     factory := osclientcmd.NewFactory(clientConfig)
-    o.Logger.Debugf("discClustCtxStart", "Checking if context is cluster-admin: '%s'", contextName)
+    o.Logger.Debugf("CED1005", "Checking if context is cluster-admin: '%s'", contextName)
     if osClient, kubeClient, err := factory.Clients(); err != nil {
-        o.Logger.Debugf("discClustCtx", "Error creating client for context '%s':\n%v", contextName, err)
+        o.Logger.Debugf("CED1006", "Error creating client for context '%s':\n%v", contextName, err)
         return nil, nil, false, nil
     } else {
-        subjectAccessReview := authorizationapi.SubjectAccessReview{
-            // we assume if you can list nodes, you're the cluster admin.
-            Verb:     "list",
-            Resource: "nodes",
-        }
-        if resp, err := osClient.SubjectAccessReviews("default").Create(&subjectAccessReview); err != nil {
-            o.Logger.Errorf("discClustCtx", "Error testing cluster-admin access for context '%s':\n%v", contextName, err)
-            return nil, nil, false, err
+        subjectAccessReview := authorizationapi.SubjectAccessReview{Action: authorizationapi.AuthorizationAttributes{
+            // if you can do everything, you're the cluster admin.
+            Verb:     "*",
+            Resource: "*",
+        }}
+        if resp, err := osClient.SubjectAccessReviews().Create(&subjectAccessReview); err != nil {
+            if regexp.MustCompile(`User "[\w:]+" cannot create \w+ at the cluster scope`).MatchString(err.Error()) {
+                o.Logger.Debugf("CED1007", "Context '%s' does not have cluster-admin access:\n%v", contextName, err)
+                return nil, nil, false, nil
+            } else {
+                o.Logger.Errorf("CED1008", "Unknown error testing cluster-admin access for context '%s':\n%v", contextName, err)
+                return nil, nil, false, err
+            }
         } else if resp.Allowed {
-            o.Logger.Infof("discClustCtxFound", "Using context for cluster-admin access: '%s'", contextName)
+            o.Logger.Infof("CED1009", "Using context for cluster-admin access: '%s'", contextName)
             return osClient, kubeClient, true, nil
         }
     }
-    o.Logger.Debugf("discClustCtx", "Context does not have cluster-admin access: '%s'", contextName)
+    o.Logger.Debugf("CED1010", "Context does not have cluster-admin access: '%s'", contextName)
     return nil, nil, false, nil
 }
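
The cluster-admin probe above now issues a cluster-scoped SubjectAccessReview for verb "*" on resource "*" and treats a "cannot ... at the cluster scope" response as an ordinary "not admin" outcome rather than a failure. A self-contained sketch of that error classification; the sample error strings are illustrative, not captured server output:

    package main

    import (
        "fmt"
        "regexp"
    )

    // Same pattern as in makeClusterClients: a "cannot create ... at the cluster
    // scope" error just means this context lacks cluster-admin access.
    var notClusterAdmin = regexp.MustCompile(`User "[\w:]+" cannot create \w+ at the cluster scope`)

    func main() {
        samples := []string{ // illustrative only
            `User "system:anonymous" cannot create subjectaccessreviews at the cluster scope`,
            `dial tcp 10.0.0.1:8443: i/o timeout`,
        }
        for _, msg := range samples {
            if notClusterAdmin.MatchString(msg) {
                fmt.Println("debug: context does not have cluster-admin access")
            } else {
                fmt.Println("error: unknown failure while testing cluster-admin access")
            }
        }
    }
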
@@ -12,7 +12,7 @@ import (
 // determine if we even have a client config
 func (o DiagnosticsOptions) detectClientConfig() (bool, []types.DiagnosticError, []types.DiagnosticError) {
     diagnostic := &clientdiagnostics.ConfigLoading{ConfFlagName: config.OpenShiftConfigFlagName, ClientFlags: o.ClientFlags}
-    o.Logger.Noticet("diagRun", "Determining if client configuration exists for client/cluster diagnostics",
+    o.Logger.Noticet("CED2011", "Determining if client configuration exists for client/cluster diagnostics",
         log.Hash{"area": "client", "name": diagnostic.Name(), "diag": diagnostic.Description()})
     result := diagnostic.Check()
     for _, entry := range result.Logs() {
@@ -133,12 +133,12 @@ func (o DiagnosticsOptions) RunDiagnostics() (bool, error, int, int) {
     if len(o.RequestedDiagnostics) == 0 {
         o.RequestedDiagnostics = AvailableDiagnostics.List()
     } else if common := intersection(util.NewStringSet(o.RequestedDiagnostics...), AvailableDiagnostics); len(common) == 0 {
-        o.Logger.Errort("emptyReqDiag", "None of the requested diagnostics are available:\n {{.requested}}\nPlease try from the following:\n {{.available}}",
+        o.Logger.Errort("CED3012", "None of the requested diagnostics are available:\n {{.requested}}\nPlease try from the following:\n {{.available}}",
             log.Hash{"requested": o.RequestedDiagnostics, "available": AvailableDiagnostics.List()})
         return false, fmt.Errorf("No requested diagnostics available"), 0, 1
     } else if len(common) < len(o.RequestedDiagnostics) {
         errors = append(errors, fmt.Errorf("Not all requested diagnostics are available"))
-        o.Logger.Errort("notAllReqDiag", `
+        o.Logger.Errort("CED3013", `
 Of the requested diagnostics:
 {{.requested}}
 only these are available:
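
The requested-vs-available check above relies on a set intersection over `util.StringSet`. The `intersection` helper itself is not shown in this diff; the sketch below, using plain slices, only illustrates the semantics assumed here:

    package main

    import (
        "fmt"
        "sort"
    )

    // intersect mirrors what intersection(requested, available) is assumed to
    // do with util.StringSet values: keep only the requested diagnostic names
    // that are actually available.
    func intersect(requested, available []string) []string {
        avail := map[string]bool{}
        for _, name := range available {
            avail[name] = true
        }
        common := []string{}
        for _, name := range requested {
            if avail[name] {
                common = append(common, name)
            }
        }
        sort.Strings(common)
        return common
    }

    func main() {
        fmt.Println(intersect(
            []string{"ConfigContexts", "NoSuchDiagnostic"},
            []string{"ConfigContexts", "ClusterRegistry", "ClusterRouter"},
        )) // [ConfigContexts]: fewer than requested, so the warning path is taken
    }
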
@@ -163,13 +163,13 @@ The list of all possible is:
         errors = append(errors, err)
     }
     if !detected { // there just plain isn't any client config file available
-        o.Logger.Notice("discNoClientConf", "No client configuration specified; skipping client and cluster diagnostics.")
+        o.Logger.Notice("CED3014", "No client configuration specified; skipping client and cluster diagnostics.")
     } else if rawConfig, err := o.buildRawConfig(); rawConfig == nil { // client config is totally broken - won't parse etc (problems may have been detected and logged)
-        o.Logger.Errorf("discBrokenClientConf", "Client configuration failed to load; skipping client and cluster diagnostics due to error: {{.error}}", log.Hash{"error": err.Error()})
+        o.Logger.Errorf("CED3015", "Client configuration failed to load; skipping client and cluster diagnostics due to error: {{.error}}", log.Hash{"error": err.Error()})
         errors = append(errors, err)
     } else {
         if err != nil { // error encountered, proceed with caution
-            o.Logger.Errorf("discClientConfErr", "Client configuration loading encountered an error, but proceeding anyway. Error was:\n{{.error}}", log.Hash{"error": err.Error()})
+            o.Logger.Errorf("CED3016", "Client configuration loading encountered an error, but proceeding anyway. Error was:\n{{.error}}", log.Hash{"error": err.Error()})
             errors = append(errors, err)
         }
         clientDiags, ok, err := o.buildClientDiagnostics(rawConfig)
@@ -220,7 +220,7 @@ func (o DiagnosticsOptions) Run(diagnostics []types.Diagnostic) (bool, error, in
     defer func() {
         if r := recover(); r != nil {
             errorCount += 1
-            o.Logger.Errort("diagPanic",
+            o.Logger.Errort("CED3017",
                 "While running the {{.name}} diagnostic, a panic was encountered.\nThis is a bug in diagnostics. Stack trace follows : \n{{.error}}",
                 log.Hash{"name": diagnostic.Name(), "error": fmt.Sprintf("%v", r)})
         }
@@ -228,16 +228,16 @@ func (o DiagnosticsOptions) Run(diagnostics []types.Diagnostic) (bool, error, in
     if canRun, reason := diagnostic.CanRun(); !canRun {
         if reason == nil {
-            o.Logger.Noticet("diagSkip", "Skipping diagnostic: {{.name}}\nDescription: {{.diag}}",
+            o.Logger.Noticet("CED3018", "Skipping diagnostic: {{.name}}\nDescription: {{.diag}}",
                 log.Hash{"name": diagnostic.Name(), "diag": diagnostic.Description()})
         } else {
-            o.Logger.Noticet("diagSkip", "Skipping diagnostic: {{.name}}\nDescription: {{.diag}}\nBecause: {{.reason}}",
+            o.Logger.Noticet("CED3019", "Skipping diagnostic: {{.name}}\nDescription: {{.diag}}\nBecause: {{.reason}}",
                 log.Hash{"name": diagnostic.Name(), "diag": diagnostic.Description(), "reason": reason.Error()})
         }
         return
     }

-    o.Logger.Noticet("diagRun", "Running diagnostic: {{.name}}\nDescription: {{.diag}}",
+    o.Logger.Noticet("CED3020", "Running diagnostic: {{.name}}\nDescription: {{.diag}}",
         log.Hash{"name": diagnostic.Name(), "diag": diagnostic.Description()})
     r := diagnostic.Check()
     for _, entry := range r.Logs() {
@@ -26,6 +26,19 @@ Diagnostics looks for config files in standard locations. If not found,
 related diagnostics are just skipped. Non-standard locations can be
 specified with flags.

+Standard config file locations are:
+
+* Client:
+    * as indicated by --config flag
+    * as indicated by $KUBECONFIG env var
+    * ~/.kube/config file
+* Master:
+    * as indicated by --master-config flag
+    * /etc/openshift/master/master-config.yaml
+* Node:
+    * as indicated by --node-config flag
+    * /etc/openshift/node/node-config.yaml
+
 Host environment
 ================
@@ -36,14 +49,14 @@ logic. This provides two major benefits:
 * master/node configuration is based on a configuration file in a standard location
 * all components log to journald

-Having configuration files in standard locations means you will generally
+Having configuration files where Ansible places them means you will generally
 not even need to specify where to find them. Running:

     openshift ex diagnostics

 by itself will look for master and node configs (in addition to client
 config file) in the standard locations and use them if found; so this
-should make the Enterprise use case as simple as possible. It's also
+should make the Ansible-installed use case as simple as possible. It's also
 very easy to use configuration files when they are not in the expected
 Enterprise locations:
@@ -53,7 +66,7 @@ Having logs in journald is necessary for the current log analysis
 logic. Other usage may have logs going into files, output to stdout,
 combined node/master... it may not be too hard to extend analysis to
 other log sources but the priority has been to look at journald logs
-as created by components in Enterprise deployments (including docker,
+as created by components in systemd-based deployments (including docker,
 openvswitch, etc.).

 Client environment
@@ -97,17 +110,18 @@ paths or flooding them with non-issues that obscure real problems.
 * Warnings indicate issues that may be a problem but could be valid for
   some configurations / situations, for example a node being disabled.

-Enabling automation
-===================
+**Message IDs**
+
+All messages should have a unique, unchanging, otherwise-meaningless
+message ID to facilitate the user grepping for specific errors/warnings
+without having to depend on text that may change. Although nothing yet
+depends on them being unique, the message ID scheme attempts to ensure
+they are. That scheme is:

-Diagnostic messages are designed to be logged either for human consumption
-("text" format) or for scripting/automation ("yaml" or "json" formats). So
-messages should:
+    Initials of package + index of file in package + index of message in file

-* Have an ID that is unique and unchanging, such that automated alerts
-  could filter on specific IDs rather than rely on message text or level.
-* Log any data that might be relevant in an automated alert as
-  template data; for example, when a node is down, include the name of
-  the node so that automation could decide how important it is.
-* Not put anything in message template data that cannot be serialized.
+E.g. "DClu1001" is in package diagnostics/cluster (which needed to be
+differentiated from diagnostics/client), the first file indexed, and
+the first message in the file. This format is not important; it's just
+a convenience to help keep IDs unique. But don't change existing IDs.
@@ -24,17 +24,8 @@ type ConfigContext struct {
 }

 const (
-    ConfigContextsName = "ConfigContexts"
-    currentContextMissing = `Your client config specifies a current context of '{{.context}}'
-which is not defined; it is likely that a mistake was introduced while
-manually editing your config. If this is a simple typo, you may be
-able to fix it manually.
-The master creates a fresh client config when it is started; it may be
-useful to use this as a base if available.`
+    ConfigContextsName = "ConfigContexts"

-    currentContextSummary = `The current context from client config is '{{.context}}'
-This will be used by default to contact the master API.
-`
     contextDesc = `
 For client config context '{{.context}}':
 The server URL is '{{.server}}'
@@ -93,12 +84,12 @@ fails in this case.
 However, the most likely explanation is that the server certificate
 needs to be updated to include the name you are using to reach it.

-If the master API server is generating its own certificates (which is
-default), then specify the public master address in the master-config.yaml
-or with the --public-master flag is usually the easiest way to do
-this. If you need something more complicated (for instance, multiple
-public addresses for the API, or your own CA), then you will need to
-custom-generate the server certificate with the right names yourself.
+If the master API server is generating its own certificates (which
+is the default), then specifying the public master address in the
+master-config.yaml or with the --public-master flag is usually the easiest
+way to do this. If you need something more complicated (for instance,
+multiple public addresses for the API, or your own CA), then you will need
+to custom-generate the server certificate with the right names yourself.

 If you are unconcerned about any of this, you can add the
 --insecure-skip-tls-verify flag to bypass secure (TLS) verification,
@@ -121,7 +112,7 @@ we could not reach the host at all.
 * You may have specified the wrong host address.
 * This could mean the host is completely unavailable (down).
 * This could indicate a routing problem or a firewall that simply
-  drops requests rather than responding by reseting the connection.
+  drops requests rather than responding by resetting the connection.
 * It does not generally mean that DNS name resolution failed (which
   would be a different error) though the problem could be that it
   gave the wrong address.`
@@ -155,9 +146,9 @@ key/certificate or an access token. Your kubeconfig may not have
 presented any, or they may be invalid.`
     clientUnauthz = `
 This means that when we tried to make a request to the master API
-server, the request required credentials that were not presented.
-This can happen when an authentication token expires. Try logging in
-with this user again.`
+server, the request required credentials that were not presented. This
+can happen with an expired or invalid authentication token. Try logging
+in with this user again.`
 )

 var (
@@ -191,10 +182,10 @@ func (d ConfigContext) Check() types.DiagnosticResult {
     isDefaultContext := d.RawConfig.CurrentContext == d.ContextName

     // prepare bad news message
-    errorKey := "clientCfgError"
+    errorKey := "DCli0001"
     unusableLine := fmt.Sprintf("The client config context '%s' is unusable", d.ContextName)
     if isDefaultContext {
-        errorKey = "currentccError"
+        errorKey = "DCli0002"
         unusableLine = fmt.Sprintf("The current client config context '%s' is unusable", d.ContextName)
     }
@@ -212,7 +203,7 @@ func (d ConfigContext) Check() types.DiagnosticResult {
     }
     authName := context.AuthInfo
     if _, exists := d.RawConfig.AuthInfos[authName]; !exists {
-        r.Errorf(errorKey, nil, "%s:\n Client config context '%s' has a user identity '%s' which is not defined.", unusableLine, d.ContextName, authName)
+        r.Errorf(errorKey, nil, "%s:\n Client config context '%s' has a user '%s' which is not defined.", unusableLine, d.ContextName, authName)
         return r
     }
@@ -230,7 +221,7 @@ func (d ConfigContext) Check() types.DiagnosticResult {
     // Actually send a request to see if context has connectivity.
     // Note: we cannot reuse factories as they cache the clients, so build new factory for each context.
     osClient, _, err := osclientcmd.NewFactory(kclientcmd.NewDefaultClientConfig(*d.RawConfig, &kclientcmd.ConfigOverrides{Context: *context})).Clients()
-    // client create now fails if cannot connect to server, so address connectivity errors below
+    // client create now *fails* if cannot connect to server; so, address connectivity errors below
     if err == nil {
         if projects, projerr := osClient.Projects().List(labels.Everything(), fields.Everything()); projerr != nil {
             err = projerr
@@ -245,9 +236,9 @@ func (d ConfigContext) Check() types.DiagnosticResult {
     }
     msgData["projects"] = list
     if len(list) == 0 {
-        r.Infot("CCctxSuccess", msgText+"Successfully requested project list, but it is empty, so user has no access to anything.", msgData)
+        r.Infot("DCli0003", msgText+"Successfully requested project list, but it is empty, so user has no access to anything.", msgData)
     } else {
-        r.Infot("CCctxSuccess", msgText+"Successfully requested project list; has access to project(s):\n {{.projects}}", msgData)
+        r.Infot("DCli0004", msgText+"Successfully requested project list; has access to project(s):\n {{.projects}}", msgData)
     }
     return r
 }
@@ -260,29 +251,29 @@ func (d ConfigContext) Check() types.DiagnosticResult {
     var reason, errId string
     switch {
     case regexp.MustCompile("dial tcp: lookup (\\S+): no such host").MatchString(errMsg):
-        errId, reason = "clientNoResolve", clientNoResolve
+        errId, reason = "DCli0005", clientNoResolve
     case strings.Contains(errMsg, "x509: certificate signed by unknown authority"):
-        errId, reason = "clientUnknownCa", clientUnknownCa
+        errId, reason = "DCli0006", clientUnknownCa
     case strings.Contains(errMsg, "specifying a root certificates file with the insecure flag is not allowed"):
-        errId, reason = "clientUnneededCa", clientUnneededCa
+        errId, reason = "DCli0007", clientUnneededCa
     case invalidCertNameRx.MatchString(errMsg):
         match := invalidCertNameRx.FindStringSubmatch(errMsg)
         serverHost := match[len(match)-1]
-        errId, reason = "clientInvCertName", fmt.Sprintf(clientInvCertName, serverHost)
+        errId, reason = "DCli0008", fmt.Sprintf(clientInvCertName, serverHost)
     case regexp.MustCompile("dial tcp (\\S+): connection refused").MatchString(errMsg):
-        errId, reason = "clientConnRefused", clientConnRefused
+        errId, reason = "DCli0009", clientConnRefused
     case regexp.MustCompile("dial tcp (\\S+): (?:connection timed out|i/o timeout|no route to host)").MatchString(errMsg):
-        errId, reason = "clientConnTimeout", clientConnTimeout
+        errId, reason = "DCli0010", clientConnTimeout
     case strings.Contains(errMsg, "malformed HTTP response"):
-        errId, reason = "clientMalformedHTTP", clientMalformedHTTP
+        errId, reason = "DCli0011", clientMalformedHTTP
     case strings.Contains(errMsg, "tls: oversized record received with length"):
-        errId, reason = "clientMalformedTLS", clientMalformedTLS
-    case regexp.MustCompile(`403 Forbidden: Forbidden: "/osapi/v\w+/projects?namespace=" denied by default`).MatchString(errMsg):
-        errId, reason = "clientUnauthn", clientUnauthn
-    case regexp.MustCompile("401 Unauthorized: Unauthorized$").MatchString(errMsg):
-        errId, reason = "clientUnauthz", clientUnauthz
+        errId, reason = "DCli0012", clientMalformedTLS
+    case strings.Contains(errMsg, `User "system:anonymous" cannot`):
+        errId, reason = "DCli0013", clientUnauthn
+    case strings.Contains(errMsg, "provide credentials"):
+        errId, reason = "DCli0014", clientUnauthz
     default:
-        errId, reason = "clientUnknownConnErr", `Diagnostics does not have an explanation for what this means. Please report this error so one can be added.`
+        errId, reason = "DCli0015", `Diagnostics does not have an explanation for what this means. Please report this error so one can be added.`
     }
     r.Errort(errId, err, msgText+"{{.errMsg}}\n"+reason, msgData)
     return r
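
The retired authentication/authorization cases keyed on literal "401 Unauthorized" and "403 Forbidden ... denied by default" response text; the replacement cases match substrings that current servers actually return. A self-contained sketch of just those two new cases (the sample error strings are illustrative, not captured server output):

    package main

    import (
        "fmt"
        "strings"
    )

    // classify mirrors the new DCli0013/DCli0014 cases in the switch above.
    func classify(errMsg string) string {
        switch {
        case strings.Contains(errMsg, `User "system:anonymous" cannot`):
            return "DCli0013 (clientUnauthn: request was treated as anonymous)"
        case strings.Contains(errMsg, "provide credentials"):
            return "DCli0014 (clientUnauthz: credentials required or rejected)"
        }
        return "DCli0015 (unrecognized connection error)"
    }

    func main() {
        fmt.Println(classify(`User "system:anonymous" cannot list projects at the cluster scope`)) // illustrative
        fmt.Println(classify(`you must provide credentials to perform this operation`))            // illustrative
        fmt.Println(classify(`dial tcp 10.0.0.1:8443: connection refused`))                        // illustrative
    }
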
@@ -65,7 +65,7 @@ func (d *ConfigLoading) Check() types.DiagnosticResult {
     if foundPath != "" {
         if confFlagValue != "" && confFlagValue != foundPath {
             // found config but not where --config said
-            r.Errorf("discCCnotFlag", nil, `
+            r.Errorf("DCli1001", nil, `
 The client configuration file was not found where the --%s flag indicated:
 %s
 A config file was found at the following location:
@@ -76,7 +76,7 @@ with the --%[1]s flag, or just not specify the flag.
         }
     } else { // not found, check for master-generated ones to recommend
         if confFlagValue != "" {
-            r.Errorf("discCCnotFlag", nil, "Did not find config file where --%s=%s indicated", d.ConfFlagName, confFlagValue)
+            r.Errorf("DCli1002", nil, "Did not find config file where --%s=%s indicated", d.ConfFlagName, confFlagValue)
         }
         adminWarningF := `
 No client config file was available; however, one exists at
@@ -98,7 +98,7 @@ location for use by the client and diagnostics.
         for _, path := range adminPaths {
             msg := fmt.Sprintf("Looking for a possible client config at %s\n", path)
             if d.canOpenConfigFile(path, msg, r) {
-                r.Warnf("discCCautoPath", nil, adminWarningF, config.OpenShiftConfigPathEnvVar, path, config.RecommendedHomeFile)
+                r.Warnf("DCli1003", nil, adminWarningF, config.OpenShiftConfigPathEnvVar, path, config.RecommendedHomeFile)
                 break
             }
         }
@@ -115,28 +115,28 @@ func (d ConfigLoading) canOpenConfigFile(path string, errmsg string, r types.Dia
     if path == "" { // empty param/envvar
         return false
     } else if file, err = os.Open(path); err == nil {
-        r.Debugt("discOpenCC", "Reading client config at {{.path}}", log.Hash{"path": path})
+        r.Debugt("DCli1004", "Reading client config at {{.path}}", log.Hash{"path": path})
     } else if errmsg == "" {
-        r.Debugf("discOpenCCNo", "Could not read client config at %s:\n%#v", path, err)
+        r.Debugf("DCli1005", "Could not read client config at %s:\n%#v", path, err)
     } else if os.IsNotExist(err) {
-        r.Debug("discOpenCCNoExist", errmsg+"but that file does not exist.")
+        r.Debug("DCli1006", errmsg+"but that file does not exist.")
     } else if os.IsPermission(err) {
-        r.Error("discOpenCCNoPerm", err, errmsg+"but lack permission to read that file.")
+        r.Error("DCli1007", err, errmsg+"but lack permission to read that file.")
     } else {
-        r.Errorf("discOpenCCErr", err, "%sbut there was an error opening it:\n%#v", errmsg, err)
+        r.Errorf("DCli1008", err, "%sbut there was an error opening it:\n%#v", errmsg, err)
     }
     if file != nil { // it is open for reading
         defer file.Close()
         if buffer, err := ioutil.ReadAll(file); err != nil {
-            r.Errorf("discCCReadErr", err, "Unexpected error while reading client config file (%s): %v", path, err)
+            r.Errorf("DCli1009", err, "Unexpected error while reading client config file (%s): %v", path, err)
         } else if _, err := clientcmd.Load(buffer); err != nil {
-            r.Errorf("discCCYamlErr", err, `
+            r.Errorf("DCli1010", err, `
 Error reading YAML from client config file (%s):
 %v
 This file may have been truncated or mis-edited.
 Please fix, remove, or obtain a new client config`, file.Name(), err)
         } else {
-            r.Infof("discCCRead", "Successfully read a client config file at '%s'", path)
+            r.Infof("DCli1011", "Successfully read a client config file at '%s'", path)
             /* Note, we're not going to use this config file directly.
              * Instead, we'll defer to the openshift client code to assimilate
              * flags, env vars, and the potential hierarchy of config files
@@ -67,7 +67,7 @@ func (d *NodeDefinitions) CanRun() (bool, error) {
     if d.KubeClient == nil || d.OsClient == nil {
         return false, errors.New("must have kube and os client")
     }
-    can, err := adminCan(d.OsClient, kapi.NamespaceDefault, &authorizationapi.SubjectAccessReview{
+    can, err := adminCan(d.OsClient, authorizationapi.AuthorizationAttributes{
        Verb:     "list",
        Resource: "nodes",
     })
@@ -75,7 +75,7 @@ func (d *NodeDefinitions) CanRun() (bool, error) {
         msg := log.Message{ID: "clGetNodesFailed", EvaluatedText: fmt.Sprintf(clientErrorGettingNodes, err)}
         return false, types.DiagnosticError{msg.ID, &msg, err}
     } else if !can {
-        msg := log.Message{ID: "clGetNodesFailed", EvaluatedText: "Client does not have cluster-admin access and cannot see node records"}
+        msg := log.Message{ID: "clGetNodesFailed", EvaluatedText: "Client does not have access to see node status"}
         return false, types.DiagnosticError{msg.ID, &msg, err}
     }
     return true, nil
@@ -86,7 +86,7 @@ func (d *NodeDefinitions) Check() types.DiagnosticResult {
     nodes, err := d.KubeClient.Nodes().List(labels.LabelSelector{}, fields.Everything())
     if err != nil {
-        r.Errorf("clGetNodesFailed", err, clientErrorGettingNodes, err)
+        r.Errorf("DClu0001", err, clientErrorGettingNodes, err)
         return r
     }
@@ -110,15 +110,15 @@ func (d *NodeDefinitions) Check() types.DiagnosticResult {
             templateData["status"] = ready.Status
             templateData["reason"] = ready.Reason
         }
-        r.Warnt("clNodeNotReady", nil, nodeNotReady, templateData)
+        r.Warnt("DClu0002", nil, nodeNotReady, templateData)
     } else if node.Spec.Unschedulable {
-        r.Warnt("clNodeNotSched", nil, nodeNotSched, log.Hash{"node": node.Name})
+        r.Warnt("DClu0003", nil, nodeNotSched, log.Hash{"node": node.Name})
     } else {
         anyNodesAvail = true
     }
 }
 if !anyNodesAvail {
-    r.Error("clNoAvailNodes", nil, "There were no nodes available to use. No new pods can be scheduled.")
+    r.Error("DClu0004", nil, "There were no nodes available to use. No new pods can be scheduled.")
 }

 return r
@@ -4,7 +4,6 @@ import (
     "bufio"
     "fmt"
     "reflect"
-    "regexp"
     "strings"

     kapi "k8s.io/kubernetes/pkg/api"
@@ -80,12 +79,20 @@ succeeding but not triggering deployments (as they wait on notifications
 to the ImageStream from the build).

 There are many reasons for this step to fail, including invalid
-credentials, DNS failures, network errors, and so on. Examine the
-following error message from the registry pod logs to determine the
-problem:
+credentials, master outages, DNS failures, network errors, and so on. It
+can be temporary or ongoing. Check the most recent error message from the
+registry pod logs to determine the nature of the problem:

 {{.log}}`

+    clRegPodErr = `
+The pod logs for the "{{.podName}}" pod belonging to
+the "{{.registryName}}" service indicated unknown errors.
+This could result in problems with builds or deployments.
+Please examine the log entries to determine if there might be
+any related problems:
+{{.log}}`
+
     clRegNoEP = `
 The "{{.registryName}}" service exists with {{.numPods}} associated pod(s), but there
 are {{.numEP}} endpoints in the "{{.registryName}}" service.
@@ -134,7 +141,8 @@ func (d *ClusterRegistry) CanRun() (bool, error) {
     if d.OsClient == nil || d.KubeClient == nil {
         return false, fmt.Errorf("must have kube and os clients")
     }
-    return adminCan(d.OsClient, kapi.NamespaceDefault, &authorizationapi.SubjectAccessReview{
+    return adminCan(d.OsClient, authorizationapi.AuthorizationAttributes{
+        Namespace:    kapi.NamespaceDefault,
         Verb:         "get",
         Resource:     "services",
         ResourceName: registryName,
@@ -146,7 +154,7 @@ func (d *ClusterRegistry) Check() types.DiagnosticResult {
     if service := d.getRegistryService(r); service != nil {
         // Check that it actually has pod(s) selected and running
         if runningPods := d.getRegistryPods(service, r); len(runningPods) == 0 {
-            r.Errorf("clRegNoRunningPods ", nil, clRegNoRunningPods, registryName)
+            r.Errorf("DClu1001", nil, clRegNoRunningPods, registryName)
             return r
         } else if d.checkRegistryEndpoints(runningPods, r) { // Check that matching endpoint exists on the service
             // attempt to create an imagestream and see if it gets the same registry service IP from the service cache
@@ -159,13 +167,13 @@ func (d *ClusterRegistry) Check() types.DiagnosticResult {
 func (d *ClusterRegistry) getRegistryService(r types.DiagnosticResult) *kapi.Service {
     service, err := d.KubeClient.Services(kapi.NamespaceDefault).Get(registryName)
     if err != nil && reflect.TypeOf(err) == reflect.TypeOf(&kerrs.StatusError{}) {
-        r.Warnf("clGetRegNone", err, clGetRegNone, registryName, kapi.NamespaceDefault)
+        r.Warnf("DClu1002", err, clGetRegNone, registryName, kapi.NamespaceDefault)
         return nil
     } else if err != nil {
-        r.Errorf("clGetRegFailed", err, clGetRegFailed, err)
+        r.Errorf("DClu1003", err, clGetRegFailed, err)
         return nil
     }
-    r.Debugf("clRegFound", "Found %s service with ports %v", registryName, service.Spec.Ports)
+    r.Debugf("DClu1004", "Found %s service with ports %v", registryName, service.Spec.Ports)
     return service
 }
@@ -173,24 +181,24 @@ func (d *ClusterRegistry) getRegistryPods(service *kapi.Service, r types.Diagnos
     runningPods := []*kapi.Pod{}
     pods, err := d.KubeClient.Pods(kapi.NamespaceDefault).List(labels.SelectorFromSet(service.Spec.Selector), fields.Everything())
     if err != nil {
-        r.Errorf("clRegListPods", err, "Finding pods for '%s' service failed. This should never happen. Error: (%T) %[2]v", registryName, err)
+        r.Errorf("DClu1005", err, "Finding pods for '%s' service failed. This should never happen. Error: (%T) %[2]v", registryName, err)
         return runningPods
     } else if len(pods.Items) < 1 {
-        r.Errorf("clRegNoPods", nil, clRegNoPods, registryName)
+        r.Errorf("DClu1006", nil, clRegNoPods, registryName)
         return runningPods
     } else if len(pods.Items) > 1 {
         // multiple registry pods using EmptyDir will be inconsistent
         for _, volume := range pods.Items[0].Spec.Volumes {
             if volume.Name == registryVolume && volume.EmptyDir != nil {
-                r.Errorf("clRegMultiPods", nil, clRegMultiPods, registryName)
+                r.Errorf("DClu1007", nil, clRegMultiPods, registryName)
                 break
             }
         }
     }
     for _, pod := range pods.Items {
-        r.Debugf("clRegPodFound", "Found %s pod with name %s", registryName, pod.ObjectMeta.Name)
+        r.Debugf("DClu1008", "Found %s pod with name %s", registryName, pod.ObjectMeta.Name)
         if pod.Status.Phase != kapi.PodRunning {
-            r.Warnf("clRegPodDown", nil, clRegPodDown, pod.ObjectMeta.Name, registryName)
+            r.Warnf("DClu1009", nil, clRegPodDown, pod.ObjectMeta.Name, registryName)
         } else {
             runningPods = append(runningPods, &pod)
             // Check the logs for that pod for common issues (credentials, DNS resolution failure)
@@ -209,7 +217,7 @@ func (d *ClusterRegistry) checkRegistryLogs(pod *kapi.Pod, r types.DiagnosticRes
         Param("container", pod.Spec.Containers[0].Name).
         Stream()
     if err != nil {
-        r.Warnt("clRegPodLog", nil, clRegPodLog, log.Hash{
+        r.Warnt("DClu1010", nil, clRegPodLog, log.Hash{
             "error":        fmt.Sprintf("(%T) %[1]v", err),
             "podName":      pod.ObjectMeta.Name,
             "registryName": registryName,
@@ -218,24 +226,40 @@ func (d *ClusterRegistry) checkRegistryLogs(pod *kapi.Pod, r types.DiagnosticRes
     }
     defer readCloser.Close()

+    clientError := ""
+    registryError := ""
     scanner := bufio.NewScanner(readCloser)
     for scanner.Scan() {
         logLine := scanner.Text()
-        if regexp.MustCompile(`level=error msg="client error: Post http(\S+)/subjectaccessreviews`).MatchString(logLine) {
-            r.Errort("clRegPodConn", nil, clRegPodConn, log.Hash{
-                "log":          logLine,
-                "podName":      pod.ObjectMeta.Name,
-                "registryName": registryName,
-            })
-            break
+        // TODO: once the logging API gets "since" and "tail" and "limit", limit to more recent log entries
+        // https://github.com/kubernetes/kubernetes/issues/12447
+        if strings.Contains(logLine, `level=error msg="client error:`) {
+            clientError = logLine // end up showing only the most recent client error
+        } else if strings.Contains(logLine, "level=error msg=") {
+            registryError += "\n" + logLine // gather generic errors
         }
     }
+    if clientError != "" {
+        r.Errort("DClu1011", nil, clRegPodConn, log.Hash{
+            "log":          clientError,
+            "podName":      pod.ObjectMeta.Name,
+            "registryName": registryName,
+        })
+    }
+    if registryError != "" {
+        r.Warnt("DClu1012", nil, clRegPodErr, log.Hash{
+            "log":          registryError,
+            "podName":      pod.ObjectMeta.Name,
+            "registryName": registryName,
+        })
+    }
+
 }

 func (d *ClusterRegistry) checkRegistryEndpoints(pods []*kapi.Pod, r types.DiagnosticResult) bool {
     endPoint, err := d.KubeClient.Endpoints(kapi.NamespaceDefault).Get(registryName)
     if err != nil {
-        r.Errorf("clRegGetEP", err, `Finding endpoints for "%s" service failed. This should never happen. Error: (%[2]T) %[2]v`, registryName, err)
+        r.Errorf("DClu1013", err, `Finding endpoints for "%s" service failed. This should never happen. Error: (%[2]T) %[2]v`, registryName, err)
         return false
     }
     numEP := 0
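
The rewritten log scan above keeps only the most recent "client error" line and accumulates every other registry error line instead of stopping at the first match. A self-contained sketch of that classification over a few made-up lines (the samples are illustrative, not real docker-registry output):

    package main

    import (
        "fmt"
        "strings"
    )

    func main() {
        // Illustrative registry log lines; real registry output will differ.
        logLines := []string{
            `time="..." level=error msg="client error: Post https://master/subjectaccessreviews: EOF"`,
            `time="..." level=error msg="response completed with error"`,
            `time="..." level=error msg="client error: Post https://master/subjectaccessreviews: dial tcp: i/o timeout"`,
        }

        clientError := ""   // only the most recent client error is kept (DClu1011)
        registryError := "" // all other error lines are accumulated (DClu1012)
        for _, logLine := range logLines {
            if strings.Contains(logLine, `level=error msg="client error:`) {
                clientError = logLine
            } else if strings.Contains(logLine, "level=error msg=") {
                registryError += "\n" + logLine
            }
        }
        fmt.Println("client error:", clientError)
        fmt.Println("other errors:", registryError)
    }
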
@@ -243,7 +267,7 @@ func (d *ClusterRegistry) checkRegistryEndpoints(pods []*kapi.Pod, r types.Diagn
         numEP += len(subs.Addresses)
     }
     if numEP != len(pods) {
-        r.Warnt("clRegNoEP", nil, clRegNoEP, log.Hash{"registryName": registryName, "numPods": len(pods), "numEP": numEP})
+        r.Warnt("DClu1014", nil, clRegNoEP, log.Hash{"registryName": registryName, "numPods": len(pods), "numEP": numEP})
         return false
     }
     return true
@@ -252,12 +276,12 @@ func (d *ClusterRegistry) checkRegistryEndpoints(pods []*kapi.Pod, r types.Diagn
 func (d *ClusterRegistry) verifyRegistryImageStream(service *kapi.Service, r types.DiagnosticResult) {
     imgStream, err := d.OsClient.ImageStreams(kapi.NamespaceDefault).Create(&osapi.ImageStream{ObjectMeta: kapi.ObjectMeta{GenerateName: "diagnostic-test"}})
     if err != nil {
-        r.Errorf("clRegISCFail", err, "Creating test ImageStream failed. Error: (%T) %[1]v", err)
+        r.Errorf("DClu1015", err, "Creating test ImageStream failed. Error: (%T) %[1]v", err)
         return
     }
     defer func() { // delete what we created, or notify that we couldn't
         if err := d.OsClient.ImageStreams(kapi.NamespaceDefault).Delete(imgStream.ObjectMeta.Name); err != nil {
-            r.Warnt("clRegISDelFail", err, clRegISDelFail, log.Hash{
+            r.Warnt("DClu1016", err, clRegISDelFail, log.Hash{
                 "name":  imgStream.ObjectMeta.Name,
                 "error": fmt.Sprintf("(%T) %[1]s", err),
             })
@@ -265,14 +289,14 @@ func (d *ClusterRegistry) verifyRegistryImageStream(service *kapi.Service, r typ
     }()
     imgStream, err = d.OsClient.ImageStreams(kapi.NamespaceDefault).Get(imgStream.ObjectMeta.Name) // status is filled in post-create
     if err != nil {
-        r.Errorf("clRegISCFail", err, "Getting created test ImageStream failed. Error: (%T) %[1]v", err)
+        r.Errorf("DClu1017", err, "Getting created test ImageStream failed. Error: (%T) %[1]v", err)
         return
     }
-    r.Debugf("clRegISC", "Created test ImageStream: %[1]v", imgStream)
+    r.Debugf("DClu1018", "Created test ImageStream: %[1]v", imgStream)
     cacheHost := strings.SplitN(imgStream.Status.DockerImageRepository, "/", 2)[0]
     serviceHost := fmt.Sprintf("%s:%d", service.Spec.ClusterIP, service.Spec.Ports[0].Port)
     if cacheHost != serviceHost {
-        r.Errort("clRegISMismatch", nil, clRegISMismatch, log.Hash{
+        r.Errort("DClu1019", nil, clRegISMismatch, log.Hash{
            "serviceHost":  serviceHost,
            "cacheHost":    cacheHost,
            "registryName": registryName,
@@ -96,7 +96,8 @@ func (d *ClusterRouter) CanRun() (bool, error) {
     if d.KubeClient == nil || d.OsClient == nil {
         return false, errors.New("must have kube and os client")
     }
-    can, err := adminCan(d.OsClient, kapi.NamespaceDefault, &authorizationapi.SubjectAccessReview{
+    can, err := adminCan(d.OsClient, authorizationapi.AuthorizationAttributes{
+        Namespace:    kapi.NamespaceDefault,
         Verb:         "get",
         Resource:     "dc",
         ResourceName: routerName,
@@ -128,34 +129,34 @@ func (d *ClusterRouter) Check() types.DiagnosticResult {
 func (d *ClusterRouter) getRouterDC(r types.DiagnosticResult) *osapi.DeploymentConfig {
     dc, err := d.OsClient.DeploymentConfigs(kapi.NamespaceDefault).Get(routerName)
     if err != nil && reflect.TypeOf(err) == reflect.TypeOf(&kerrs.StatusError{}) {
-        r.Warnf("clGetRtNone", err, clGetRtNone, routerName)
+        r.Warnf("DClu2001", err, clGetRtNone, routerName)
         return nil
     } else if err != nil {
-        r.Errorf("clGetRtFailed", err, clGetRtFailed, routerName, err)
+        r.Errorf("DClu2002", err, clGetRtFailed, routerName, err)
         return nil
     }
-    r.Debugf("clRtFound", "Found default router DC")
+    r.Debugf("DClu2003", "Found default router DC")
     return dc
 }

 func (d *ClusterRouter) getRouterPods(dc *osapi.DeploymentConfig, r types.DiagnosticResult) *kapi.PodList {
     pods, err := d.KubeClient.Pods(kapi.NamespaceDefault).List(labels.SelectorFromSet(dc.Template.ControllerTemplate.Selector), fields.Everything())
     if err != nil {
-        r.Errorf("clRtListPods", err, "Finding pods for '%s' DeploymentConfig failed. This should never happen. Error: (%[2]T) %[2]v", routerName, err)
+        r.Errorf("DClu2004", err, "Finding pods for '%s' DeploymentConfig failed. This should never happen. Error: (%[2]T) %[2]v", routerName, err)
         return nil
     }
     running := []kapi.Pod{}
     for _, pod := range pods.Items {
         if pod.Status.Phase != kapi.PodRunning {
-            r.Debugf("clRtPodFound", "router pod with name %s is not running", pod.ObjectMeta.Name)
+            r.Debugf("DClu2005", "router pod with name %s is not running", pod.ObjectMeta.Name)
         } else {
             running = append(running, pod)
-            r.Debugf("clRtPodFound", "Found running router pod with name %s", pod.ObjectMeta.Name)
+            r.Debugf("DClu2006", "Found running router pod with name %s", pod.ObjectMeta.Name)
         }
     }
     pods.Items = running
     if len(running) == 0 {
-        r.Errorf("clRtNoPods", nil, clRtNoPods, routerName)
+        r.Errorf("DClu2007", nil, clRtNoPods, routerName)
         return nil
     }
     return pods
@@ -192,7 +193,7 @@ var referenceTimestampLayout = "2006-01-02T15:04:05.000000000Z"
 func (d *ClusterRouter) checkRouterLogs(pod *kapi.Pod, r types.DiagnosticResult) {
     scanner, err := d.getPodLogScanner(pod)
     if err != nil {
-        r.Warnt("clRtPodLog", err, clRtPodLog, log.Hash{
+        r.Warnt("DClu2008", err, clRtPodLog, log.Hash{
             "error":   fmt.Sprintf("(%T) %[1]v", err),
             "podName": pod.ObjectMeta.Name,
         })
@@ -207,7 +208,7 @@ func (d *ClusterRouter) checkRouterLogs(pod *kapi.Pod, r types.DiagnosticResult)
     // router checks every second. error only if failure is recent.
     // of course... we cannot always trust the local clock.
     if err == nil && time.Since(stamp).Seconds() < 30.0 {
-        r.Errort("clRtPodConn", nil, clRtPodConn, log.Hash{
+        r.Errort("DClu2009", nil, clRtPodConn, log.Hash{
            "reason":    matches[2],
            "timestamp": matches[1],
            "podName":   pod.ObjectMeta.Name,
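
The "recent failure" check above parses the timestamp carried in a router log line using the `referenceTimestampLayout` shown in the hunk header, and only raises DClu2009 when the failure happened within the last 30 seconds. A self-contained sketch of that recency test (the sample timestamp is made up):

    package main

    import (
        "fmt"
        "time"
    )

    // Same layout string as the referenceTimestampLayout variable in the router diagnostic.
    const referenceTimestampLayout = "2006-01-02T15:04:05.000000000Z"

    func main() {
        // Illustrative value; a real one would come from a parsed router log line.
        stamp, err := time.Parse(referenceTimestampLayout, "2015-08-10T17:06:05.123456789Z")
        if err != nil {
            fmt.Println("could not parse timestamp:", err)
            return
        }
        // The router retries every second, so only a fresh failure is worth an error.
        if time.Since(stamp).Seconds() < 30.0 {
            fmt.Println("recent failure - report error")
        } else {
            fmt.Println("stale failure - ignore")
        }
    }
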
@@ -5,8 +5,8 @@ import (
     osclient "github.com/openshift/origin/pkg/client"
 )

-func adminCan(client *osclient.Client, ns string, sar *authorizationapi.SubjectAccessReview) (bool, error) {
-    if resp, err := client.SubjectAccessReviews(ns).Create(sar); err != nil {
+func adminCan(client *osclient.Client, action authorizationapi.AuthorizationAttributes) (bool, error) {
+    if resp, err := client.SubjectAccessReviews().Create(&authorizationapi.SubjectAccessReview{Action: action}); err != nil {
         return false, err
     } else if resp.Allowed {
         return true, nil
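
With the namespace folded into the attributes, call sites describe the whole action in one literal. The two shapes now in use, echoing the call sites updated elsewhere in this diff (a fragment, not a standalone program):

    // Cluster-scoped check, as in makeClusterClients: no Namespace set.
    can, err := adminCan(osClient, authorizationapi.AuthorizationAttributes{
        Verb:     "*",
        Resource: "*",
    })

    // Namespaced check, as in the registry/router CanRun methods.
    can, err = adminCan(osClient, authorizationapi.AuthorizationAttributes{
        Namespace:    kapi.NamespaceDefault,
        Verb:         "get",
        Resource:     "services",
        ResourceName: registryName,
    })
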
@@ -32,17 +32,17 @@ func (d MasterConfigCheck) CanRun() (bool, error) {
 func (d MasterConfigCheck) Check() types.DiagnosticResult {
     r := types.NewDiagnosticResult(MasterConfigCheckName)

-    r.Debugf("discMCfile", "Looking for master config file at '%s'", d.MasterConfigFile)
+    r.Debugf("DH0001", "Looking for master config file at '%s'", d.MasterConfigFile)
     masterConfig, err := configapilatest.ReadAndResolveMasterConfig(d.MasterConfigFile)
     if err != nil {
-        r.Errorf("discMCfail", err, "Could not read master config file '%s':\n(%T) %[2]v", d.MasterConfigFile, err)
+        r.Errorf("DH0002", err, "Could not read master config file '%s':\n(%T) %[2]v", d.MasterConfigFile, err)
         return r
     }

-    r.Infof("discMCfound", "Found a master config file: %[1]s", d.MasterConfigFile)
+    r.Infof("DH0003", "Found a master config file: %[1]s", d.MasterConfigFile)

     for _, err := range configvalidation.ValidateMasterConfig(masterConfig).Errors {
-        r.Errorf("discMCinvalid", err, "Validation of master config file '%s' failed:\n(%T) %[2]v", d.MasterConfigFile, err)
+        r.Errorf("DH0004", err, "Validation of master config file '%s' failed:\n(%T) %[2]v", d.MasterConfigFile, err)
     }
     return r
 }
@@ -31,17 +31,17 @@ func (d NodeConfigCheck) CanRun() (bool, error) {
 }
 func (d NodeConfigCheck) Check() types.DiagnosticResult {
     r := types.NewDiagnosticResult(NodeConfigCheckName)
-    r.Debugf("discNCfile", "Looking for node config file at '%s'", d.NodeConfigFile)
+    r.Debugf("DH1001", "Looking for node config file at '%s'", d.NodeConfigFile)
     nodeConfig, err := configapilatest.ReadAndResolveNodeConfig(d.NodeConfigFile)
     if err != nil {
-        r.Errorf("discNCfail", err, "Could not read node config file '%s':\n(%T) %[2]v", d.NodeConfigFile, err)
+        r.Errorf("DH1002", err, "Could not read node config file '%s':\n(%T) %[2]v", d.NodeConfigFile, err)
         return r
     }

-    r.Infof("discNCfound", "Found a node config file: %[1]s", d.NodeConfigFile)
+    r.Infof("DH1003", "Found a node config file: %[1]s", d.NodeConfigFile)

     for _, err := range configvalidation.ValidateNodeConfig(nodeConfig) {
-        r.Errorf("discNCinvalid", err, "Validation of node config file '%s' failed:\n(%T) %[2]v", d.NodeConfigFile, err)
+        r.Errorf("DH1004", err, "Validation of node config file '%s' failed:\n(%T) %[2]v", d.NodeConfigFile, err)
     }
     return r
 }
@@ -151,15 +151,15 @@ var (
 // Provide a summary at the end
 func (l *Logger) Summary(warningsSeen int, errorsSeen int) {
-    l.Noticef("summary", "\nSummary of diagnostics execution (version %v):\n", version.Get())
+    l.Noticef("DL0001", "\nSummary of diagnostics execution (version %v):\n", version.Get())
     if warningsSeen > 0 {
-        l.Noticet("sumWarn", "Warnings seen: {{.warnings}}", Hash{"warnings": warningsSeen})
+        l.Noticet("DL0002", "Warnings seen: {{.warnings}}", Hash{"warnings": warningsSeen})
     }
     if errorsSeen > 0 {
-        l.Noticet("sumErr", "Errors seen: {{.errors}}", Hash{"errors": errorsSeen})
+        l.Noticet("DL0003", "Errors seen: {{.errors}}", Hash{"errors": errorsSeen})
     }
     if warningsSeen == 0 && errorsSeen == 0 {
-        l.Notice("sumNone", "Completed with no errors or warnings seen.")
+        l.Notice("DL0004", "Completed with no errors or warnings seen.")
     }
 }
@@ -257,13 +257,13 @@ func origin(skip int) string {
     }
 }
 func (l *Logger) logp(level Level, id string, text string) {
-    l.LogEntry(Entry{id, origin(1), level, Message{ID: id, EvaluatedText: text}})
+    l.LogEntry(Entry{id, origin(2), level, Message{ID: id, EvaluatedText: text}})
 }
 func (l *Logger) logf(level Level, id string, msg string, a ...interface{}) {
-    l.LogEntry(Entry{id, origin(1), level, Message{ID: id, EvaluatedText: fmt.Sprintf(msg, a...)}})
+    l.LogEntry(Entry{id, origin(2), level, Message{ID: id, EvaluatedText: fmt.Sprintf(msg, a...)}})
 }
 func (l *Logger) logt(level Level, id string, template string, data interface{}) {
-    l.LogEntry(Entry{id, origin(1), level, Message{ID: id, Template: template, TemplateData: data}})
+    l.LogEntry(Entry{id, origin(2), level, Message{ID: id, Template: template, TemplateData: data}})
 }

 func (l *Logger) Finish() {
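
The origin(1) to origin(2) change is the "controller origin off-by-one" fix: logp/logf/logt are always reached through a public wrapper (Notice, Errorf, Noticet, ...), so the interesting call site is two frames up the stack, not one. The real origin helper is not shown in this diff; the sketch below assumes it is built on runtime.Caller:

    package main

    import (
        "fmt"
        "path/filepath"
        "runtime"
    )

    // origin is an assumed stand-in for the logger's helper: report the
    // file:line of the frame `skip` levels above the caller of origin itself.
    func origin(skip int) string {
        _, file, line, ok := runtime.Caller(skip + 1)
        if !ok {
            return "unknown"
        }
        return fmt.Sprintf("%s:%d", filepath.Base(file), line)
    }

    func logf(msg string) { // analogous to Logger.logf
        // skip=2: skip logf itself and the public wrapper, landing on the real call site.
        fmt.Printf("[%s] %s\n", origin(2), msg)
    }

    func Errorf(msg string) { logf(msg) } // analogous to the public wrapper

    func main() {
        Errorf("diagnostic message") // origin should point at this line
    }
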
| ... | ... |
@@ -37,7 +37,7 @@ func (t *textLogger) Write(entry Entry) {
|
| 37 | 37 |
} |
| 38 | 38 |
text := strings.TrimSpace(entry.Message.EvaluatedText) |
| 39 | 39 |
if entry.Level.Level >= WarnLevel.Level {
|
| 40 |
- text = fmt.Sprintf("[ID \"%s\" from %s]\n", entry.ID, entry.Origin) + text
|
|
| 40 |
+ text = fmt.Sprintf("[%s from %s]\n", entry.ID, entry.Origin) + text
|
|
| 41 | 41 |
} |
| 42 | 42 |
if strings.Contains(text, "\n") { // separate multiline comments with newlines
|
| 43 | 43 |
if !t.lastNewline {
|
| ... | ... |
@@ -42,7 +42,7 @@ func (d AnalyzeLogs) Check() types.DiagnosticResult {
|
| 42 | 42 |
|
| 43 | 43 |
for _, unit := range unitLogSpecs {
|
| 44 | 44 |
if svc := d.SystemdUnits[unit.Name]; svc.Enabled || svc.Active {
|
| 45 |
- r.Infof("sdCheckLogs", "Checking journalctl logs for '%s' service", unit.Name)
|
|
| 45 |
+ r.Infof("DS0001", "Checking journalctl logs for '%s' service", unit.Name)
|
|
| 46 | 46 |
|
| 47 | 47 |
cmd := exec.Command("journalctl", "-ru", unit.Name, "--output=json")
|
| 48 | 48 |
// JSON comes out of journalctl one line per record |
| ... | ... |
@@ -58,7 +58,7 @@ func (d AnalyzeLogs) Check() types.DiagnosticResult {
|
| 58 | 58 |
}(cmd) |
| 59 | 59 |
|
| 60 | 60 |
if err != nil {
|
| 61 |
- r.Errorf("sdLogReadErr", err, sdLogReadErr, unit.Name, errStr(err))
|
|
| 61 |
+ r.Errorf("DS0002", err, sdLogReadErr, unit.Name, errStr(err))
|
|
| 62 | 62 |
return r |
| 63 | 63 |
} |
| 64 | 64 |
defer func() { // close out pipe once done reading
|
| ... | ... |
@@ -75,10 +75,10 @@ func (d AnalyzeLogs) Check() types.DiagnosticResult {
|
| 75 | 75 |
} |
| 76 | 76 |
bytes, entry := lineReader.Bytes(), logEntry{}
|
| 77 | 77 |
if err := json.Unmarshal(bytes, &entry); err != nil {
|
| 78 |
- r.Debugf("sdLogBadJSON", "Couldn't read the JSON for this log message:\n%s\nGot error %s", string(bytes), errStr(err))
|
|
| 78 |
+ r.Debugf("DS0003", "Couldn't read the JSON for this log message:\n%s\nGot error %s", string(bytes), errStr(err))
|
|
| 79 | 79 |
} else {
|
| 80 | 80 |
if lineCount > 500 && stampTooOld(entry.TimeStamp, timeLimit) {
|
| 81 |
- r.Debugf("sdLogTrunc", "Stopped reading %s log: timestamp %s too old", unit.Name, entry.TimeStamp)
|
|
| 81 |
+ r.Debugf("DS0004", "Stopped reading %s log: timestamp %s too old", unit.Name, entry.TimeStamp)
|
|
| 82 | 82 |
break // if we've analyzed at least 500 entries, stop when age limit reached (don't scan days of logs) |
| 83 | 83 |
} |
| 84 | 84 |
if unit.StartMatch.MatchString(entry.Message) {
|
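The cutoff above needs to compare journald's __REALTIME_TIMESTAMP (a string of microseconds since the Unix epoch) against a time limit. The real stampTooOld is not shown in this hunk; the following is a hedged reconstruction under that timestamp-format assumption:

package main

import (
    "fmt"
    "strconv"
    "time"
)

// stampTooOld reports whether a journald __REALTIME_TIMESTAMP value
// (microseconds since the Unix epoch, as a string) is older than the limit.
// This is an assumed reconstruction, not the project's implementation.
func stampTooOld(stamp string, limit time.Time) bool {
    usec, err := strconv.ParseInt(stamp, 10, 64)
    if err != nil {
        return false // unparseable timestamp: don't cut the scan short
    }
    return time.Unix(0, usec*int64(time.Microsecond)).Before(limit)
}

func main() {
    limit := time.Now().Add(-time.Hour) // e.g. only analyze the last hour of logs
    twoHoursAgo := time.Now().Add(-2*time.Hour).UnixNano() / int64(time.Microsecond)
    fmt.Println(stampTooOld(fmt.Sprintf("%d", twoHoursAgo), limit)) // true
}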
| ... | ... |
@@ -12,23 +12,23 @@ import ( |
| 12 | 12 |
func GetSystemdUnits(logger *log.Logger) map[string]types.SystemdUnit {
|
| 13 | 13 |
systemdUnits := map[string]types.SystemdUnit{}
|
| 14 | 14 |
|
| 15 |
- logger.Notice("discBeginSysd", "Performing systemd discovery")
|
|
| 15 |
+ logger.Notice("DS1001", "Performing systemd discovery")
|
|
| 16 | 16 |
for _, name := range []string{"openshift", "openshift-master", "openshift-node", "openshift-sdn-master", "openshift-sdn-node", "docker", "openvswitch", "iptables", "etcd", "kubernetes"} {
|
| 17 | 17 |
systemdUnits[name] = discoverSystemdUnit(logger, name) |
| 18 | 18 |
|
| 19 | 19 |
if systemdUnits[name].Exists {
|
| 20 |
- logger.Debugf("discUnit", "Saw systemd unit %s", name)
|
|
| 20 |
+ logger.Debugf("DS1002", "Saw systemd unit %s", name)
|
|
| 21 | 21 |
} |
| 22 | 22 |
} |
| 23 | 23 |
|
| 24 |
- logger.Debugf("discUnits", "%v", systemdUnits)
|
|
| 24 |
+ logger.Debugf("DS1003", "%v", systemdUnits)
|
|
| 25 | 25 |
return systemdUnits |
| 26 | 26 |
} |
| 27 | 27 |
|
| 28 | 28 |
func discoverSystemdUnit(logger *log.Logger, name string) types.SystemdUnit {
|
| 29 | 29 |
unit := types.SystemdUnit{Name: name, Exists: false}
|
| 30 | 30 |
if output, err := exec.Command("systemctl", "show", name).Output(); err != nil {
|
| 31 |
- logger.Errorf("discCtlErr", "Error running `systemctl show %s`: %s\nCannot analyze systemd units.", name, err.Error())
|
|
| 31 |
+ logger.Errorf("DS1004", "Error running `systemctl show %s`: %s\nCannot analyze systemd units.", name, err.Error())
|
|
| 32 | 32 |
|
| 33 | 33 |
} else {
|
| 34 | 34 |
attr := make(map[string]string) |
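The hunk ends just as attr is created; judging from the keys read later (LoadState, UnitFileState, ActiveState, StatusErrno), it is filled by parsing the Key=Value lines that `systemctl show` prints. A standalone sketch of that parsing step (the showUnit helper is illustrative and needs a systemd host to actually run):

package main

import (
    "fmt"
    "os/exec"
    "strings"
)

// showUnit runs `systemctl show <name>` and parses its Key=Value output
// into a map, the same shape as the attr map above.
func showUnit(name string) (map[string]string, error) {
    output, err := exec.Command("systemctl", "show", name).Output()
    if err != nil {
        return nil, err
    }
    attr := make(map[string]string)
    for _, line := range strings.Split(string(output), "\n") {
        if parts := strings.SplitN(line, "=", 2); len(parts) == 2 {
            attr[parts[0]] = parts[1]
        }
    }
    return attr, nil
}

func main() {
    attr, err := showUnit("docker")
    if err != nil {
        fmt.Println("systemctl error:", err)
        return
    }
    fmt.Println("LoadState:", attr["LoadState"])
    fmt.Println("UnitFileState:", attr["UnitFileState"])
    fmt.Println("ActiveState:", attr["ActiveState"])
}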
| ... | ... |
@@ -40,7 +40,7 @@ func discoverSystemdUnit(logger *log.Logger, name string) types.SystemdUnit {
|
| 40 | 40 |
} |
| 41 | 41 |
|
| 42 | 42 |
if val := attr["LoadState"]; val != "loaded" {
|
| 43 |
- logger.Debugf("discUnitENoExist", "systemd unit '%s' does not exist. LoadState is '%s'", name, val)
|
|
| 43 |
+ logger.Debugf("DS1005", "systemd unit '%s' does not exist. LoadState is '%s'", name, val)
|
|
| 44 | 44 |
return unit // doesn't exist - leave everything blank |
| 45 | 45 |
|
| 46 | 46 |
} else {
|
| ... | ... |
@@ -48,19 +48,19 @@ func discoverSystemdUnit(logger *log.Logger, name string) types.SystemdUnit {
|
| 48 | 48 |
} |
| 49 | 49 |
|
| 50 | 50 |
if val := attr["UnitFileState"]; val == "enabled" {
|
| 51 |
- logger.Debugf("discUnitEnabled", "systemd unit '%s' is enabled - it will start automatically at boot.", name)
|
|
| 51 |
+ logger.Debugf("DS1006", "systemd unit '%s' is enabled - it will start automatically at boot.", name)
|
|
| 52 | 52 |
unit.Enabled = true |
| 53 | 53 |
|
| 54 | 54 |
} else {
|
| 55 |
- logger.Debugf("discUnitNoEnable", "systemd unit '%s' is not enabled - it does not start automatically at boot. UnitFileState is '%s'", name, val)
|
|
| 55 |
+ logger.Debugf("DS1007", "systemd unit '%s' is not enabled - it does not start automatically at boot. UnitFileState is '%s'", name, val)
|
|
| 56 | 56 |
} |
| 57 | 57 |
|
| 58 | 58 |
if val := attr["ActiveState"]; val == "active" {
|
| 59 |
- logger.Debugf("discUnitActive", "systemd unit '%s' is currently running", name)
|
|
| 59 |
+ logger.Debugf("DS1008", "systemd unit '%s' is currently running", name)
|
|
| 60 | 60 |
unit.Active = true |
| 61 | 61 |
|
| 62 | 62 |
} else {
|
| 63 |
- logger.Debugf("discUnitNoActive", "systemd unit '%s' is not currently running. ActiveState is '%s'; exit code was %d.", name, val, unit.ExitStatus)
|
|
| 63 |
+ logger.Debugf("DS1009", "systemd unit '%s' is not currently running. ActiveState is '%s'; exit code was %d.", name, val, unit.ExitStatus)
|
|
| 64 | 64 |
} |
| 65 | 65 |
|
| 66 | 66 |
fmt.Sscanf(attr["StatusErrno"], "%d", &unit.ExitStatus) // ignore errors... |
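On the "// ignore errors..." comment above: fmt.Sscanf returns a count and an error, and discarding both simply leaves ExitStatus at zero when StatusErrno is empty or non-numeric, e.g.:

package main

import "fmt"

func main() {
    var exitStatus int
    fmt.Sscanf("", "%d", &exitStatus)    // parse fails; exitStatus stays 0
    fmt.Println(exitStatus)              // 0
    fmt.Sscanf("203", "%d", &exitStatus) // parses successfully
    fmt.Println(exitStatus)              // 203
}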
| ... | ... |
@@ -61,26 +61,16 @@ var tlsClientErrorSeen map[string]bool |
| 61 | 61 |
var unitLogSpecs = []*unitSpec{
|
| 62 | 62 |
{
|
| 63 | 63 |
Name: "openshift-master", |
| 64 |
- StartMatch: regexp.MustCompile("Starting master on"),
|
|
| 64 |
+ StartMatch: regexp.MustCompile("Starting \\w+ Master"),
|
|
| 65 | 65 |
LogMatchers: []logMatcher{
|
| 66 | 66 |
badImageTemplate, |
| 67 | 67 |
{
|
| 68 | 68 |
Regexp: regexp.MustCompile("Unable to decode an event from the watch stream: local error: unexpected message"),
|
| 69 | 69 |
Level: log.InfoLevel, |
| 70 |
- Id: "sdLogOMIgnore", |
|
| 70 |
+ Id: "DS2003", |
|
| 71 | 71 |
Interpretation: "You can safely ignore this message.", |
| 72 | 72 |
}, |
| 73 | 73 |
{
|
| 74 |
- Regexp: regexp.MustCompile("HTTP probe error: Get .*/healthz: dial tcp .*:10250: connection refused"),
|
|
| 75 |
- Level: log.InfoLevel, |
|
| 76 |
- Id: "sdLogOMhzRef", |
|
| 77 |
- Interpretation: ` |
|
| 78 |
-The master does a health check on nodes that are defined in its records, |
|
| 79 |
-and this error is the result when the node is not available yet. |
|
| 80 |
-This is not usually a problem, unless it continues in the logs after |
|
| 81 |
-the node is actually available.`, |
|
| 82 |
- }, |
|
| 83 |
- {
|
|
| 84 | 74 |
// TODO: don't rely on ipv4 format, should be ipv6 "soon" |
| 85 | 75 |
Regexp: regexp.MustCompile("http: TLS handshake error from ([\\d.]+):\\d+: remote error: bad certificate"),
|
| 86 | 76 |
Level: log.WarnLevel, |
| ... | ... |
@@ -90,7 +80,7 @@ the node is actually available.`, |
| 90 | 90 |
if tlsClientErrorSeen == nil { // first time this message was seen
|
| 91 | 91 |
tlsClientErrorSeen = map[string]bool{client: true}
|
| 92 | 92 |
// TODO: too generic, adjust message depending on subnet of the "from" address |
| 93 |
- r.Warn("sdLogOMreBadCert", nil, prelude+`
|
|
| 93 |
+ r.Warn("DS2001", nil, prelude+`
|
|
| 94 | 94 |
This error indicates that a client attempted to connect to the master |
| 95 | 95 |
HTTPS API server but broke off the connection because the master's |
| 96 | 96 |
certificate is not validated by a certificate authority (CA) acceptable |
| ... | ... |
@@ -101,8 +91,8 @@ At this time, the master API certificate is signed by a private CA |
| 101 | 101 |
(created the first time the master runs) and clients should have a copy of |
| 102 | 102 |
that CA certificate in order to validate connections to the master. Most |
| 103 | 103 |
likely, either: |
| 104 |
-1. the master has generated a new CA (after the administrator deleted |
|
| 105 |
- the old one) and the client has a copy of the old CA cert, or |
|
| 104 |
+1. the master has generated a new CA (e.g. after the administrator |
|
| 105 |
+ deleted the old one) and the client has a copy of the old CA cert, or |
|
| 106 | 106 |
2. the client hasn't been configured with a private CA at all (or the |
| 107 | 107 |
wrong one), or |
| 108 | 108 |
3. the client is attempting to reach the master at a URL that isn't |
| ... | ... |
@@ -131,99 +121,66 @@ log message: |
| 131 | 131 |
|
| 132 | 132 |
} else if !tlsClientErrorSeen[client] {
|
| 133 | 133 |
tlsClientErrorSeen[client] = true |
| 134 |
- r.Warn("sdLogOMreBadCert", nil, prelude+`This message was diagnosed above, but for a different client address.`)
|
|
| 134 |
+ r.Warn("DS2002", nil, prelude+`This message was diagnosed above, but for a different client address.`)
|
|
| 135 | 135 |
} // else, it's a repeat, don't mention it |
| 136 | 136 |
return true // show once for every client failing to connect, not just the first |
| 137 | 137 |
}, |
| 138 | 138 |
}, |
| 139 |
- {
|
|
| 140 |
- // user &{system:anonymous [system:unauthenticated]} -> /api/v\\w+/services?namespace="
|
|
| 141 |
- Regexp: regexp.MustCompile("system:anonymous\\W*system:unauthenticated\\W*/api/v\\w+/services\\?namespace="),
|
|
| 142 |
- Level: log.WarnLevel, |
|
| 143 |
- Id: "sdLogOMunauthNode", |
|
| 144 |
- Interpretation: ` |
|
| 145 |
-This indicates the API server (master) received an unscoped request to |
|
| 146 |
-get Services. Requests like this probably come from a node trying to |
|
| 147 |
-discover where it should proxy services. |
|
| 148 |
- |
|
| 149 |
-However, the request was unauthenticated, so it was denied. The node |
|
| 150 |
-either did not offer a client certificate for credential, or offered an |
|
| 151 |
-invalid one (not signed by the certificate authority the master uses). |
|
| 152 |
-The node will not be able to function without this access. |
|
| 153 |
- |
|
| 154 |
-Unfortunately, this message does not tell us *which* node is the |
|
| 155 |
-problem. But running diagnostics on your node hosts should find a log |
|
| 156 |
-message for any node with this problem. |
|
| 157 |
-`, |
|
| 158 |
- }, |
|
| 159 | 139 |
}, |
| 160 | 140 |
}, |
| 161 | 141 |
{
|
| 162 | 142 |
Name: "openshift-node", |
| 163 |
- StartMatch: regexp.MustCompile("Starting OpenShift node"), //systemd puts this out; could change
|
|
| 143 |
+ StartMatch: regexp.MustCompile("Starting \\w+ Node"), //systemd puts this out; could change
|
|
| 164 | 144 |
LogMatchers: []logMatcher{
|
| 165 | 145 |
badImageTemplate, |
| 166 | 146 |
{
|
| 167 |
- Regexp: regexp.MustCompile("Unable to load services: Get (http\\S+/api/v\\w+/services\\?namespace=): (.+)"), // e.g. x509: certificate signed by unknown authority
|
|
| 147 |
+ Regexp: regexp.MustCompile(`Unable to register.*"system:anonymous"`), |
|
| 168 | 148 |
Level: log.ErrorLevel, |
| 169 |
- Id: "sdLogONconnMaster", |
|
| 149 |
+ Id: "DS2004", |
|
| 170 | 150 |
Interpretation: ` |
| 171 |
-openshift-node could not connect to the master API in order to determine |
|
| 172 |
-its responsibilities. This host will not function as a node until this |
|
| 173 |
-is resolved. Pods scheduled for this node will remain in pending or |
|
| 174 |
-unknown state forever.`, |
|
| 175 |
- }, |
|
| 176 |
- {
|
|
| 177 |
- Regexp: regexp.MustCompile(`Unable to load services: request.*403 Forbidden: Forbidden: "/api/v\w+/services\?namespace=" denied by default`), |
|
| 178 |
- Level: log.ErrorLevel, |
|
| 179 |
- Id: "sdLogONMasterForbids", |
|
| 180 |
- Interpretation: ` |
|
| 181 |
-openshift-node could not connect to the master API to determine |
|
| 182 |
-its responsibilities because it lacks the proper credentials. Nodes |
|
| 183 |
-should specify a client certificate in order to identify themselves to |
|
| 184 |
-the master. This message typically means that either no client key/cert |
|
| 185 |
-was supplied, or it is not validated by the certificate authority (CA) |
|
| 186 |
-the master uses. You should supply a correct client key and certificate |
|
| 187 |
-in the .kubeconfig specified in node-config.yaml |
|
| 151 |
+openshift-node could not register with the master API because it lacks |
|
| 152 |
+the proper credentials. Nodes should specify a client certificate in |
|
| 153 |
+order to identify themselves to the master. This message typically means |
|
| 154 |
+that either no client key/cert was supplied, or it is not validated |
|
| 155 |
+by the certificate authority (CA) the master uses. You should supply |
|
| 156 |
+a correct client key and certificate in the .kubeconfig specified in |
|
| 157 |
+node-config.yaml |
|
| 188 | 158 |
|
| 189 | 159 |
This host will not function as a node until this is resolved. Pods |
| 190 | 160 |
scheduled for this node will remain in pending or unknown state forever.`, |
| 191 | 161 |
}, |
| 192 | 162 |
{
|
| 193 |
- Regexp: regexp.MustCompile("Could not find an allocated subnet for this minion.*Waiting.."),
|
|
| 163 |
+ Regexp: regexp.MustCompile("Could not find an allocated subnet for"),
|
|
| 194 | 164 |
Level: log.WarnLevel, |
| 195 |
- Id: "sdLogOSNnoSubnet", |
|
| 165 |
+ Id: "DS2005", |
|
| 196 | 166 |
Interpretation: ` |
| 197 | 167 |
This warning occurs when openshift-node is trying to request the |
| 198 | 168 |
SDN subnet it should be configured with according to the master, |
| 199 |
-but either can't connect to it ("All the given peers are not reachable")
|
|
| 200 |
-or has not yet been assigned a subnet ("Key not found").
|
|
| 169 |
+but either can't connect to it or has not yet been assigned a subnet. |
|
| 201 | 170 |
|
| 202 |
-This can just be a matter of waiting for the master to become fully |
|
| 203 |
-available and define a record for the node (aka "minion") to use, |
|
| 204 |
-and openshift-node will wait until that occurs, so the presence |
|
| 205 |
-of this message in the node log isn't necessarily a problem as |
|
| 206 |
-long as the SDN is actually working, but this message may help indicate |
|
| 207 |
-the problem if it is not working. |
|
| 171 |
+This can occur before the master becomes fully available and defines a |
|
| 172 |
+record for the node to use; openshift-node will wait until that occurs, |
|
| 173 |
+so the presence of this message in the node log isn't necessarily a |
|
| 174 |
+problem as long as the SDN is actually working, but this message may |
|
| 175 |
+help indicate the problem if it is not working. |
|
| 208 | 176 |
|
| 209 |
-If the master is available and this node's record is defined and this |
|
| 210 |
-message persists, then it may be a sign of a different misconfiguration. |
|
| 211 |
-Unfortunately the message is not specific about why the connection failed. |
|
| 212 |
-Check the master's URL in the node configuration. |
|
| 177 |
+If the master is available and this log message persists, then it may |
|
| 178 |
+be a sign of a different misconfiguration. Check the master's URL in |
|
| 179 |
+the node kubeconfig. |
|
| 213 | 180 |
* Is the protocol http? It should be https. |
| 214 |
- * Can you reach the address and port from the node using curl? |
|
| 215 |
- ("404 page not found" is correct response)`,
|
|
| 181 |
+ * Can you reach the address and port from the node using curl -k? |
|
| 182 |
+`, |
|
| 216 | 183 |
}, |
| 217 | 184 |
}, |
| 218 | 185 |
}, |
| 219 | 186 |
{
|
| 220 | 187 |
Name: "docker", |
| 221 |
- StartMatch: regexp.MustCompile(`Starting Docker Application Container Engine.`), // RHEL Docker at least |
|
| 188 |
+ StartMatch: regexp.MustCompile(`Starting Docker`), // RHEL Docker at least |
|
| 222 | 189 |
LogMatchers: []logMatcher{
|
| 223 | 190 |
{
|
| 224 | 191 |
Regexp: regexp.MustCompile(`Usage: docker \\[OPTIONS\\] COMMAND`), |
| 225 | 192 |
Level: log.ErrorLevel, |
| 226 |
- Id: "sdLogDbadOpt", |
|
| 193 |
+ Id: "DS2006", |
|
| 227 | 194 |
Interpretation: ` |
| 228 | 195 |
This indicates that docker failed to parse its command line |
| 229 | 196 |
successfully, so it just printed a standard usage message and exited. |
| ... | ... |
@@ -236,7 +193,7 @@ The node will not run on this host until this is resolved.`, |
| 236 | 236 |
{
|
| 237 | 237 |
Regexp: regexp.MustCompile(`^Unable to open the database file: unable to open database file$`), |
| 238 | 238 |
Level: log.ErrorLevel, |
| 239 |
- Id: "sdLogDopenDB", |
|
| 239 |
+ Id: "DS2007", |
|
| 240 | 240 |
Interpretation: ` |
| 241 | 241 |
This indicates that docker failed to record its state to its database. |
| 242 | 242 |
The most likely reason is that it is out of disk space. It is also |
| ... | ... |
@@ -254,7 +211,7 @@ The node will not run on this host until this is resolved.`, |
| 254 | 254 |
{
|
| 255 | 255 |
Regexp: regexp.MustCompile(`no space left on device$`), |
| 256 | 256 |
Level: log.ErrorLevel, |
| 257 |
- Id: "sdLogDfull", |
|
| 257 |
+ Id: "DS2008", |
|
| 258 | 258 |
Interpretation: ` |
| 259 | 259 |
This indicates that docker has run out of space for container volumes |
| 260 | 260 |
or metadata (by default, stored in /var/lib/docker, but configurable). |
| ... | ... |
@@ -272,7 +229,7 @@ The node will not run on this host until this is resolved.`, |
| 272 | 272 |
{ // generic error seen - do this last
|
| 273 | 273 |
Regexp: regexp.MustCompile(`\\slevel="fatal"\\s`), |
| 274 | 274 |
Level: log.ErrorLevel, |
| 275 |
- Id: "sdLogDfatal", |
|
| 275 |
+ Id: "DS2009", |
|
| 276 | 276 |
Interpretation: ` |
| 277 | 277 |
This is not a known problem, but it is causing Docker to crash, |
| 278 | 278 |
so the node will not run on this host until it is resolved.`, |
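Putting the matcher changes above together: each unit spec pairs a loosened StartMatch (such as `Starting \w+ Node`) with a list of logMatcher entries. The sketch below applies such matchers to made-up journal lines; the field names mirror the structs in the diff, but the apply loop, the sample lines, and the idea that the scan stops at the most recent service-start message are assumptions for illustration:

package main

import (
    "fmt"
    "regexp"
)

// logMatcher mirrors the fields used above: a pattern, a severity,
// a message ID, and an interpretation to report when the pattern fires.
type logMatcher struct {
    Regexp         *regexp.Regexp
    Level          string // the project uses log levels; a string stands in here
    Id             string
    Interpretation string
}

var startMatch = regexp.MustCompile(`Starting \w+ Node`)

var matchers = []logMatcher{
    {
        Regexp:         regexp.MustCompile(`Unable to register.*"system:anonymous"`),
        Level:          "error",
        Id:             "DS2004",
        Interpretation: "the node lacks credentials to register with the master",
    },
    {
        Regexp:         regexp.MustCompile("Could not find an allocated subnet for"),
        Level:          "warn",
        Id:             "DS2005",
        Interpretation: "the node has not yet been assigned an SDN subnet",
    },
}

func main() {
    // Made-up journal lines, newest first, for illustration only.
    lines := []string{
        `Could not find an allocated subnet for node-1.example.com`,
        `Starting Example Node`,
    }
    for _, line := range lines {
        for _, m := range matchers {
            if m.Regexp.MatchString(line) {
                fmt.Printf("[%s %s] %s\n", m.Level, m.Id, m.Interpretation)
            }
        }
        if startMatch.MatchString(line) {
            fmt.Println("reached the most recent service start; stop scanning older entries")
            break
        }
    }
}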
| ... | ... |
@@ -46,7 +46,7 @@ func (d UnitStatus) Check() types.DiagnosticResult {
|
| 46 | 46 |
// Anything that is enabled but not running deserves notice |
| 47 | 47 |
for name, unit := range d.SystemdUnits {
|
| 48 | 48 |
if unit.Enabled && !unit.Active {
|
| 49 |
- r.Errort("sdUnitInactive", nil, sdUnitInactive, log.Hash{"unit": name})
|
|
| 49 |
+ r.Errort("DS3001", nil, sdUnitInactive, log.Hash{"unit": name})
|
|
| 50 | 50 |
} |
| 51 | 51 |
} |
| 52 | 52 |
return r |
| ... | ... |
@@ -56,9 +56,9 @@ func unitRequiresUnit(r types.DiagnosticResult, unit types.SystemdUnit, requires |
| 56 | 56 |
templateData := log.Hash{"unit": unit.Name, "required": requires.Name, "reason": reason}
|
| 57 | 57 |
|
| 58 | 58 |
if (unit.Active || unit.Enabled) && !requires.Exists {
|
| 59 |
- r.Errort("sdUnitReqLoaded", nil, sdUnitReqLoaded, templateData)
|
|
| 59 |
+ r.Errort("DS3002", nil, sdUnitReqLoaded, templateData)
|
|
| 60 | 60 |
} else if unit.Active && !requires.Active {
|
| 61 |
- r.Errort("sdUnitReqActive", nil, sdUnitReqActive, templateData)
|
|
| 61 |
+ r.Errort("DS3003", nil, sdUnitReqActive, templateData)
|
|
| 62 | 62 |
} |
| 63 | 63 |
} |
| 64 | 64 |
|
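For completeness, the dependency rule above reduces to two cases: a unit that is enabled or active while a required unit is not loaded at all, and a unit that is active while its requirement is not running. A self-contained sketch with hypothetical unit values (reporting via Printf instead of the templated diagnostic results):

package main

import "fmt"

// SystemdUnit carries the discovery results used by the checks above.
type SystemdUnit struct {
    Name    string
    Exists  bool
    Enabled bool
    Active  bool
}

// requireUnit mirrors the two conditions in unitRequiresUnit: report when
// `unit` is in use but `requires` is missing, or active while `requires` is not.
func requireUnit(unit, requires SystemdUnit, reason string) {
    if (unit.Active || unit.Enabled) && !requires.Exists {
        fmt.Printf("error: %s requires %s, which is not loaded (%s)\n", unit.Name, requires.Name, reason)
    } else if unit.Active && !requires.Active {
        fmt.Printf("error: %s requires %s, which is not running (%s)\n", unit.Name, requires.Name, reason)
    }
}

func main() {
    node := SystemdUnit{Name: "openshift-node", Exists: true, Enabled: true, Active: true}
    docker := SystemdUnit{Name: "docker", Exists: true, Enabled: true, Active: false}
    requireUnit(node, docker, "nodes run containers via docker") // hypothetical reason text
}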