fix SubjectAccessReview usage
fix 401/403 errors, cluster-context abort, controller origin off-by-one
fix readme
config_contexts - fix wording, remove cruft
simplify adminCan()
better registry error reporting
fix crufty journald log matches
remove meaning from message IDs
@@ -24,7 +24,7 @@ func (o DiagnosticsOptions) buildClientDiagnostics(rawConfig *clientcmdapi.Confi
     // osClient, kubeClient, clientErr := o.Factory.Clients() // use with a diagnostic that needs OpenShift/Kube client
     _, _, clientErr := o.Factory.Clients()
     if clientErr != nil {
-        o.Logger.Notice("clLoadDefaultFailed", "Failed creating client from config; client diagnostics will be limited to config testing")
+        o.Logger.Notice("CED0001", "Failed creating client from config; client diagnostics will be limited to config testing")
         available = util.NewStringSet(clientdiags.ConfigContextsName)
     }
@@ -2,6 +2,7 @@ package diagnostics
 import (
     "fmt"
+    "regexp"
     "strings"

     kclient "k8s.io/kubernetes/pkg/client"
@@ -38,8 +39,8 @@ func (o DiagnosticsOptions) buildClusterDiagnostics(rawConfig *clientcmdapi.Conf
     clusterClient, kclusterClient, found, err := o.findClusterClients(rawConfig)
     if !found {
-        o.Logger.Notice("noClustCtx", "No cluster-admin client config found; skipping cluster diagnostics.")
-        return nil, false, err
+        o.Logger.Notice("CED1002", "No cluster-admin client config found; skipping cluster diagnostics.")
+        return nil, true, err
     }
     diagnostics := []types.Diagnostic{}
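
Note on the `return nil, true, err` change above: the middle return value is what the caller checks to decide whether to keep running at all, so returning false when cluster diagnostics are merely skipped aborted the whole run (the "cluster-context abort" in the commit message). A minimal sketch of that convention, assuming a caller shaped like RunDiagnostics but not copying its actual code:

    package main

    import "fmt"

    // buildArea mimics the (diagnostics, ok, err) convention assumed by the
    // caller: ok=false means "abort the whole diagnostics run", not merely
    // "this area was skipped" - hence returning true when skipping.
    func buildArea(found bool) ([]string, bool, error) {
        if !found {
            return nil, true, nil // skip cluster diagnostics but keep going
        }
        return []string{"ClusterRegistry", "ClusterRouter"}, true, nil
    }

    func main() {
        diags, ok, err := buildArea(false)
        fmt.Println(diags, ok, err) // [] true <nil>: the run continues
    }
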
@@ -64,7 +65,7 @@ func (o DiagnosticsOptions) findClusterClients(rawConfig *clientcmdapi.Config) (
     if o.ClientClusterContext != "" { // user has specified cluster context to use
         if context, exists := rawConfig.Contexts[o.ClientClusterContext]; exists {
             configErr := fmt.Errorf("Specified '%s' as cluster-admin context, but it was not found in your client configuration.", o.ClientClusterContext)
-            o.Logger.Error("discClustCtx", configErr.Error())
+            o.Logger.Error("CED1003", configErr.Error())
             return nil, nil, false, configErr
         } else if os, kube, found, err := o.makeClusterClients(rawConfig, o.ClientClusterContext, context); found {
             return os, kube, true, err
@@ -75,7 +76,7 @@ func (o DiagnosticsOptions) findClusterClients(rawConfig *clientcmdapi.Config) (
     currentContext, exists := rawConfig.Contexts[rawConfig.CurrentContext]
     if !exists { // config specified cluster admin context that doesn't exist; complain and quit
         configErr := fmt.Errorf("Current context '%s' not found in client configuration; will not attempt cluster diagnostics.", rawConfig.CurrentContext)
-        o.Logger.Errorf("discClustCtx", configErr.Error())
+        o.Logger.Errorf("CED1004", configErr.Error())
         return nil, nil, false, configErr
     }
     // check if current context is already cluster admin
@@ -100,24 +101,29 @@ func (o DiagnosticsOptions) makeClusterClients(rawConfig *clientcmdapi.Config, c
     overrides := &clientcmd.ConfigOverrides{Context: *context}
     clientConfig := clientcmd.NewDefaultClientConfig(*rawConfig, overrides)
     factory := osclientcmd.NewFactory(clientConfig)
-    o.Logger.Debugf("discClustCtxStart", "Checking if context is cluster-admin: '%s'", contextName)
+    o.Logger.Debugf("CED1005", "Checking if context is cluster-admin: '%s'", contextName)
     if osClient, kubeClient, err := factory.Clients(); err != nil {
-        o.Logger.Debugf("discClustCtx", "Error creating client for context '%s':\n%v", contextName, err)
+        o.Logger.Debugf("CED1006", "Error creating client for context '%s':\n%v", contextName, err)
         return nil, nil, false, nil
     } else {
-        subjectAccessReview := authorizationapi.SubjectAccessReview{
-            // we assume if you can list nodes, you're the cluster admin.
-            Verb:     "list",
-            Resource: "nodes",
-        }
-        if resp, err := osClient.SubjectAccessReviews("default").Create(&subjectAccessReview); err != nil {
-            o.Logger.Errorf("discClustCtx", "Error testing cluster-admin access for context '%s':\n%v", contextName, err)
-            return nil, nil, false, err
+        subjectAccessReview := authorizationapi.SubjectAccessReview{Action: authorizationapi.AuthorizationAttributes{
+            // if you can do everything, you're the cluster admin.
+            Verb:     "*",
+            Resource: "*",
+        }}
+        if resp, err := osClient.SubjectAccessReviews().Create(&subjectAccessReview); err != nil {
+            if regexp.MustCompile(`User "[\w:]+" cannot create \w+ at the cluster scope`).MatchString(err.Error()) {
+                o.Logger.Debugf("CED1007", "Context '%s' does not have cluster-admin access:\n%v", contextName, err)
+                return nil, nil, false, nil
+            } else {
+                o.Logger.Errorf("CED1008", "Unknown error testing cluster-admin access for context '%s':\n%v", contextName, err)
+                return nil, nil, false, err
+            }
         } else if resp.Allowed {
-            o.Logger.Infof("discClustCtxFound", "Using context for cluster-admin access: '%s'", contextName)
+            o.Logger.Infof("CED1009", "Using context for cluster-admin access: '%s'", contextName)
             return osClient, kubeClient, true, nil
         }
     }
-    o.Logger.Debugf("discClustCtx", "Context does not have cluster-admin access: '%s'", contextName)
+    o.Logger.Debugf("CED1010", "Context does not have cluster-admin access: '%s'", contextName)
     return nil, nil, false, nil
 }
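
The cluster-admin probe above now issues a cluster-scoped SubjectAccessReview for verb "*" on resource "*" and treats a "cannot ... at the cluster scope" response as an ordinary "not admin" outcome rather than a failure. A self-contained sketch of that error classification; the sample error strings are illustrative, not captured server output:

    package main

    import (
        "fmt"
        "regexp"
    )

    // Same pattern as in makeClusterClients: a "cannot create ... at the cluster
    // scope" error just means this context lacks cluster-admin access.
    var notClusterAdmin = regexp.MustCompile(`User "[\w:]+" cannot create \w+ at the cluster scope`)

    func main() {
        samples := []string{ // illustrative only
            `User "system:anonymous" cannot create subjectaccessreviews at the cluster scope`,
            `dial tcp 10.0.0.1:8443: i/o timeout`,
        }
        for _, msg := range samples {
            if notClusterAdmin.MatchString(msg) {
                fmt.Println("debug: context does not have cluster-admin access")
            } else {
                fmt.Println("error: unknown failure while testing cluster-admin access")
            }
        }
    }
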
@@ -12,7 +12,7 @@ import (
 // determine if we even have a client config
 func (o DiagnosticsOptions) detectClientConfig() (bool, []types.DiagnosticError, []types.DiagnosticError) {
     diagnostic := &clientdiagnostics.ConfigLoading{ConfFlagName: config.OpenShiftConfigFlagName, ClientFlags: o.ClientFlags}
-    o.Logger.Noticet("diagRun", "Determining if client configuration exists for client/cluster diagnostics",
+    o.Logger.Noticet("CED2011", "Determining if client configuration exists for client/cluster diagnostics",
         log.Hash{"area": "client", "name": diagnostic.Name(), "diag": diagnostic.Description()})
     result := diagnostic.Check()
     for _, entry := range result.Logs() {
@@ -133,12 +133,12 @@ func (o DiagnosticsOptions) RunDiagnostics() (bool, error, int, int) {
     if len(o.RequestedDiagnostics) == 0 {
         o.RequestedDiagnostics = AvailableDiagnostics.List()
     } else if common := intersection(util.NewStringSet(o.RequestedDiagnostics...), AvailableDiagnostics); len(common) == 0 {
-        o.Logger.Errort("emptyReqDiag", "None of the requested diagnostics are available:\n {{.requested}}\nPlease try from the following:\n {{.available}}",
+        o.Logger.Errort("CED3012", "None of the requested diagnostics are available:\n {{.requested}}\nPlease try from the following:\n {{.available}}",
             log.Hash{"requested": o.RequestedDiagnostics, "available": AvailableDiagnostics.List()})
         return false, fmt.Errorf("No requested diagnostics available"), 0, 1
     } else if len(common) < len(o.RequestedDiagnostics) {
         errors = append(errors, fmt.Errorf("Not all requested diagnostics are available"))
-        o.Logger.Errort("notAllReqDiag", `
+        o.Logger.Errort("CED3013", `
 Of the requested diagnostics:
 {{.requested}}
 only these are available:
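
The requested-vs-available check above relies on a set intersection over `util.StringSet`. The `intersection` helper itself is not shown in this diff; the sketch below, using plain slices, only illustrates the semantics assumed here:

    package main

    import (
        "fmt"
        "sort"
    )

    // intersect mirrors what intersection(requested, available) is assumed to
    // do with util.StringSet values: keep only the requested diagnostic names
    // that are actually available.
    func intersect(requested, available []string) []string {
        avail := map[string]bool{}
        for _, name := range available {
            avail[name] = true
        }
        common := []string{}
        for _, name := range requested {
            if avail[name] {
                common = append(common, name)
            }
        }
        sort.Strings(common)
        return common
    }

    func main() {
        fmt.Println(intersect(
            []string{"ConfigContexts", "NoSuchDiagnostic"},
            []string{"ConfigContexts", "ClusterRegistry", "ClusterRouter"},
        )) // [ConfigContexts]: fewer than requested, so the warning path is taken
    }
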
@@ -163,13 +163,13 @@ The list of all possible is:
         errors = append(errors, err)
     }
     if !detected { // there just plain isn't any client config file available
-        o.Logger.Notice("discNoClientConf", "No client configuration specified; skipping client and cluster diagnostics.")
+        o.Logger.Notice("CED3014", "No client configuration specified; skipping client and cluster diagnostics.")
     } else if rawConfig, err := o.buildRawConfig(); rawConfig == nil { // client config is totally broken - won't parse etc (problems may have been detected and logged)
-        o.Logger.Errorf("discBrokenClientConf", "Client configuration failed to load; skipping client and cluster diagnostics due to error: {{.error}}", log.Hash{"error": err.Error()})
+        o.Logger.Errorf("CED3015", "Client configuration failed to load; skipping client and cluster diagnostics due to error: {{.error}}", log.Hash{"error": err.Error()})
         errors = append(errors, err)
     } else {
         if err != nil { // error encountered, proceed with caution
-            o.Logger.Errorf("discClientConfErr", "Client configuration loading encountered an error, but proceeding anyway. Error was:\n{{.error}}", log.Hash{"error": err.Error()})
+            o.Logger.Errorf("CED3016", "Client configuration loading encountered an error, but proceeding anyway. Error was:\n{{.error}}", log.Hash{"error": err.Error()})
             errors = append(errors, err)
         }
         clientDiags, ok, err := o.buildClientDiagnostics(rawConfig)
@@ -220,7 +220,7 @@ func (o DiagnosticsOptions) Run(diagnostics []types.Diagnostic) (bool, error, in
     defer func() {
         if r := recover(); r != nil {
             errorCount += 1
-            o.Logger.Errort("diagPanic",
+            o.Logger.Errort("CED3017",
                 "While running the {{.name}} diagnostic, a panic was encountered.\nThis is a bug in diagnostics. Stack trace follows : \n{{.error}}",
                 log.Hash{"name": diagnostic.Name(), "error": fmt.Sprintf("%v", r)})
         }
@@ -228,16 +228,16 @@ func (o DiagnosticsOptions) Run(diagnostics []types.Diagnostic) (bool, error, in
     if canRun, reason := diagnostic.CanRun(); !canRun {
         if reason == nil {
-            o.Logger.Noticet("diagSkip", "Skipping diagnostic: {{.name}}\nDescription: {{.diag}}",
+            o.Logger.Noticet("CED3018", "Skipping diagnostic: {{.name}}\nDescription: {{.diag}}",
                 log.Hash{"name": diagnostic.Name(), "diag": diagnostic.Description()})
         } else {
-            o.Logger.Noticet("diagSkip", "Skipping diagnostic: {{.name}}\nDescription: {{.diag}}\nBecause: {{.reason}}",
+            o.Logger.Noticet("CED3019", "Skipping diagnostic: {{.name}}\nDescription: {{.diag}}\nBecause: {{.reason}}",
                 log.Hash{"name": diagnostic.Name(), "diag": diagnostic.Description(), "reason": reason.Error()})
         }
         return
     }

-    o.Logger.Noticet("diagRun", "Running diagnostic: {{.name}}\nDescription: {{.diag}}",
+    o.Logger.Noticet("CED3020", "Running diagnostic: {{.name}}\nDescription: {{.diag}}",
         log.Hash{"name": diagnostic.Name(), "diag": diagnostic.Description()})
     r := diagnostic.Check()
     for _, entry := range r.Logs() {
@@ -26,6 +26,19 @@ Diagnostics looks for config files in standard locations. If not found,
 related diagnostics are just skipped. Non-standard locations can be
 specified with flags.

+Standard config file locations are:
+
+* Client:
+    * as indicated by --config flag
+    * as indicated by $KUBECONFIG env var
+    * ~/.kube/config file
+* Master:
+    * as indicated by --master-config flag
+    * /etc/openshift/master/master-config.yaml
+* Node:
+    * as indicated by --node-config flag
+    * /etc/openshift/node/node-config.yaml
+
 Host environment
 ================
@@ -36,14 +49,14 @@ logic. This provides two major benefits:
 * master/node configuration is based on a configuration file in a standard location
 * all components log to journald

-Having configuration files in standard locations means you will generally
+Having configuration files where Ansible places them means you will generally
 not even need to specify where to find them. Running:

     openshift ex diagnostics

 by itself will look for master and node configs (in addition to client
 config file) in the standard locations and use them if found; so this
-should make the Enterprise use case as simple as possible. It's also
+should make the Ansible-installed use case as simple as possible. It's also
 very easy to use configuration files when they are not in the expected
 Enterprise locations:
@@ -53,7 +66,7 @@ Having logs in journald is necessary for the current log analysis
 logic. Other usage may have logs going into files, output to stdout,
 combined node/master... it may not be too hard to extend analysis to
 other log sources but the priority has been to look at journald logs
-as created by components in Enterprise deployments (including docker,
+as created by components in systemd-based deployments (including docker,
 openvswitch, etc.).

 Client environment
@@ -97,17 +110,18 @@ paths or flooding them with non-issues that obscure real problems.
 * Warnings indicate issues that may be a problem but could be valid for
   some configurations / situations, for example a node being disabled.

-Enabling automation
-===================
+**Message IDs**
+
+All messages should have a unique, unchanging, otherwise-meaningless
+message ID to facilitate the user grepping for specific errors/warnings
+without having to depend on text that may change. Although nothing yet
+depends on them being unique, the message ID scheme attempts to ensure
+they are. That scheme is:

-Diagnostic messages are designed to be logged either for human consumption
-("text" format) or for scripting/automation ("yaml" or "json" formats). So
-messages should:
+    Initials of package + index of file in package + index of message in file

-* Have an ID that is unique and unchanging, such that automated alerts
-  could filter on specific IDs rather than rely on message text or level.
-* Log any data that might be relevant in an automated alert as
-  template data; for example, when a node is down, include the name of
-  the node so that automation could decide how important it is.
-* Not put anything in message template data that cannot be serialized.
+E.g. "DClu1001" is in package diagnostics/cluster (which needed to be
+differentiated from diagnostics/client), the first file indexed, and
+the first message in the file. This format is not important; it's just
+a convenience to help keep IDs unique. But don't change existing IDs.
@@ -24,17 +24,8 @@ type ConfigContext struct {
 }

 const (
-    ConfigContextsName = "ConfigContexts"
-    currentContextMissing = `Your client config specifies a current context of '{{.context}}'
-which is not defined; it is likely that a mistake was introduced while
-manually editing your config. If this is a simple typo, you may be
-able to fix it manually.
-The master creates a fresh client config when it is started; it may be
-useful to use this as a base if available.`
+    ConfigContextsName = "ConfigContexts"

-    currentContextSummary = `The current context from client config is '{{.context}}'
-This will be used by default to contact the master API.
-`
     contextDesc = `
 For client config context '{{.context}}':
 The server URL is '{{.server}}'
@@ -93,12 +84,12 @@ fails in this case.
 However, the most likely explanation is that the server certificate
 needs to be updated to include the name you are using to reach it.

-If the master API server is generating its own certificates (which is
-default), then specify the public master address in the master-config.yaml
-or with the --public-master flag is usually the easiest way to do
-this. If you need something more complicated (for instance, multiple
-public addresses for the API, or your own CA), then you will need to
-custom-generate the server certificate with the right names yourself.
+If the master API server is generating its own certificates (which
+is the default), then specifying the public master address in the
+master-config.yaml or with the --public-master flag is usually the easiest
+way to do this. If you need something more complicated (for instance,
+multiple public addresses for the API, or your own CA), then you will need
+to custom-generate the server certificate with the right names yourself.

 If you are unconcerned about any of this, you can add the
 --insecure-skip-tls-verify flag to bypass secure (TLS) verification,
@@ -121,7 +112,7 @@ we could not reach the host at all.
 * You may have specified the wrong host address.
 * This could mean the host is completely unavailable (down).
 * This could indicate a routing problem or a firewall that simply
-  drops requests rather than responding by reseting the connection.
+  drops requests rather than responding by resetting the connection.
 * It does not generally mean that DNS name resolution failed (which
   would be a different error) though the problem could be that it
   gave the wrong address.`
@@ -155,9 +146,9 @@ key/certificate or an access token. Your kubeconfig may not have
 presented any, or they may be invalid.`
     clientUnauthz = `
 This means that when we tried to make a request to the master API
-server, the request required credentials that were not presented.
-This can happen when an authentication token expires. Try logging in
-with this user again.`
+server, the request required credentials that were not presented. This
+can happen with an expired or invalid authentication token. Try logging
+in with this user again.`
 )

 var (
@@ -191,10 +182,10 @@ func (d ConfigContext) Check() types.DiagnosticResult {
     isDefaultContext := d.RawConfig.CurrentContext == d.ContextName

     // prepare bad news message
-    errorKey := "clientCfgError"
+    errorKey := "DCli0001"
     unusableLine := fmt.Sprintf("The client config context '%s' is unusable", d.ContextName)
     if isDefaultContext {
-        errorKey = "currentccError"
+        errorKey = "DCli0002"
         unusableLine = fmt.Sprintf("The current client config context '%s' is unusable", d.ContextName)
     }
@@ -212,7 +203,7 @@ func (d ConfigContext) Check() types.DiagnosticResult {
     }
     authName := context.AuthInfo
     if _, exists := d.RawConfig.AuthInfos[authName]; !exists {
-        r.Errorf(errorKey, nil, "%s:\n Client config context '%s' has a user identity '%s' which is not defined.", unusableLine, d.ContextName, authName)
+        r.Errorf(errorKey, nil, "%s:\n Client config context '%s' has a user '%s' which is not defined.", unusableLine, d.ContextName, authName)
         return r
     }
@@ -230,7 +221,7 @@ func (d ConfigContext) Check() types.DiagnosticResult {
     // Actually send a request to see if context has connectivity.
     // Note: we cannot reuse factories as they cache the clients, so build new factory for each context.
     osClient, _, err := osclientcmd.NewFactory(kclientcmd.NewDefaultClientConfig(*d.RawConfig, &kclientcmd.ConfigOverrides{Context: *context})).Clients()
-    // client create now fails if cannot connect to server, so address connectivity errors below
+    // client create now *fails* if cannot connect to server; so, address connectivity errors below
     if err == nil {
         if projects, projerr := osClient.Projects().List(labels.Everything(), fields.Everything()); projerr != nil {
             err = projerr
@@ -245,9 +236,9 @@ func (d ConfigContext) Check() types.DiagnosticResult {
     }
     msgData["projects"] = list
     if len(list) == 0 {
-        r.Infot("CCctxSuccess", msgText+"Successfully requested project list, but it is empty, so user has no access to anything.", msgData)
+        r.Infot("DCli0003", msgText+"Successfully requested project list, but it is empty, so user has no access to anything.", msgData)
     } else {
-        r.Infot("CCctxSuccess", msgText+"Successfully requested project list; has access to project(s):\n {{.projects}}", msgData)
+        r.Infot("DCli0004", msgText+"Successfully requested project list; has access to project(s):\n {{.projects}}", msgData)
     }
     return r
 }
@@ -260,29 +251,29 @@ func (d ConfigContext) Check() types.DiagnosticResult {
     var reason, errId string
     switch {
     case regexp.MustCompile("dial tcp: lookup (\\S+): no such host").MatchString(errMsg):
-        errId, reason = "clientNoResolve", clientNoResolve
+        errId, reason = "DCli0005", clientNoResolve
     case strings.Contains(errMsg, "x509: certificate signed by unknown authority"):
-        errId, reason = "clientUnknownCa", clientUnknownCa
+        errId, reason = "DCli0006", clientUnknownCa
     case strings.Contains(errMsg, "specifying a root certificates file with the insecure flag is not allowed"):
-        errId, reason = "clientUnneededCa", clientUnneededCa
+        errId, reason = "DCli0007", clientUnneededCa
     case invalidCertNameRx.MatchString(errMsg):
         match := invalidCertNameRx.FindStringSubmatch(errMsg)
         serverHost := match[len(match)-1]
-        errId, reason = "clientInvCertName", fmt.Sprintf(clientInvCertName, serverHost)
+        errId, reason = "DCli0008", fmt.Sprintf(clientInvCertName, serverHost)
     case regexp.MustCompile("dial tcp (\\S+): connection refused").MatchString(errMsg):
-        errId, reason = "clientConnRefused", clientConnRefused
+        errId, reason = "DCli0009", clientConnRefused
     case regexp.MustCompile("dial tcp (\\S+): (?:connection timed out|i/o timeout|no route to host)").MatchString(errMsg):
-        errId, reason = "clientConnTimeout", clientConnTimeout
+        errId, reason = "DCli0010", clientConnTimeout
     case strings.Contains(errMsg, "malformed HTTP response"):
-        errId, reason = "clientMalformedHTTP", clientMalformedHTTP
+        errId, reason = "DCli0011", clientMalformedHTTP
     case strings.Contains(errMsg, "tls: oversized record received with length"):
-        errId, reason = "clientMalformedTLS", clientMalformedTLS
-    case regexp.MustCompile(`403 Forbidden: Forbidden: "/osapi/v\w+/projects?namespace=" denied by default`).MatchString(errMsg):
-        errId, reason = "clientUnauthn", clientUnauthn
-    case regexp.MustCompile("401 Unauthorized: Unauthorized$").MatchString(errMsg):
-        errId, reason = "clientUnauthz", clientUnauthz
+        errId, reason = "DCli0012", clientMalformedTLS
+    case strings.Contains(errMsg, `User "system:anonymous" cannot`):
+        errId, reason = "DCli0013", clientUnauthn
+    case strings.Contains(errMsg, "provide credentials"):
+        errId, reason = "DCli0014", clientUnauthz
     default:
-        errId, reason = "clientUnknownConnErr", `Diagnostics does not have an explanation for what this means. Please report this error so one can be added.`
+        errId, reason = "DCli0015", `Diagnostics does not have an explanation for what this means. Please report this error so one can be added.`
     }
     r.Errort(errId, err, msgText+"{{.errMsg}}\n"+reason, msgData)
     return r
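
The retired authentication/authorization cases keyed on literal "401 Unauthorized" and "403 Forbidden ... denied by default" response text; the replacement cases match substrings that current servers actually return. A self-contained sketch of just those two new cases (the sample error strings are illustrative, not captured server output):

    package main

    import (
        "fmt"
        "strings"
    )

    // classify mirrors the new DCli0013/DCli0014 cases in the switch above.
    func classify(errMsg string) string {
        switch {
        case strings.Contains(errMsg, `User "system:anonymous" cannot`):
            return "DCli0013 (clientUnauthn: request was treated as anonymous)"
        case strings.Contains(errMsg, "provide credentials"):
            return "DCli0014 (clientUnauthz: credentials required or rejected)"
        }
        return "DCli0015 (unrecognized connection error)"
    }

    func main() {
        fmt.Println(classify(`User "system:anonymous" cannot list projects at the cluster scope`)) // illustrative
        fmt.Println(classify(`you must provide credentials to perform this operation`))            // illustrative
        fmt.Println(classify(`dial tcp 10.0.0.1:8443: connection refused`))                        // illustrative
    }
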
@@ -65,7 +65,7 @@ func (d *ConfigLoading) Check() types.DiagnosticResult {
     if foundPath != "" {
         if confFlagValue != "" && confFlagValue != foundPath {
             // found config but not where --config said
-            r.Errorf("discCCnotFlag", nil, `
+            r.Errorf("DCli1001", nil, `
 The client configuration file was not found where the --%s flag indicated:
 %s
 A config file was found at the following location:
@@ -76,7 +76,7 @@ with the --%[1]s flag, or just not specify the flag.
         }
     } else { // not found, check for master-generated ones to recommend
         if confFlagValue != "" {
-            r.Errorf("discCCnotFlag", nil, "Did not find config file where --%s=%s indicated", d.ConfFlagName, confFlagValue)
+            r.Errorf("DCli1002", nil, "Did not find config file where --%s=%s indicated", d.ConfFlagName, confFlagValue)
         }
         adminWarningF := `
 No client config file was available; however, one exists at
@@ -98,7 +98,7 @@ location for use by the client and diagnostics.
         for _, path := range adminPaths {
             msg := fmt.Sprintf("Looking for a possible client config at %s\n", path)
             if d.canOpenConfigFile(path, msg, r) {
-                r.Warnf("discCCautoPath", nil, adminWarningF, config.OpenShiftConfigPathEnvVar, path, config.RecommendedHomeFile)
+                r.Warnf("DCli1003", nil, adminWarningF, config.OpenShiftConfigPathEnvVar, path, config.RecommendedHomeFile)
                 break
             }
         }
@@ -115,28 +115,28 @@ func (d ConfigLoading) canOpenConfigFile(path string, errmsg string, r types.Dia
     if path == "" { // empty param/envvar
         return false
     } else if file, err = os.Open(path); err == nil {
-        r.Debugt("discOpenCC", "Reading client config at {{.path}}", log.Hash{"path": path})
+        r.Debugt("DCli1004", "Reading client config at {{.path}}", log.Hash{"path": path})
     } else if errmsg == "" {
-        r.Debugf("discOpenCCNo", "Could not read client config at %s:\n%#v", path, err)
+        r.Debugf("DCli1005", "Could not read client config at %s:\n%#v", path, err)
     } else if os.IsNotExist(err) {
-        r.Debug("discOpenCCNoExist", errmsg+"but that file does not exist.")
+        r.Debug("DCli1006", errmsg+"but that file does not exist.")
     } else if os.IsPermission(err) {
-        r.Error("discOpenCCNoPerm", err, errmsg+"but lack permission to read that file.")
+        r.Error("DCli1007", err, errmsg+"but lack permission to read that file.")
     } else {
-        r.Errorf("discOpenCCErr", err, "%sbut there was an error opening it:\n%#v", errmsg, err)
+        r.Errorf("DCli1008", err, "%sbut there was an error opening it:\n%#v", errmsg, err)
     }
     if file != nil { // it is open for reading
         defer file.Close()
         if buffer, err := ioutil.ReadAll(file); err != nil {
-            r.Errorf("discCCReadErr", err, "Unexpected error while reading client config file (%s): %v", path, err)
+            r.Errorf("DCli1009", err, "Unexpected error while reading client config file (%s): %v", path, err)
         } else if _, err := clientcmd.Load(buffer); err != nil {
-            r.Errorf("discCCYamlErr", err, `
+            r.Errorf("DCli1010", err, `
 Error reading YAML from client config file (%s):
 %v
 This file may have been truncated or mis-edited.
 Please fix, remove, or obtain a new client config`, file.Name(), err)
         } else {
-            r.Infof("discCCRead", "Successfully read a client config file at '%s'", path)
+            r.Infof("DCli1011", "Successfully read a client config file at '%s'", path)
             /* Note, we're not going to use this config file directly.
              * Instead, we'll defer to the openshift client code to assimilate
              * flags, env vars, and the potential hierarchy of config files
@@ -67,7 +67,7 @@ func (d *NodeDefinitions) CanRun() (bool, error) {
     if d.KubeClient == nil || d.OsClient == nil {
         return false, errors.New("must have kube and os client")
     }
-    can, err := adminCan(d.OsClient, kapi.NamespaceDefault, &authorizationapi.SubjectAccessReview{
+    can, err := adminCan(d.OsClient, authorizationapi.AuthorizationAttributes{
        Verb:     "list",
        Resource: "nodes",
     })
@@ -75,7 +75,7 @@ func (d *NodeDefinitions) CanRun() (bool, error) {
         msg := log.Message{ID: "clGetNodesFailed", EvaluatedText: fmt.Sprintf(clientErrorGettingNodes, err)}
         return false, types.DiagnosticError{msg.ID, &msg, err}
     } else if !can {
-        msg := log.Message{ID: "clGetNodesFailed", EvaluatedText: "Client does not have cluster-admin access and cannot see node records"}
+        msg := log.Message{ID: "clGetNodesFailed", EvaluatedText: "Client does not have access to see node status"}
         return false, types.DiagnosticError{msg.ID, &msg, err}
     }
     return true, nil
@@ -86,7 +86,7 @@ func (d *NodeDefinitions) Check() types.DiagnosticResult {
     nodes, err := d.KubeClient.Nodes().List(labels.LabelSelector{}, fields.Everything())
     if err != nil {
-        r.Errorf("clGetNodesFailed", err, clientErrorGettingNodes, err)
+        r.Errorf("DClu0001", err, clientErrorGettingNodes, err)
         return r
     }
@@ -110,15 +110,15 @@ func (d *NodeDefinitions) Check() types.DiagnosticResult {
             templateData["status"] = ready.Status
             templateData["reason"] = ready.Reason
         }
-        r.Warnt("clNodeNotReady", nil, nodeNotReady, templateData)
+        r.Warnt("DClu0002", nil, nodeNotReady, templateData)
     } else if node.Spec.Unschedulable {
-        r.Warnt("clNodeNotSched", nil, nodeNotSched, log.Hash{"node": node.Name})
+        r.Warnt("DClu0003", nil, nodeNotSched, log.Hash{"node": node.Name})
     } else {
         anyNodesAvail = true
     }
 }
 if !anyNodesAvail {
-    r.Error("clNoAvailNodes", nil, "There were no nodes available to use. No new pods can be scheduled.")
+    r.Error("DClu0004", nil, "There were no nodes available to use. No new pods can be scheduled.")
 }

 return r
@@ -4,7 +4,6 @@ import (
     "bufio"
     "fmt"
     "reflect"
-    "regexp"
     "strings"

     kapi "k8s.io/kubernetes/pkg/api"
@@ -80,12 +79,20 @@ succeeding but not triggering deployments (as they wait on notifications
 to the ImageStream from the build).

 There are many reasons for this step to fail, including invalid
-credentials, DNS failures, network errors, and so on. Examine the
-following error message from the registry pod logs to determine the
-problem:
+credentials, master outages, DNS failures, network errors, and so on. It
+can be temporary or ongoing. Check the most recent error message from the
+registry pod logs to determine the nature of the problem:

 {{.log}}`

+    clRegPodErr = `
+The pod logs for the "{{.podName}}" pod belonging to
+the "{{.registryName}}" service indicated unknown errors.
+This could result in problems with builds or deployments.
+Please examine the log entries to determine if there might be
+any related problems:
+{{.log}}`
+
     clRegNoEP = `
 The "{{.registryName}}" service exists with {{.numPods}} associated pod(s), but there
 are {{.numEP}} endpoints in the "{{.registryName}}" service.
@@ -134,7 +141,8 @@ func (d *ClusterRegistry) CanRun() (bool, error) {
     if d.OsClient == nil || d.KubeClient == nil {
         return false, fmt.Errorf("must have kube and os clients")
     }
-    return adminCan(d.OsClient, kapi.NamespaceDefault, &authorizationapi.SubjectAccessReview{
+    return adminCan(d.OsClient, authorizationapi.AuthorizationAttributes{
+        Namespace:    kapi.NamespaceDefault,
         Verb:         "get",
         Resource:     "services",
         ResourceName: registryName,
@@ -146,7 +154,7 @@ func (d *ClusterRegistry) Check() types.DiagnosticResult {
     if service := d.getRegistryService(r); service != nil {
         // Check that it actually has pod(s) selected and running
         if runningPods := d.getRegistryPods(service, r); len(runningPods) == 0 {
-            r.Errorf("clRegNoRunningPods ", nil, clRegNoRunningPods, registryName)
+            r.Errorf("DClu1001", nil, clRegNoRunningPods, registryName)
             return r
         } else if d.checkRegistryEndpoints(runningPods, r) { // Check that matching endpoint exists on the service
             // attempt to create an imagestream and see if it gets the same registry service IP from the service cache
@@ -159,13 +167,13 @@ func (d *ClusterRegistry) Check() types.DiagnosticResult {
 func (d *ClusterRegistry) getRegistryService(r types.DiagnosticResult) *kapi.Service {
     service, err := d.KubeClient.Services(kapi.NamespaceDefault).Get(registryName)
     if err != nil && reflect.TypeOf(err) == reflect.TypeOf(&kerrs.StatusError{}) {
-        r.Warnf("clGetRegNone", err, clGetRegNone, registryName, kapi.NamespaceDefault)
+        r.Warnf("DClu1002", err, clGetRegNone, registryName, kapi.NamespaceDefault)
         return nil
     } else if err != nil {
-        r.Errorf("clGetRegFailed", err, clGetRegFailed, err)
+        r.Errorf("DClu1003", err, clGetRegFailed, err)
         return nil
     }
-    r.Debugf("clRegFound", "Found %s service with ports %v", registryName, service.Spec.Ports)
+    r.Debugf("DClu1004", "Found %s service with ports %v", registryName, service.Spec.Ports)
     return service
 }
@@ -173,24 +181,24 @@ func (d *ClusterRegistry) getRegistryPods(service *kapi.Service, r types.Diagnos
     runningPods := []*kapi.Pod{}
     pods, err := d.KubeClient.Pods(kapi.NamespaceDefault).List(labels.SelectorFromSet(service.Spec.Selector), fields.Everything())
     if err != nil {
-        r.Errorf("clRegListPods", err, "Finding pods for '%s' service failed. This should never happen. Error: (%T) %[2]v", registryName, err)
+        r.Errorf("DClu1005", err, "Finding pods for '%s' service failed. This should never happen. Error: (%T) %[2]v", registryName, err)
         return runningPods
     } else if len(pods.Items) < 1 {
-        r.Errorf("clRegNoPods", nil, clRegNoPods, registryName)
+        r.Errorf("DClu1006", nil, clRegNoPods, registryName)
         return runningPods
     } else if len(pods.Items) > 1 {
         // multiple registry pods using EmptyDir will be inconsistent
         for _, volume := range pods.Items[0].Spec.Volumes {
             if volume.Name == registryVolume && volume.EmptyDir != nil {
-                r.Errorf("clRegMultiPods", nil, clRegMultiPods, registryName)
+                r.Errorf("DClu1007", nil, clRegMultiPods, registryName)
                 break
             }
         }
     }
     for _, pod := range pods.Items {
-        r.Debugf("clRegPodFound", "Found %s pod with name %s", registryName, pod.ObjectMeta.Name)
+        r.Debugf("DClu1008", "Found %s pod with name %s", registryName, pod.ObjectMeta.Name)
         if pod.Status.Phase != kapi.PodRunning {
-            r.Warnf("clRegPodDown", nil, clRegPodDown, pod.ObjectMeta.Name, registryName)
+            r.Warnf("DClu1009", nil, clRegPodDown, pod.ObjectMeta.Name, registryName)
         } else {
             runningPods = append(runningPods, &pod)
             // Check the logs for that pod for common issues (credentials, DNS resolution failure)
@@ -209,7 +217,7 @@ func (d *ClusterRegistry) checkRegistryLogs(pod *kapi.Pod, r types.DiagnosticRes
         Param("container", pod.Spec.Containers[0].Name).
         Stream()
     if err != nil {
-        r.Warnt("clRegPodLog", nil, clRegPodLog, log.Hash{
+        r.Warnt("DClu1010", nil, clRegPodLog, log.Hash{
             "error":        fmt.Sprintf("(%T) %[1]v", err),
             "podName":      pod.ObjectMeta.Name,
             "registryName": registryName,
@@ -218,24 +226,40 @@ func (d *ClusterRegistry) checkRegistryLogs(pod *kapi.Pod, r types.DiagnosticRes
     }
     defer readCloser.Close()

+    clientError := ""
+    registryError := ""
     scanner := bufio.NewScanner(readCloser)
     for scanner.Scan() {
         logLine := scanner.Text()
-        if regexp.MustCompile(`level=error msg="client error: Post http(\S+)/subjectaccessreviews`).MatchString(logLine) {
-            r.Errort("clRegPodConn", nil, clRegPodConn, log.Hash{
-                "log":          logLine,
-                "podName":      pod.ObjectMeta.Name,
-                "registryName": registryName,
-            })
-            break
+        // TODO: once the logging API gets "since" and "tail" and "limit", limit to more recent log entries
+        // https://github.com/kubernetes/kubernetes/issues/12447
+        if strings.Contains(logLine, `level=error msg="client error:`) {
+            clientError = logLine // end up showing only the most recent client error
+        } else if strings.Contains(logLine, "level=error msg=") {
+            registryError += "\n" + logLine // gather generic errors
         }
     }
+    if clientError != "" {
+        r.Errort("DClu1011", nil, clRegPodConn, log.Hash{
+            "log":          clientError,
+            "podName":      pod.ObjectMeta.Name,
+            "registryName": registryName,
+        })
+    }
+    if registryError != "" {
+        r.Warnt("DClu1012", nil, clRegPodErr, log.Hash{
+            "log":          registryError,
+            "podName":      pod.ObjectMeta.Name,
+            "registryName": registryName,
+        })
+    }
+
 }

 func (d *ClusterRegistry) checkRegistryEndpoints(pods []*kapi.Pod, r types.DiagnosticResult) bool {
     endPoint, err := d.KubeClient.Endpoints(kapi.NamespaceDefault).Get(registryName)
     if err != nil {
-        r.Errorf("clRegGetEP", err, `Finding endpoints for "%s" service failed. This should never happen. Error: (%[2]T) %[2]v`, registryName, err)
+        r.Errorf("DClu1013", err, `Finding endpoints for "%s" service failed. This should never happen. Error: (%[2]T) %[2]v`, registryName, err)
         return false
     }
     numEP := 0
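
The rewritten log scan above keeps only the most recent "client error" line and accumulates every other registry error line instead of stopping at the first match. A self-contained sketch of that classification over a few made-up lines (the samples are illustrative, not real docker-registry output):

    package main

    import (
        "fmt"
        "strings"
    )

    func main() {
        // Illustrative registry log lines; real registry output will differ.
        logLines := []string{
            `time="..." level=error msg="client error: Post https://master/subjectaccessreviews: EOF"`,
            `time="..." level=error msg="response completed with error"`,
            `time="..." level=error msg="client error: Post https://master/subjectaccessreviews: dial tcp: i/o timeout"`,
        }

        clientError := ""   // only the most recent client error is kept (DClu1011)
        registryError := "" // all other error lines are accumulated (DClu1012)
        for _, logLine := range logLines {
            if strings.Contains(logLine, `level=error msg="client error:`) {
                clientError = logLine
            } else if strings.Contains(logLine, "level=error msg=") {
                registryError += "\n" + logLine
            }
        }
        fmt.Println("client error:", clientError)
        fmt.Println("other errors:", registryError)
    }
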
@@ -243,7 +267,7 @@ func (d *ClusterRegistry) checkRegistryEndpoints(pods []*kapi.Pod, r types.Diagn
         numEP += len(subs.Addresses)
     }
     if numEP != len(pods) {
-        r.Warnt("clRegNoEP", nil, clRegNoEP, log.Hash{"registryName": registryName, "numPods": len(pods), "numEP": numEP})
+        r.Warnt("DClu1014", nil, clRegNoEP, log.Hash{"registryName": registryName, "numPods": len(pods), "numEP": numEP})
         return false
     }
     return true
@@ -252,12 +276,12 @@ func (d *ClusterRegistry) checkRegistryEndpoints(pods []*kapi.Pod, r types.Diagn
 func (d *ClusterRegistry) verifyRegistryImageStream(service *kapi.Service, r types.DiagnosticResult) {
     imgStream, err := d.OsClient.ImageStreams(kapi.NamespaceDefault).Create(&osapi.ImageStream{ObjectMeta: kapi.ObjectMeta{GenerateName: "diagnostic-test"}})
     if err != nil {
-        r.Errorf("clRegISCFail", err, "Creating test ImageStream failed. Error: (%T) %[1]v", err)
+        r.Errorf("DClu1015", err, "Creating test ImageStream failed. Error: (%T) %[1]v", err)
         return
     }
     defer func() { // delete what we created, or notify that we couldn't
         if err := d.OsClient.ImageStreams(kapi.NamespaceDefault).Delete(imgStream.ObjectMeta.Name); err != nil {
-            r.Warnt("clRegISDelFail", err, clRegISDelFail, log.Hash{
+            r.Warnt("DClu1016", err, clRegISDelFail, log.Hash{
                 "name":  imgStream.ObjectMeta.Name,
                 "error": fmt.Sprintf("(%T) %[1]s", err),
             })
@@ -265,14 +289,14 @@ func (d *ClusterRegistry) verifyRegistryImageStream(service *kapi.Service, r typ
     }()
     imgStream, err = d.OsClient.ImageStreams(kapi.NamespaceDefault).Get(imgStream.ObjectMeta.Name) // status is filled in post-create
     if err != nil {
-        r.Errorf("clRegISCFail", err, "Getting created test ImageStream failed. Error: (%T) %[1]v", err)
+        r.Errorf("DClu1017", err, "Getting created test ImageStream failed. Error: (%T) %[1]v", err)
         return
     }
-    r.Debugf("clRegISC", "Created test ImageStream: %[1]v", imgStream)
+    r.Debugf("DClu1018", "Created test ImageStream: %[1]v", imgStream)
     cacheHost := strings.SplitN(imgStream.Status.DockerImageRepository, "/", 2)[0]
     serviceHost := fmt.Sprintf("%s:%d", service.Spec.ClusterIP, service.Spec.Ports[0].Port)
     if cacheHost != serviceHost {
-        r.Errort("clRegISMismatch", nil, clRegISMismatch, log.Hash{
+        r.Errort("DClu1019", nil, clRegISMismatch, log.Hash{
            "serviceHost":  serviceHost,
            "cacheHost":    cacheHost,
            "registryName": registryName,
@@ -96,7 +96,8 @@ func (d *ClusterRouter) CanRun() (bool, error) {
     if d.KubeClient == nil || d.OsClient == nil {
         return false, errors.New("must have kube and os client")
     }
-    can, err := adminCan(d.OsClient, kapi.NamespaceDefault, &authorizationapi.SubjectAccessReview{
+    can, err := adminCan(d.OsClient, authorizationapi.AuthorizationAttributes{
+        Namespace:    kapi.NamespaceDefault,
         Verb:         "get",
         Resource:     "dc",
         ResourceName: routerName,
@@ -128,34 +129,34 @@ func (d *ClusterRouter) Check() types.DiagnosticResult {
 func (d *ClusterRouter) getRouterDC(r types.DiagnosticResult) *osapi.DeploymentConfig {
     dc, err := d.OsClient.DeploymentConfigs(kapi.NamespaceDefault).Get(routerName)
     if err != nil && reflect.TypeOf(err) == reflect.TypeOf(&kerrs.StatusError{}) {
-        r.Warnf("clGetRtNone", err, clGetRtNone, routerName)
+        r.Warnf("DClu2001", err, clGetRtNone, routerName)
         return nil
     } else if err != nil {
-        r.Errorf("clGetRtFailed", err, clGetRtFailed, routerName, err)
+        r.Errorf("DClu2002", err, clGetRtFailed, routerName, err)
         return nil
     }
-    r.Debugf("clRtFound", "Found default router DC")
+    r.Debugf("DClu2003", "Found default router DC")
     return dc
 }

 func (d *ClusterRouter) getRouterPods(dc *osapi.DeploymentConfig, r types.DiagnosticResult) *kapi.PodList {
     pods, err := d.KubeClient.Pods(kapi.NamespaceDefault).List(labels.SelectorFromSet(dc.Template.ControllerTemplate.Selector), fields.Everything())
     if err != nil {
-        r.Errorf("clRtListPods", err, "Finding pods for '%s' DeploymentConfig failed. This should never happen. Error: (%[2]T) %[2]v", routerName, err)
+        r.Errorf("DClu2004", err, "Finding pods for '%s' DeploymentConfig failed. This should never happen. Error: (%[2]T) %[2]v", routerName, err)
         return nil
     }
     running := []kapi.Pod{}
     for _, pod := range pods.Items {
         if pod.Status.Phase != kapi.PodRunning {
-            r.Debugf("clRtPodFound", "router pod with name %s is not running", pod.ObjectMeta.Name)
+            r.Debugf("DClu2005", "router pod with name %s is not running", pod.ObjectMeta.Name)
         } else {
             running = append(running, pod)
-            r.Debugf("clRtPodFound", "Found running router pod with name %s", pod.ObjectMeta.Name)
+            r.Debugf("DClu2006", "Found running router pod with name %s", pod.ObjectMeta.Name)
         }
     }
     pods.Items = running
     if len(running) == 0 {
-        r.Errorf("clRtNoPods", nil, clRtNoPods, routerName)
+        r.Errorf("DClu2007", nil, clRtNoPods, routerName)
         return nil
     }
     return pods
@@ -192,7 +193,7 @@ var referenceTimestampLayout = "2006-01-02T15:04:05.000000000Z"
 func (d *ClusterRouter) checkRouterLogs(pod *kapi.Pod, r types.DiagnosticResult) {
     scanner, err := d.getPodLogScanner(pod)
     if err != nil {
-        r.Warnt("clRtPodLog", err, clRtPodLog, log.Hash{
+        r.Warnt("DClu2008", err, clRtPodLog, log.Hash{
             "error":   fmt.Sprintf("(%T) %[1]v", err),
             "podName": pod.ObjectMeta.Name,
         })
@@ -207,7 +208,7 @@ func (d *ClusterRouter) checkRouterLogs(pod *kapi.Pod, r types.DiagnosticResult)
     // router checks every second. error only if failure is recent.
     // of course... we cannot always trust the local clock.
     if err == nil && time.Since(stamp).Seconds() < 30.0 {
-        r.Errort("clRtPodConn", nil, clRtPodConn, log.Hash{
+        r.Errort("DClu2009", nil, clRtPodConn, log.Hash{
            "reason":    matches[2],
            "timestamp": matches[1],
            "podName":   pod.ObjectMeta.Name,
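
The "recent failure" check above parses the timestamp carried in a router log line using the `referenceTimestampLayout` shown in the hunk header, and only raises DClu2009 when the failure happened within the last 30 seconds. A self-contained sketch of that recency test (the sample timestamp is made up):

    package main

    import (
        "fmt"
        "time"
    )

    // Same layout string as the referenceTimestampLayout variable in the router diagnostic.
    const referenceTimestampLayout = "2006-01-02T15:04:05.000000000Z"

    func main() {
        // Illustrative value; a real one would come from a parsed router log line.
        stamp, err := time.Parse(referenceTimestampLayout, "2015-08-10T17:06:05.123456789Z")
        if err != nil {
            fmt.Println("could not parse timestamp:", err)
            return
        }
        // The router retries every second, so only a fresh failure is worth an error.
        if time.Since(stamp).Seconds() < 30.0 {
            fmt.Println("recent failure - report error")
        } else {
            fmt.Println("stale failure - ignore")
        }
    }
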
@@ -5,8 +5,8 @@ import (
     osclient "github.com/openshift/origin/pkg/client"
 )

-func adminCan(client *osclient.Client, ns string, sar *authorizationapi.SubjectAccessReview) (bool, error) {
-    if resp, err := client.SubjectAccessReviews(ns).Create(sar); err != nil {
+func adminCan(client *osclient.Client, action authorizationapi.AuthorizationAttributes) (bool, error) {
+    if resp, err := client.SubjectAccessReviews().Create(&authorizationapi.SubjectAccessReview{Action: action}); err != nil {
         return false, err
     } else if resp.Allowed {
         return true, nil
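
With the namespace folded into the attributes, call sites describe the whole action in one literal. The two shapes now in use, echoing the call sites updated elsewhere in this diff (a fragment, not a standalone program):

    // Cluster-scoped check, as in makeClusterClients: no Namespace set.
    can, err := adminCan(osClient, authorizationapi.AuthorizationAttributes{
        Verb:     "*",
        Resource: "*",
    })

    // Namespaced check, as in the registry/router CanRun methods.
    can, err = adminCan(osClient, authorizationapi.AuthorizationAttributes{
        Namespace:    kapi.NamespaceDefault,
        Verb:         "get",
        Resource:     "services",
        ResourceName: registryName,
    })
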
@@ -32,17 +32,17 @@ func (d MasterConfigCheck) CanRun() (bool, error) {
 func (d MasterConfigCheck) Check() types.DiagnosticResult {
     r := types.NewDiagnosticResult(MasterConfigCheckName)

-    r.Debugf("discMCfile", "Looking for master config file at '%s'", d.MasterConfigFile)
+    r.Debugf("DH0001", "Looking for master config file at '%s'", d.MasterConfigFile)
     masterConfig, err := configapilatest.ReadAndResolveMasterConfig(d.MasterConfigFile)
     if err != nil {
-        r.Errorf("discMCfail", err, "Could not read master config file '%s':\n(%T) %[2]v", d.MasterConfigFile, err)
+        r.Errorf("DH0002", err, "Could not read master config file '%s':\n(%T) %[2]v", d.MasterConfigFile, err)
         return r
     }

-    r.Infof("discMCfound", "Found a master config file: %[1]s", d.MasterConfigFile)
+    r.Infof("DH0003", "Found a master config file: %[1]s", d.MasterConfigFile)

     for _, err := range configvalidation.ValidateMasterConfig(masterConfig).Errors {
-        r.Errorf("discMCinvalid", err, "Validation of master config file '%s' failed:\n(%T) %[2]v", d.MasterConfigFile, err)
+        r.Errorf("DH0004", err, "Validation of master config file '%s' failed:\n(%T) %[2]v", d.MasterConfigFile, err)
     }
     return r
 }
@@ -31,17 +31,17 @@ func (d NodeConfigCheck) CanRun() (bool, error) {
 }
 func (d NodeConfigCheck) Check() types.DiagnosticResult {
     r := types.NewDiagnosticResult(NodeConfigCheckName)
-    r.Debugf("discNCfile", "Looking for node config file at '%s'", d.NodeConfigFile)
+    r.Debugf("DH1001", "Looking for node config file at '%s'", d.NodeConfigFile)
     nodeConfig, err := configapilatest.ReadAndResolveNodeConfig(d.NodeConfigFile)
     if err != nil {
-        r.Errorf("discNCfail", err, "Could not read node config file '%s':\n(%T) %[2]v", d.NodeConfigFile, err)
+        r.Errorf("DH1002", err, "Could not read node config file '%s':\n(%T) %[2]v", d.NodeConfigFile, err)
         return r
     }

-    r.Infof("discNCfound", "Found a node config file: %[1]s", d.NodeConfigFile)
+    r.Infof("DH1003", "Found a node config file: %[1]s", d.NodeConfigFile)

     for _, err := range configvalidation.ValidateNodeConfig(nodeConfig) {
-        r.Errorf("discNCinvalid", err, "Validation of node config file '%s' failed:\n(%T) %[2]v", d.NodeConfigFile, err)
+        r.Errorf("DH1004", err, "Validation of node config file '%s' failed:\n(%T) %[2]v", d.NodeConfigFile, err)
     }
     return r
 }
@@ -151,15 +151,15 @@ var (
 // Provide a summary at the end
 func (l *Logger) Summary(warningsSeen int, errorsSeen int) {
-    l.Noticef("summary", "\nSummary of diagnostics execution (version %v):\n", version.Get())
+    l.Noticef("DL0001", "\nSummary of diagnostics execution (version %v):\n", version.Get())
     if warningsSeen > 0 {
-        l.Noticet("sumWarn", "Warnings seen: {{.warnings}}", Hash{"warnings": warningsSeen})
+        l.Noticet("DL0002", "Warnings seen: {{.warnings}}", Hash{"warnings": warningsSeen})
     }
     if errorsSeen > 0 {
-        l.Noticet("sumErr", "Errors seen: {{.errors}}", Hash{"errors": errorsSeen})
+        l.Noticet("DL0003", "Errors seen: {{.errors}}", Hash{"errors": errorsSeen})
     }
     if warningsSeen == 0 && errorsSeen == 0 {
-        l.Notice("sumNone", "Completed with no errors or warnings seen.")
+        l.Notice("DL0004", "Completed with no errors or warnings seen.")
     }
 }
@@ -257,13 +257,13 @@ func origin(skip int) string {
     }
 }
 func (l *Logger) logp(level Level, id string, text string) {
-    l.LogEntry(Entry{id, origin(1), level, Message{ID: id, EvaluatedText: text}})
+    l.LogEntry(Entry{id, origin(2), level, Message{ID: id, EvaluatedText: text}})
 }
 func (l *Logger) logf(level Level, id string, msg string, a ...interface{}) {
-    l.LogEntry(Entry{id, origin(1), level, Message{ID: id, EvaluatedText: fmt.Sprintf(msg, a...)}})
+    l.LogEntry(Entry{id, origin(2), level, Message{ID: id, EvaluatedText: fmt.Sprintf(msg, a...)}})
 }
 func (l *Logger) logt(level Level, id string, template string, data interface{}) {
-    l.LogEntry(Entry{id, origin(1), level, Message{ID: id, Template: template, TemplateData: data}})
+    l.LogEntry(Entry{id, origin(2), level, Message{ID: id, Template: template, TemplateData: data}})
 }

 func (l *Logger) Finish() {
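
The origin(1) to origin(2) change is the "controller origin off-by-one" fix: logp/logf/logt are always reached through a public wrapper (Notice, Errorf, Noticet, ...), so the interesting call site is two frames up the stack, not one. The real origin helper is not shown in this diff; the sketch below assumes it is built on runtime.Caller:

    package main

    import (
        "fmt"
        "path/filepath"
        "runtime"
    )

    // origin is an assumed stand-in for the logger's helper: report the
    // file:line of the frame `skip` levels above the caller of origin itself.
    func origin(skip int) string {
        _, file, line, ok := runtime.Caller(skip + 1)
        if !ok {
            return "unknown"
        }
        return fmt.Sprintf("%s:%d", filepath.Base(file), line)
    }

    func logf(msg string) { // analogous to Logger.logf
        // skip=2: skip logf itself and the public wrapper, landing on the real call site.
        fmt.Printf("[%s] %s\n", origin(2), msg)
    }

    func Errorf(msg string) { logf(msg) } // analogous to the public wrapper

    func main() {
        Errorf("diagnostic message") // origin should point at this line
    }
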
| ... | ... |
@@ -37,7 +37,7 @@ func (t *textLogger) Write(entry Entry) {
|
| 37 | 37 |
} |
| 38 | 38 |
text := strings.TrimSpace(entry.Message.EvaluatedText) |
| 39 | 39 |
if entry.Level.Level >= WarnLevel.Level {
|
| 40 |
- text = fmt.Sprintf("[ID \"%s\" from %s]\n", entry.ID, entry.Origin) + text
|
|
| 40 |
+ text = fmt.Sprintf("[%s from %s]\n", entry.ID, entry.Origin) + text
|
|
| 41 | 41 |
} |
| 42 | 42 |
if strings.Contains(text, "\n") { // separate multiline comments with newlines
|
| 43 | 43 |
if !t.lastNewline {
|
| ... | ... |
@@ -42,7 +42,7 @@ func (d AnalyzeLogs) Check() types.DiagnosticResult {
|
| 42 | 42 |
|
| 43 | 43 |
for _, unit := range unitLogSpecs {
|
| 44 | 44 |
if svc := d.SystemdUnits[unit.Name]; svc.Enabled || svc.Active {
|
| 45 |
- r.Infof("sdCheckLogs", "Checking journalctl logs for '%s' service", unit.Name)
|
|
| 45 |
+ r.Infof("DS0001", "Checking journalctl logs for '%s' service", unit.Name)
|
|
| 46 | 46 |
|
| 47 | 47 |
cmd := exec.Command("journalctl", "-ru", unit.Name, "--output=json")
|
| 48 | 48 |
// JSON comes out of journalctl one line per record |
| ... | ... |
@@ -58,7 +58,7 @@ func (d AnalyzeLogs) Check() types.DiagnosticResult {
|
| 58 | 58 |
}(cmd) |
| 59 | 59 |
|
| 60 | 60 |
if err != nil {
|
| 61 |
- r.Errorf("sdLogReadErr", err, sdLogReadErr, unit.Name, errStr(err))
|
|
| 61 |
+ r.Errorf("DS0002", err, sdLogReadErr, unit.Name, errStr(err))
|
|
| 62 | 62 |
return r |
| 63 | 63 |
} |
| 64 | 64 |
defer func() { // close out pipe once done reading
|
| ... | ... |
@@ -75,10 +75,10 @@ func (d AnalyzeLogs) Check() types.DiagnosticResult {
|
| 75 | 75 |
} |
| 76 | 76 |
bytes, entry := lineReader.Bytes(), logEntry{}
|
| 77 | 77 |
if err := json.Unmarshal(bytes, &entry); err != nil {
|
| 78 |
- r.Debugf("sdLogBadJSON", "Couldn't read the JSON for this log message:\n%s\nGot error %s", string(bytes), errStr(err))
|
|
| 78 |
+ r.Debugf("DS0003", "Couldn't read the JSON for this log message:\n%s\nGot error %s", string(bytes), errStr(err))
|
|
| 79 | 79 |
} else {
|
| 80 | 80 |
if lineCount > 500 && stampTooOld(entry.TimeStamp, timeLimit) {
|
| 81 |
- r.Debugf("sdLogTrunc", "Stopped reading %s log: timestamp %s too old", unit.Name, entry.TimeStamp)
|
|
| 81 |
+ r.Debugf("DS0004", "Stopped reading %s log: timestamp %s too old", unit.Name, entry.TimeStamp)
|
|
| 82 | 82 |
break // if we've analyzed at least 500 entries, stop when age limit reached (don't scan days of logs) |
| 83 | 83 |
} |
| 84 | 84 |
if unit.StartMatch.MatchString(entry.Message) {
|
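The cutoff above needs to compare journald's __REALTIME_TIMESTAMP (a string of microseconds since the Unix epoch) against a time limit. The real stampTooOld is not shown in this hunk; the following is a hedged reconstruction under that timestamp-format assumption:

package main

import (
    "fmt"
    "strconv"
    "time"
)

// stampTooOld reports whether a journald __REALTIME_TIMESTAMP value
// (microseconds since the Unix epoch, as a string) is older than the limit.
// This is an assumed reconstruction, not the project's implementation.
func stampTooOld(stamp string, limit time.Time) bool {
    usec, err := strconv.ParseInt(stamp, 10, 64)
    if err != nil {
        return false // unparseable timestamp: don't cut the scan short
    }
    return time.Unix(0, usec*int64(time.Microsecond)).Before(limit)
}

func main() {
    limit := time.Now().Add(-time.Hour) // e.g. only analyze the last hour of logs
    twoHoursAgo := time.Now().Add(-2*time.Hour).UnixNano() / int64(time.Microsecond)
    fmt.Println(stampTooOld(fmt.Sprintf("%d", twoHoursAgo), limit)) // true
}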
| ... | ... |
@@ -12,23 +12,23 @@ import ( |
| 12 | 12 |
func GetSystemdUnits(logger *log.Logger) map[string]types.SystemdUnit {
|
| 13 | 13 |
systemdUnits := map[string]types.SystemdUnit{}
|
| 14 | 14 |
|
| 15 |
- logger.Notice("discBeginSysd", "Performing systemd discovery")
|
|
| 15 |
+ logger.Notice("DS1001", "Performing systemd discovery")
|
|
| 16 | 16 |
for _, name := range []string{"openshift", "openshift-master", "openshift-node", "openshift-sdn-master", "openshift-sdn-node", "docker", "openvswitch", "iptables", "etcd", "kubernetes"} {
|
| 17 | 17 |
systemdUnits[name] = discoverSystemdUnit(logger, name) |
| 18 | 18 |
|
| 19 | 19 |
if systemdUnits[name].Exists {
|
| 20 |
- logger.Debugf("discUnit", "Saw systemd unit %s", name)
|
|
| 20 |
+ logger.Debugf("DS1002", "Saw systemd unit %s", name)
|
|
| 21 | 21 |
} |
| 22 | 22 |
} |
| 23 | 23 |
|
| 24 |
- logger.Debugf("discUnits", "%v", systemdUnits)
|
|
| 24 |
+ logger.Debugf("DS1003", "%v", systemdUnits)
|
|
| 25 | 25 |
return systemdUnits |
| 26 | 26 |
} |
| 27 | 27 |
|
| 28 | 28 |
func discoverSystemdUnit(logger *log.Logger, name string) types.SystemdUnit {
|
| 29 | 29 |
unit := types.SystemdUnit{Name: name, Exists: false}
|
| 30 | 30 |
if output, err := exec.Command("systemctl", "show", name).Output(); err != nil {
|
| 31 |
- logger.Errorf("discCtlErr", "Error running `systemctl show %s`: %s\nCannot analyze systemd units.", name, err.Error())
|
|
| 31 |
+ logger.Errorf("DS1004", "Error running `systemctl show %s`: %s\nCannot analyze systemd units.", name, err.Error())
|
|
| 32 | 32 |
|
| 33 | 33 |
} else {
|
| 34 | 34 |
attr := make(map[string]string) |
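The hunk ends just as attr is created; judging from the keys read later (LoadState, UnitFileState, ActiveState, StatusErrno), it is filled by parsing the Key=Value lines that `systemctl show` prints. A standalone sketch of that parsing step (the showUnit helper is illustrative and needs a systemd host to actually run):

package main

import (
    "fmt"
    "os/exec"
    "strings"
)

// showUnit runs `systemctl show <name>` and parses its Key=Value output
// into a map, the same shape as the attr map above.
func showUnit(name string) (map[string]string, error) {
    output, err := exec.Command("systemctl", "show", name).Output()
    if err != nil {
        return nil, err
    }
    attr := make(map[string]string)
    for _, line := range strings.Split(string(output), "\n") {
        if parts := strings.SplitN(line, "=", 2); len(parts) == 2 {
            attr[parts[0]] = parts[1]
        }
    }
    return attr, nil
}

func main() {
    attr, err := showUnit("docker")
    if err != nil {
        fmt.Println("systemctl error:", err)
        return
    }
    fmt.Println("LoadState:", attr["LoadState"])
    fmt.Println("UnitFileState:", attr["UnitFileState"])
    fmt.Println("ActiveState:", attr["ActiveState"])
}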
| ... | ... |
@@ -40,7 +40,7 @@ func discoverSystemdUnit(logger *log.Logger, name string) types.SystemdUnit {
|
| 40 | 40 |
} |
| 41 | 41 |
|
| 42 | 42 |
if val := attr["LoadState"]; val != "loaded" {
|
| 43 |
- logger.Debugf("discUnitENoExist", "systemd unit '%s' does not exist. LoadState is '%s'", name, val)
|
|
| 43 |
+ logger.Debugf("DS1005", "systemd unit '%s' does not exist. LoadState is '%s'", name, val)
|
|
| 44 | 44 |
return unit // doesn't exist - leave everything blank |
| 45 | 45 |
|
| 46 | 46 |
} else {
|
| ... | ... |
@@ -48,19 +48,19 @@ func discoverSystemdUnit(logger *log.Logger, name string) types.SystemdUnit {
|
| 48 | 48 |
} |
| 49 | 49 |
|
| 50 | 50 |
if val := attr["UnitFileState"]; val == "enabled" {
|
| 51 |
- logger.Debugf("discUnitEnabled", "systemd unit '%s' is enabled - it will start automatically at boot.", name)
|
|
| 51 |
+ logger.Debugf("DS1006", "systemd unit '%s' is enabled - it will start automatically at boot.", name)
|
|
| 52 | 52 |
unit.Enabled = true |
| 53 | 53 |
|
| 54 | 54 |
} else {
|
| 55 |
- logger.Debugf("discUnitNoEnable", "systemd unit '%s' is not enabled - it does not start automatically at boot. UnitFileState is '%s'", name, val)
|
|
| 55 |
+ logger.Debugf("DS1007", "systemd unit '%s' is not enabled - it does not start automatically at boot. UnitFileState is '%s'", name, val)
|
|
| 56 | 56 |
} |
| 57 | 57 |
|
| 58 | 58 |
if val := attr["ActiveState"]; val == "active" {
|
| 59 |
- logger.Debugf("discUnitActive", "systemd unit '%s' is currently running", name)
|
|
| 59 |
+ logger.Debugf("DS1008", "systemd unit '%s' is currently running", name)
|
|
| 60 | 60 |
unit.Active = true |
| 61 | 61 |
|
| 62 | 62 |
} else {
|
| 63 |
- logger.Debugf("discUnitNoActive", "systemd unit '%s' is not currently running. ActiveState is '%s'; exit code was %d.", name, val, unit.ExitStatus)
|
|
| 63 |
+ logger.Debugf("DS1009", "systemd unit '%s' is not currently running. ActiveState is '%s'; exit code was %d.", name, val, unit.ExitStatus)
|
|
| 64 | 64 |
} |
| 65 | 65 |
|
| 66 | 66 |
fmt.Sscanf(attr["StatusErrno"], "%d", &unit.ExitStatus) // ignore errors... |
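On the "// ignore errors..." comment above: fmt.Sscanf returns a count and an error, and discarding both simply leaves ExitStatus at zero when StatusErrno is empty or non-numeric, e.g.:

package main

import "fmt"

func main() {
    var exitStatus int
    fmt.Sscanf("", "%d", &exitStatus)    // parse fails; exitStatus stays 0
    fmt.Println(exitStatus)              // 0
    fmt.Sscanf("203", "%d", &exitStatus) // parses successfully
    fmt.Println(exitStatus)              // 203
}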
| ... | ... |
@@ -61,26 +61,16 @@ var tlsClientErrorSeen map[string]bool |
| 61 | 61 |
var unitLogSpecs = []*unitSpec{
|
| 62 | 62 |
{
|
| 63 | 63 |
Name: "openshift-master", |
| 64 |
- StartMatch: regexp.MustCompile("Starting master on"),
|
|
| 64 |
+ StartMatch: regexp.MustCompile("Starting \\w+ Master"),
|
|
| 65 | 65 |
LogMatchers: []logMatcher{
|
| 66 | 66 |
badImageTemplate, |
| 67 | 67 |
{
|
| 68 | 68 |
Regexp: regexp.MustCompile("Unable to decode an event from the watch stream: local error: unexpected message"),
|
| 69 | 69 |
Level: log.InfoLevel, |
| 70 |
- Id: "sdLogOMIgnore", |
|
| 70 |
+ Id: "DS2003", |
|
| 71 | 71 |
Interpretation: "You can safely ignore this message.", |
| 72 | 72 |
}, |
| 73 | 73 |
{
|
| 74 |
- Regexp: regexp.MustCompile("HTTP probe error: Get .*/healthz: dial tcp .*:10250: connection refused"),
|
|
| 75 |
- Level: log.InfoLevel, |
|
| 76 |
- Id: "sdLogOMhzRef", |
|
| 77 |
- Interpretation: ` |
|
| 78 |
-The master does a health check on nodes that are defined in its records, |
|
| 79 |
-and this error is the result when the node is not available yet. |
|
| 80 |
-This is not usually a problem, unless it continues in the logs after |
|
| 81 |
-the node is actually available.`, |
|
| 82 |
- }, |
|
| 83 |
- {
|
|
| 84 | 74 |
// TODO: don't rely on ipv4 format, should be ipv6 "soon" |
| 85 | 75 |
Regexp: regexp.MustCompile("http: TLS handshake error from ([\\d.]+):\\d+: remote error: bad certificate"),
|
| 86 | 76 |
Level: log.WarnLevel, |
| ... | ... |
@@ -90,7 +80,7 @@ the node is actually available.`, |
| 90 | 90 |
if tlsClientErrorSeen == nil { // first time this message was seen
|
| 91 | 91 |
tlsClientErrorSeen = map[string]bool{client: true}
|
| 92 | 92 |
// TODO: too generic, adjust message depending on subnet of the "from" address |
| 93 |
- r.Warn("sdLogOMreBadCert", nil, prelude+`
|
|
| 93 |
+ r.Warn("DS2001", nil, prelude+`
|
|
| 94 | 94 |
This error indicates that a client attempted to connect to the master |
| 95 | 95 |
HTTPS API server but broke off the connection because the master's |
| 96 | 96 |
certificate is not validated by a certificate authority (CA) acceptable |
| ... | ... |
@@ -101,8 +91,8 @@ At this time, the master API certificate is signed by a private CA |
| 101 | 101 |
(created the first time the master runs) and clients should have a copy of |
| 102 | 102 |
that CA certificate in order to validate connections to the master. Most |
| 103 | 103 |
likely, either: |
| 104 |
-1. the master has generated a new CA (after the administrator deleted |
|
| 105 |
- the old one) and the client has a copy of the old CA cert, or |
|
| 104 |
+1. the master has generated a new CA (e.g. after the administrator |
|
| 105 |
+ deleted the old one) and the client has a copy of the old CA cert, or |
|
| 106 | 106 |
2. the client hasn't been configured with a private CA at all (or the |
| 107 | 107 |
wrong one), or |
| 108 | 108 |
3. the client is attempting to reach the master at a URL that isn't |
| ... | ... |
@@ -131,99 +121,66 @@ log message: |
| 131 | 131 |
|
| 132 | 132 |
} else if !tlsClientErrorSeen[client] {
|
| 133 | 133 |
tlsClientErrorSeen[client] = true |
| 134 |
- r.Warn("sdLogOMreBadCert", nil, prelude+`This message was diagnosed above, but for a different client address.`)
|
|
| 134 |
+ r.Warn("DS2002", nil, prelude+`This message was diagnosed above, but for a different client address.`)
|
|
| 135 | 135 |
} // else, it's a repeat, don't mention it |
| 136 | 136 |
return true // show once for every client failing to connect, not just the first |
| 137 | 137 |
}, |
| 138 | 138 |
}, |
| 139 |
- {
|
|
| 140 |
- // user &{system:anonymous [system:unauthenticated]} -> /api/v\\w+/services?namespace="
|
|
| 141 |
- Regexp: regexp.MustCompile("system:anonymous\\W*system:unauthenticated\\W*/api/v\\w+/services\\?namespace="),
|
|
| 142 |
- Level: log.WarnLevel, |
|
| 143 |
- Id: "sdLogOMunauthNode", |
|
| 144 |
- Interpretation: ` |
|
| 145 |
-This indicates the API server (master) received an unscoped request to |
|
| 146 |
-get Services. Requests like this probably come from a node trying to |
|
| 147 |
-discover where it should proxy services. |
|
| 148 |
- |
|
| 149 |
-However, the request was unauthenticated, so it was denied. The node |
|
| 150 |
-either did not offer a client certificate for credential, or offered an |
|
| 151 |
-invalid one (not signed by the certificate authority the master uses). |
|
| 152 |
-The node will not be able to function without this access. |
|
| 153 |
- |
|
| 154 |
-Unfortunately, this message does not tell us *which* node is the |
|
| 155 |
-problem. But running diagnostics on your node hosts should find a log |
|
| 156 |
-message for any node with this problem. |
|
| 157 |
-`, |
|
| 158 |
- }, |
|
| 159 | 139 |
}, |
| 160 | 140 |
}, |
| 161 | 141 |
{
|
| 162 | 142 |
Name: "openshift-node", |
| 163 |
- StartMatch: regexp.MustCompile("Starting OpenShift node"), //systemd puts this out; could change
|
|
| 143 |
+ StartMatch: regexp.MustCompile("Starting \\w+ Node"), //systemd puts this out; could change
|
|
| 164 | 144 |
LogMatchers: []logMatcher{
|
| 165 | 145 |
badImageTemplate, |
| 166 | 146 |
{
|
| 167 |
- Regexp: regexp.MustCompile("Unable to load services: Get (http\\S+/api/v\\w+/services\\?namespace=): (.+)"), // e.g. x509: certificate signed by unknown authority
|
|
| 147 |
+ Regexp: regexp.MustCompile(`Unable to register.*"system:anonymous"`), |
|
| 168 | 148 |
Level: log.ErrorLevel, |
| 169 |
- Id: "sdLogONconnMaster", |
|
| 149 |
+ Id: "DS2004", |
|
| 170 | 150 |
Interpretation: ` |
| 171 |
-openshift-node could not connect to the master API in order to determine |
|
| 172 |
-its responsibilities. This host will not function as a node until this |
|
| 173 |
-is resolved. Pods scheduled for this node will remain in pending or |
|
| 174 |
-unknown state forever.`, |
|
| 175 |
- }, |
|
| 176 |
- {
|
|
| 177 |
- Regexp: regexp.MustCompile(`Unable to load services: request.*403 Forbidden: Forbidden: "/api/v\w+/services\?namespace=" denied by default`), |
|
| 178 |
- Level: log.ErrorLevel, |
|
| 179 |
- Id: "sdLogONMasterForbids", |
|
| 180 |
- Interpretation: ` |
|
| 181 |
-openshift-node could not connect to the master API to determine |
|
| 182 |
-its responsibilities because it lacks the proper credentials. Nodes |
|
| 183 |
-should specify a client certificate in order to identify themselves to |
|
| 184 |
-the master. This message typically means that either no client key/cert |
|
| 185 |
-was supplied, or it is not validated by the certificate authority (CA) |
|
| 186 |
-the master uses. You should supply a correct client key and certificate |
|
| 187 |
-in the .kubeconfig specified in node-config.yaml |
|
| 151 |
+openshift-node could not register with the master API because it lacks |
|
| 152 |
+the proper credentials. Nodes should specify a client certificate in |
|
| 153 |
+order to identify themselves to the master. This message typically means |
|
| 154 |
+that either no client key/cert was supplied, or it is not validated |
|
| 155 |
+by the certificate authority (CA) the master uses. You should supply |
|
| 156 |
+a correct client key and certificate in the .kubeconfig specified in |
|
| 157 |
+node-config.yaml |
|
| 188 | 158 |
|
| 189 | 159 |
This host will not function as a node until this is resolved. Pods |
| 190 | 160 |
scheduled for this node will remain in pending or unknown state forever.`, |
| 191 | 161 |
}, |
| 192 | 162 |
{
|
| 193 |
- Regexp: regexp.MustCompile("Could not find an allocated subnet for this minion.*Waiting.."),
|
|
| 163 |
+ Regexp: regexp.MustCompile("Could not find an allocated subnet for"),
|
|
| 194 | 164 |
Level: log.WarnLevel, |
| 195 |
- Id: "sdLogOSNnoSubnet", |
|
| 165 |
+ Id: "DS2005", |
|
| 196 | 166 |
Interpretation: ` |
| 197 | 167 |
This warning occurs when openshift-node is trying to request the |
| 198 | 168 |
SDN subnet it should be configured with according to the master, |
| 199 |
-but either can't connect to it ("All the given peers are not reachable")
|
|
| 200 |
-or has not yet been assigned a subnet ("Key not found").
|
|
| 169 |
+but either can't connect to it or has not yet been assigned a subnet. |
|
| 201 | 170 |
|
| 202 |
-This can just be a matter of waiting for the master to become fully |
|
| 203 |
-available and define a record for the node (aka "minion") to use, |
|
| 204 |
-and openshift-node will wait until that occurs, so the presence |
|
| 205 |
-of this message in the node log isn't necessarily a problem as |
|
| 206 |
-long as the SDN is actually working, but this message may help indicate |
|
| 207 |
-the problem if it is not working. |
|
| 171 |
+This can occur before the master becomes fully available and defines a |
|
| 172 |
+record for the node to use; openshift-node will wait until that occurs, |
|
| 173 |
+so the presence of this message in the node log isn't necessarily a |
|
| 174 |
+problem as long as the SDN is actually working, but this message may |
|
| 175 |
+help indicate the problem if it is not working. |
|
| 208 | 176 |
|
| 209 |
-If the master is available and this node's record is defined and this |
|
| 210 |
-message persists, then it may be a sign of a different misconfiguration. |
|
| 211 |
-Unfortunately the message is not specific about why the connection failed. |
|
| 212 |
-Check the master's URL in the node configuration. |
|
| 177 |
+If the master is available and this log message persists, then it may |
|
| 178 |
+be a sign of a different misconfiguration. Check the master's URL in |
|
| 179 |
+the node kubeconfig. |
|
| 213 | 180 |
* Is the protocol http? It should be https. |
| 214 |
- * Can you reach the address and port from the node using curl? |
|
| 215 |
- ("404 page not found" is correct response)`,
|
|
| 181 |
+ * Can you reach the address and port from the node using curl -k? |
|
| 182 |
+`, |
|
| 216 | 183 |
}, |
| 217 | 184 |
}, |
| 218 | 185 |
}, |
| 219 | 186 |
{
|
| 220 | 187 |
Name: "docker", |
| 221 |
- StartMatch: regexp.MustCompile(`Starting Docker Application Container Engine.`), // RHEL Docker at least |
|
| 188 |
+ StartMatch: regexp.MustCompile(`Starting Docker`), // RHEL Docker at least |
|
| 222 | 189 |
LogMatchers: []logMatcher{
|
| 223 | 190 |
{
|
| 224 | 191 |
Regexp: regexp.MustCompile(`Usage: docker \\[OPTIONS\\] COMMAND`), |
| 225 | 192 |
Level: log.ErrorLevel, |
| 226 |
- Id: "sdLogDbadOpt", |
|
| 193 |
+ Id: "DS2006", |
|
| 227 | 194 |
Interpretation: ` |
| 228 | 195 |
This indicates that docker failed to parse its command line |
| 229 | 196 |
successfully, so it just printed a standard usage message and exited. |
| ... | ... |
@@ -236,7 +193,7 @@ The node will not run on this host until this is resolved.`, |
| 236 | 236 |
{
|
| 237 | 237 |
Regexp: regexp.MustCompile(`^Unable to open the database file: unable to open database file$`), |
| 238 | 238 |
Level: log.ErrorLevel, |
| 239 |
- Id: "sdLogDopenDB", |
|
| 239 |
+ Id: "DS2007", |
|
| 240 | 240 |
Interpretation: ` |
| 241 | 241 |
This indicates that docker failed to record its state to its database. |
| 242 | 242 |
The most likely reason is that it is out of disk space. It is also |
| ... | ... |
@@ -254,7 +211,7 @@ The node will not run on this host until this is resolved.`, |
| 254 | 254 |
{
|
| 255 | 255 |
Regexp: regexp.MustCompile(`no space left on device$`), |
| 256 | 256 |
Level: log.ErrorLevel, |
| 257 |
- Id: "sdLogDfull", |
|
| 257 |
+ Id: "DS2008", |
|
| 258 | 258 |
Interpretation: ` |
| 259 | 259 |
This indicates that docker has run out of space for container volumes |
| 260 | 260 |
or metadata (by default, stored in /var/lib/docker, but configurable). |
| ... | ... |
@@ -272,7 +229,7 @@ The node will not run on this host until this is resolved.`, |
| 272 | 272 |
{ // generic error seen - do this last
|
| 273 | 273 |
Regexp: regexp.MustCompile(`\\slevel="fatal"\\s`), |
| 274 | 274 |
Level: log.ErrorLevel, |
| 275 |
- Id: "sdLogDfatal", |
|
| 275 |
+ Id: "DS2009", |
|
| 276 | 276 |
Interpretation: ` |
| 277 | 277 |
This is not a known problem, but it is causing Docker to crash, |
| 278 | 278 |
so the node will not run on this host until it is resolved.`, |
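Putting the matcher changes above together: each unit spec pairs a loosened StartMatch (such as `Starting \w+ Node`) with a list of logMatcher entries. The sketch below applies such matchers to made-up journal lines; the field names mirror the structs in the diff, but the apply loop, the sample lines, and the idea that the scan stops at the most recent service-start message are assumptions for illustration:

package main

import (
    "fmt"
    "regexp"
)

// logMatcher mirrors the fields used above: a pattern, a severity,
// a message ID, and an interpretation to report when the pattern fires.
type logMatcher struct {
    Regexp         *regexp.Regexp
    Level          string // the project uses log levels; a string stands in here
    Id             string
    Interpretation string
}

var startMatch = regexp.MustCompile(`Starting \w+ Node`)

var matchers = []logMatcher{
    {
        Regexp:         regexp.MustCompile(`Unable to register.*"system:anonymous"`),
        Level:          "error",
        Id:             "DS2004",
        Interpretation: "the node lacks credentials to register with the master",
    },
    {
        Regexp:         regexp.MustCompile("Could not find an allocated subnet for"),
        Level:          "warn",
        Id:             "DS2005",
        Interpretation: "the node has not yet been assigned an SDN subnet",
    },
}

func main() {
    // Made-up journal lines, newest first, for illustration only.
    lines := []string{
        `Could not find an allocated subnet for node-1.example.com`,
        `Starting Example Node`,
    }
    for _, line := range lines {
        for _, m := range matchers {
            if m.Regexp.MatchString(line) {
                fmt.Printf("[%s %s] %s\n", m.Level, m.Id, m.Interpretation)
            }
        }
        if startMatch.MatchString(line) {
            fmt.Println("reached the most recent service start; stop scanning older entries")
            break
        }
    }
}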
| ... | ... |
@@ -46,7 +46,7 @@ func (d UnitStatus) Check() types.DiagnosticResult {
|
| 46 | 46 |
// Anything that is enabled but not running deserves notice |
| 47 | 47 |
for name, unit := range d.SystemdUnits {
|
| 48 | 48 |
if unit.Enabled && !unit.Active {
|
| 49 |
- r.Errort("sdUnitInactive", nil, sdUnitInactive, log.Hash{"unit": name})
|
|
| 49 |
+ r.Errort("DS3001", nil, sdUnitInactive, log.Hash{"unit": name})
|
|
| 50 | 50 |
} |
| 51 | 51 |
} |
| 52 | 52 |
return r |
| ... | ... |
@@ -56,9 +56,9 @@ func unitRequiresUnit(r types.DiagnosticResult, unit types.SystemdUnit, requires |
| 56 | 56 |
templateData := log.Hash{"unit": unit.Name, "required": requires.Name, "reason": reason}
|
| 57 | 57 |
|
| 58 | 58 |
if (unit.Active || unit.Enabled) && !requires.Exists {
|
| 59 |
- r.Errort("sdUnitReqLoaded", nil, sdUnitReqLoaded, templateData)
|
|
| 59 |
+ r.Errort("DS3002", nil, sdUnitReqLoaded, templateData)
|
|
| 60 | 60 |
} else if unit.Active && !requires.Active {
|
| 61 |
- r.Errort("sdUnitReqActive", nil, sdUnitReqActive, templateData)
|
|
| 61 |
+ r.Errort("DS3003", nil, sdUnitReqActive, templateData)
|
|
| 62 | 62 |
} |
| 63 | 63 |
} |
| 64 | 64 |
|
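For completeness, the dependency rule above reduces to two cases: a unit that is enabled or active while a required unit is not loaded at all, and a unit that is active while its requirement is not running. A self-contained sketch with hypothetical unit values (reporting via Printf instead of the templated diagnostic results):

package main

import "fmt"

// SystemdUnit carries the discovery results used by the checks above.
type SystemdUnit struct {
    Name    string
    Exists  bool
    Enabled bool
    Active  bool
}

// requireUnit mirrors the two conditions in unitRequiresUnit: report when
// `unit` is in use but `requires` is missing, or active while `requires` is not.
func requireUnit(unit, requires SystemdUnit, reason string) {
    if (unit.Active || unit.Enabled) && !requires.Exists {
        fmt.Printf("error: %s requires %s, which is not loaded (%s)\n", unit.Name, requires.Name, reason)
    } else if unit.Active && !requires.Active {
        fmt.Printf("error: %s requires %s, which is not running (%s)\n", unit.Name, requires.Name, reason)
    }
}

func main() {
    node := SystemdUnit{Name: "openshift-node", Exists: true, Enabled: true, Active: true}
    docker := SystemdUnit{Name: "docker", Exists: true, Enabled: true, Active: false}
    requireUnit(node, docker, "nodes run containers via docker") // hypothetical reason text
}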