Browse code

Build retry loop for screen sessions

There is a timing window where we might lose the commands being
stuffed into screen because bash is still spawning in the new window.
In those cases, loop around and try building screen sessions again.

Change-Id: I49247de06bbd59424cb10fb9a8db145907be5138
Related-Bug: #1331274
(cherry picked from commit 0afa912e99dc9bad8b490960beb8f0cf85750dcc)

Sean Dague authored on 2014/06/19 04:36:19
Showing 1 changed file
... ...
@@ -1044,44 +1044,100 @@ function run_process {
1044 1044
     echo $!
1045 1045
 }
1046 1046
 
1047
+function _start_in_screen {
1048
+    local service=$1
1049
+    local cmd=$2
1050
+    local screen_name=${SCREEN_NAME:-stack}
1051
+    local status_dir=${SERVICE_DIR:-${DEST}/status}
1052
+    local service_dir="$status_dir/$screen_name"
1053
+    local pid="$service_dir/$service.pid"
1054
+    local failure="$service_dir/$service.failure"
1055
+
1056
+    if [[ -n ${SCREEN_LOGDIR} ]]; then
1057
+        local logfile=${SCREEN_LOGDIR}/screen-${service}.${CURRENT_LOG_TIME}.log
1058
+        local shortlog=${SCREEN_LOGDIR}/screen-${service}.log
1059
+        # this whole dance is done because of slow nodes
1060
+        screen -S $screen_name -p $service -X logfile ${logfile}
1061
+        screen -S $screen_name -p $service -X log on
1062
+        ln -sf ${logfile} ${shortlog}
1063
+    fi
1064
+
1065
+    NL=`echo -ne '\015'`
1066
+    # This fun command does the following:
1067
+    # - the passed server command is backgrounded
1068
+    # - the pid of the background process is saved in the usual place
1069
+    # - the server process is brought back to the foreground
1070
+    # - if the server process exits prematurely the fg command errors
1071
+    #   and a message is written to stdout and the service failure file
1072
+    # The pid saved can be used in screen_stop() as a process group
1073
+    # id to kill off all child processes
1074
+    echo "Running: $cmd & echo \$! >$pid; fg || echo \"$service failed to start\" | tee \"$failure\"$NL"
1075
+    screen -S $screen_name -p $service -X stuff "$cmd & echo \$! >$pid; fg || echo \"$service failed to start\" | tee \"$failure\"$NL"
1076
+}
1077
+
1078
+
1079
+function _is_running_in_screen {
1080
+    local service=$1
1081
+    local screen_name=${SCREEN_NAME:-stack}
1082
+    local status_dir=${SERVICE_DIR:-${DEST}/status}
1083
+    local service_dir="$status_dir/$screen_name"
1084
+    local pid="$service_dir/$service.pid"
1085
+    local failure="$service_dir/$service.failure"
1086
+    if [[ ! -e "$pid" && ! -e "$failure" ]]; then
1087
+        # if we don't have a pid or a failure for why, the command may not
1088
+        # have stuffed in there
1089
+        echo "Warning: neither $pid nor $failure exist, $service didn't seem to start"
1090
+        return 1
1091
+    fi
1092
+    if [[ -n ${SCREEN_LOGDIR} ]]; then
1093
+        # if we should be logging, but we don't have a log file, something is wrong
1094
+        local logfile=${SCREEN_LOGDIR}/screen-${service}.${CURRENT_LOG_TIME}.log
1095
+        if [[ ! -e "$logfile" ]]; then
1096
+            echo "Warning: expected logfile $logfile not found, something wrong with starting $service"
1097
+            return 1
1098
+        fi
1099
+    fi
1100
+    return 0
1101
+}
1102
+
1047 1103
 # Helper to launch a service in a named screen
1048 1104
 # screen_it service "command-line"
1049 1105
 function screen_it {
1050
-    SCREEN_NAME=${SCREEN_NAME:-stack}
1051
-    SERVICE_DIR=${SERVICE_DIR:-${DEST}/status}
1052
-    USE_SCREEN=$(trueorfalse True $USE_SCREEN)
1106
+    local service=$1
1107
+    local cmd=$2
1108
+    local screen_name=${SCREEN_NAME:-stack}
1109
+    local status_dir=${SERVICE_DIR:-${DEST}/status}
1110
+    local service_dir="$status_dir/$screen_name"
1111
+    local use_screen=$(trueorfalse True $USE_SCREEN)
1112
+    local pid="$service_dir/$service.pid"
1053 1113
 
1054 1114
     if is_service_enabled $1; then
1055 1115
         # Append the service to the screen rc file
1056
-        screen_rc "$1" "$2"
1057
-
1058
-        if [[ "$USE_SCREEN" = "True" ]]; then
1059
-            screen -S $SCREEN_NAME -X screen -t $1
1060
-
1061
-            if [[ -n ${SCREEN_LOGDIR} ]]; then
1062
-                screen -S $SCREEN_NAME -p $1 -X logfile ${SCREEN_LOGDIR}/screen-${1}.${CURRENT_LOG_TIME}.log
1063
-                screen -S $SCREEN_NAME -p $1 -X log on
1064
-                ln -sf ${SCREEN_LOGDIR}/screen-${1}.${CURRENT_LOG_TIME}.log ${SCREEN_LOGDIR}/screen-${1}.log
1065
-            fi
1066
-
1067
-            # sleep to allow bash to be ready to be send the command - we are
1068
-            # creating a new window in screen and then sends characters, so if
1069
-            # bash isn't running by the time we send the command, nothing happens
1070
-            sleep 3
1071
-
1072
-            NL=`echo -ne '\015'`
1073
-            # This fun command does the following:
1074
-            # - the passed server command is backgrounded
1075
-            # - the pid of the background process is saved in the usual place
1076
-            # - the server process is brought back to the foreground
1077
-            # - if the server process exits prematurely the fg command errors
1078
-            #   and a message is written to stdout and the service failure file
1079
-            # The pid saved can be used in screen_stop() as a process group
1080
-            # id to kill off all child processes
1081
-            screen -S $SCREEN_NAME -p $1 -X stuff "$2 & echo \$! >$SERVICE_DIR/$SCREEN_NAME/$1.pid; fg || echo \"$1 failed to start\" | tee \"$SERVICE_DIR/$SCREEN_NAME/$1.failure\"$NL"
1116
+        screen_rc "$service" "$cmd"
1117
+
1118
+        if [[ "$use_screen" = "True" ]]; then
1119
+            screen -S $screen_name -X screen -t $service
1120
+
1121
+            # this retry loop brought to you by slow compute nodes, screen raciness,
1122
+            # and frustration in upgrading.
1123
+            local screen_tries=0
1124
+            while [ "$screen_tries" -lt 10 ]; do
1125
+                _start_in_screen "$service" "$cmd"
1126
+                if _is_running_in_screen $service; then
1127
+                    screen_tries=10
1128
+                else
1129
+                    screen_tries=$[screen_tries + 1]
1130
+                    echo "Failed to start service after $screen_tries attempt(s), retrying"
1131
+                    if [[ "$screen_tries" -eq 10 ]]; then
1132
+                        echo "Too many retries, giving up"
1133
+                        exit 1
1134
+                    fi
1135
+                    sleep 1
1136
+                fi
1137
+            done
1082 1138
         else
1083 1139
             # Spawn directly without screen
1084
-            run_process "$1" "$2" >$SERVICE_DIR/$SCREEN_NAME/$1.pid
1140
+            run_process "$service" "$cmd" >$pid
1085 1141
         fi
1086 1142
     fi
1087 1143
 }