Browse code

Build retry loop for screen sessions

There is a timing window in which the commands being stuffed into
screen can be lost because bash is still spawning in the new window.
In those cases, loop around and try building the screen session again.

Change-Id: I49247de06bbd59424cb10fb9a8db145907be5138
Related-Bug: #1331274

Sean Dague authored on 2014/06/19 04:36:19
Showing 1 changed file
... ...
@@ -1058,44 +1058,100 @@ function run_process {
1058 1058
     echo $!
1059 1059
 }
1060 1060
 
1061
+function _start_in_screen {
1062
+    local service=$1
1063
+    local cmd=$2
1064
+    local screen_name=${SCREEN_NAME:-stack}
1065
+    local status_dir=${SERVICE_DIR:-${DEST}/status}
1066
+    local service_dir="$status_dir/$screen_name"
1067
+    local pid="$service_dir/$service.pid"
1068
+    local failure="$service_dir/$service.failure"
1069
+
1070
+    if [[ -n ${SCREEN_LOGDIR} ]]; then
1071
+        local logfile=${SCREEN_LOGDIR}/screen-${service}.${CURRENT_LOG_TIME}.log
1072
+        local shortlog=${SCREEN_LOGDIR}/screen-${service}.log
1073
+        # this whole dance is done because of slow nodes
1074
+        screen -S $screen_name -p $service -X logfile ${logfile}
1075
+        screen -S $screen_name -p $service -X log on
1076
+        ln -sf ${logfile} ${shortlog}
1077
+    fi
1078
+
1079
+    NL=`echo -ne '\015'`
1080
+    # This fun command does the following:
1081
+    # - the passed server command is backgrounded
1082
+    # - the pid of the background process is saved in the usual place
1083
+    # - the server process is brought back to the foreground
1084
+    # - if the server process exits prematurely the fg command errors
1085
+    #   and a message is written to stdout and the service failure file
1086
+    # The pid saved can be used in screen_stop() as a process group
1087
+    # id to kill off all child processes
1088
+    echo "Running: $cmd & echo \$! >$pid; fg || echo \"$service failed to start\" | tee \"$failure\"$NL"
1089
+    screen -S $screen_name -p $service -X stuff "$cmd & echo \$! >$pid; fg || echo \"$service failed to start\" | tee \"$failure\"$NL"
1090
+}
1091
+
1092
+
1093
+function _is_running_in_screen {
1094
+    local service=$1
1095
+    local screen_name=${SCREEN_NAME:-stack}
1096
+    local status_dir=${SERVICE_DIR:-${DEST}/status}
1097
+    local service_dir="$status_dir/$screen_name"
1098
+    local pid="$service_dir/$service.pid"
1099
+    local failure="$service_dir/$service.failure"
1100
+    if [[ ! -e "$pid" && ! -e "$failure" ]]; then
1101
+        # if we don't have a pid or a failure for why, the command may not
1102
+        # have stuffed in there
1103
+        echo "Warning: neither $pid nor $failure exist, $service didn't seem to start"
1104
+        return 1
1105
+    fi
1106
+    if [[ -n ${SCREEN_LOGDIR} ]]; then
1107
+        # if we should be logging, but we don't have a log file, something is wrong
1108
+        local logfile=${SCREEN_LOGDIR}/screen-${service}.${CURRENT_LOG_TIME}.log
1109
+        if [[ ! -e "$logfile" ]]; then
1110
+            echo "Warning: expected logfile $logfile not found, something wrong with starting $service"
1111
+            return 1
1112
+        fi
1113
+    fi
1114
+    return 0
1115
+}
1116
+
1061 1117
 # Helper to launch a service in a named screen
1062 1118
 # screen_it service "command-line"
1063 1119
 function screen_it {
1064
-    SCREEN_NAME=${SCREEN_NAME:-stack}
1065
-    SERVICE_DIR=${SERVICE_DIR:-${DEST}/status}
1066
-    USE_SCREEN=$(trueorfalse True $USE_SCREEN)
1120
+    local service=$1
1121
+    local cmd=$2
1122
+    local screen_name=${SCREEN_NAME:-stack}
1123
+    local status_dir=${SERVICE_DIR:-${DEST}/status}
1124
+    local service_dir="$status_dir/$screen_name"
1125
+    local use_screen=$(trueorfalse True $USE_SCREEN)
1126
+    local pid="$service_dir/$service.pid"
1067 1127
 
1068 1128
     if is_service_enabled $1; then
1069 1129
         # Append the service to the screen rc file
1070
-        screen_rc "$1" "$2"
1071
-
1072
-        if [[ "$USE_SCREEN" = "True" ]]; then
1073
-            screen -S $SCREEN_NAME -X screen -t $1
1074
-
1075
-            if [[ -n ${SCREEN_LOGDIR} ]]; then
1076
-                screen -S $SCREEN_NAME -p $1 -X logfile ${SCREEN_LOGDIR}/screen-${1}.${CURRENT_LOG_TIME}.log
1077
-                screen -S $SCREEN_NAME -p $1 -X log on
1078
-                ln -sf ${SCREEN_LOGDIR}/screen-${1}.${CURRENT_LOG_TIME}.log ${SCREEN_LOGDIR}/screen-${1}.log
1079
-            fi
1080
-
1081
-            # sleep to allow bash to be ready to be send the command - we are
1082
-            # creating a new window in screen and then sends characters, so if
1083
-            # bash isn't running by the time we send the command, nothing happens
1084
-            sleep 3
1085
-
1086
-            NL=`echo -ne '\015'`
1087
-            # This fun command does the following:
1088
-            # - the passed server command is backgrounded
1089
-            # - the pid of the background process is saved in the usual place
1090
-            # - the server process is brought back to the foreground
1091
-            # - if the server process exits prematurely the fg command errors
1092
-            #   and a message is written to stdout and the service failure file
1093
-            # The pid saved can be used in screen_stop() as a process group
1094
-            # id to kill off all child processes
1095
-            screen -S $SCREEN_NAME -p $1 -X stuff "$2 & echo \$! >$SERVICE_DIR/$SCREEN_NAME/$1.pid; fg || echo \"$1 failed to start\" | tee \"$SERVICE_DIR/$SCREEN_NAME/$1.failure\"$NL"
1130
+        screen_rc "$service" "$cmd"
1131
+
1132
+        if [[ "$use_screen" = "True" ]]; then
1133
+            screen -S $screen_name -X screen -t $service
1134
+
1135
+            # this retry loop brought to you by slow compute nodes, screen raciness,
1136
+            # and frustration in upgrading.
1137
+            local screen_tries=0
1138
+            while [ "$screen_tries" -lt 10 ]; do
1139
+                _start_in_screen "$service" "$cmd"
1140
+                if _is_running_in_screen $service; then
1141
+                    screen_tries=10
1142
+                else
1143
+                    screen_tries=$[screen_tries + 1]
1144
+                    echo "Failed to start service after $screen_tries attempt(s), retrying"
1145
+                    if [[ "$screen_tries" -eq 10 ]]; then
1146
+                        echo "Too many retries, giving up"
1147
+                        exit 1
1148
+                    fi
1149
+                    sleep 1
1150
+                fi
1151
+            done
1096 1152
         else
1097 1153
             # Spawn directly without screen
1098
-            run_process "$1" "$2" >$SERVICE_DIR/$SCREEN_NAME/$1.pid
1154
+            run_process "$service" "$cmd" >$pid
1099 1155
         fi
1100 1156
     fi
1101 1157
 }