There is a timing window in which the commands being stuffed into
screen can be lost because bash is still starting up in the new window.
In those cases, loop around and try starting the service in screen again.
Change-Id: I49247de06bbd59424cb10fb9a8db145907be5138
Related-Bug: #1331274
(cherry picked from commit 0afa912e99dc9bad8b490960beb8f0cf85750dcc)
| ... | ... |
@@ -1044,44 +1044,100 @@ function run_process {
|
| 1044 | 1044 |
echo $! |
| 1045 | 1045 |
} |
| 1046 | 1046 |
|
| 1047 |
# Send a service's start command to its screen window.
# _start_in_screen service "command-line"
#
# Arguments:
#   $1 - service name; also used as the screen window name
#   $2 - command line to type into that window
# Globals read:
#   SCREEN_NAME (default "stack"), SERVICE_DIR (default ${DEST}/status),
#   SCREEN_LOGDIR, CURRENT_LOG_TIME
# Globals written:
#   NL - set to a carriage return
# Outputs:
#   prints "Running: <stuffed command>" to stdout
function _start_in_screen {
    local service=$1
    local cmd=$2
    local screen_name=${SCREEN_NAME:-stack}
    local status_dir=${SERVICE_DIR:-${DEST}/status}
    local service_dir="$status_dir/$screen_name"
    local pid="$service_dir/$service.pid"
    local failure="$service_dir/$service.failure"

    if [[ -n ${SCREEN_LOGDIR} ]]; then
        local logfile="${SCREEN_LOGDIR}/screen-${service}.${CURRENT_LOG_TIME}.log"
        local shortlog="${SCREEN_LOGDIR}/screen-${service}.log"
        # this whole dance is done because of slow nodes
        # Paths are quoted so a log directory containing whitespace is passed
        # to screen and ln as a single argument.
        screen -S "$screen_name" -p "$service" -X logfile "$logfile"
        screen -S "$screen_name" -p "$service" -X log on
        ln -sf "$logfile" "$shortlog"
    fi

    # Carriage return: makes the shell in the window execute the stuffed line.
    # NOTE(review): NL is left global, as before; other helpers may read it -
    # confirm before making it local.
    NL=$'\015'

    # This fun command does the following:
    # - the passed server command is backgrounded
    # - the pid of the background process is saved in the usual place
    # - the server process is brought back to the foreground
    # - if the server process exits prematurely the fg command errors
    #   and a message is written to stdout and the service failure file
    # The pid saved can be used in screen_stop() as a process group
    # id to kill off all child processes
    #
    # The line is built once so that what is logged is exactly what is sent.
    # Both the pid and the failure path are quoted inside the stuffed line.
    local stuffed="$cmd & echo \$! >\"$pid\"; fg || echo \"$service failed to start\" | tee \"$failure\"$NL"
    echo "Running: $stuffed"
    screen -S "$screen_name" -p "$service" -X stuff "$stuffed"
}
|
| 1077 |
+ |
|
| 1078 |
+ |
|
| 1079 |
# Check whether a service's start command appears to have reached its
# screen window.
# _is_running_in_screen service
#
# Arguments:
#   $1 - service name
# Globals read:
#   SCREEN_NAME (default "stack"), SERVICE_DIR (default ${DEST}/status),
#   SCREEN_LOGDIR, CURRENT_LOG_TIME
# Outputs:
#   a "Warning: ..." line on stdout for each failed check
# Returns:
#   0 if a pid file or a failure file exists for the service and, when
#   SCREEN_LOGDIR is set, the timestamped screen log file exists too;
#   1 otherwise.
function _is_running_in_screen {
    local service=$1
    local screen_name=${SCREEN_NAME:-stack}
    local status_dir=${SERVICE_DIR:-${DEST}/status}
    local service_dir="$status_dir/$screen_name"
    local pid="$service_dir/$service.pid"
    local failure="$service_dir/$service.failure"

    if [[ ! -e "$pid" && ! -e "$failure" ]]; then
        # if we don't have a pid or a failure for why, the command may not
        # have stuffed in there
        echo "Warning: neither $pid nor $failure exist, $service didn't seem to start"
        return 1
    fi

    if [[ -n ${SCREEN_LOGDIR} ]]; then
        # if we should be logging, but we don't have a log file, something is wrong
        local logfile="${SCREEN_LOGDIR}/screen-${service}.${CURRENT_LOG_TIME}.log"
        if [[ ! -e "$logfile" ]]; then
            echo "Warning: expected logfile $logfile not found, something wrong with starting $service"
            return 1
        fi
    fi

    return 0
}
|
| 1102 |
+ |
|
| 1047 | 1103 |
# Helper to launch a service in a named screen
# screen_it service "command-line"
#
# Arguments:
#   $1 - service name; nothing is done unless is_service_enabled accepts it
#   $2 - command line that starts the service
# Globals read:
#   SCREEN_NAME (default "stack"), SERVICE_DIR (default ${DEST}/status),
#   USE_SCREEN (passed to trueorfalse with a default of True)
# Behaviour:
#   With screen in use, creates a window named after the service and stuffs
#   the command into it, retrying up to 10 times; exits the shell with
#   status 1 if the command never appears to start.
#   Without screen, calls run_process and writes its output to the pid file.
function screen_it {
    local service=$1
    local cmd=$2
    local screen_name=${SCREEN_NAME:-stack}
    local status_dir=${SERVICE_DIR:-${DEST}/status}
    local service_dir="$status_dir/$screen_name"
    # USE_SCREEN is deliberately unquoted so that an unset/empty value passes
    # no second argument to trueorfalse, as before.
    local use_screen=$(trueorfalse True $USE_SCREEN)
    local pid="$service_dir/$service.pid"

    if is_service_enabled "$service"; then
        # Append the service to the screen rc file
        screen_rc "$service" "$cmd"

        if [[ "$use_screen" = "True" ]]; then
            screen -S "$screen_name" -X screen -t "$service"

            # this retry loop brought to you by slow compute nodes, screen raciness,
            # and frustration in upgrading.
            local screen_tries=0
            while [[ "$screen_tries" -lt 10 ]]; do
                _start_in_screen "$service" "$cmd"
                if _is_running_in_screen "$service"; then
                    break
                fi
                screen_tries=$(( screen_tries + 1 ))
                echo "Failed to start service after $screen_tries attempt(s), retrying"
                if [[ "$screen_tries" -eq 10 ]]; then
                    echo "Too many retries, giving up"
                    exit 1
                fi
                sleep 1
                # The pid/failure files are written by the shell inside the
                # screen window, not by us, so they can show up a moment after
                # the check above. Look again before stuffing the command a
                # second time, so a command that already landed is not typed
                # into the window twice. Warnings are dropped here; the next
                # pass through the loop reports them if it still fails.
                if _is_running_in_screen "$service" >/dev/null; then
                    break
                fi
            done
        else
            # Spawn directly without screen
            run_process "$service" "$cmd" >"$pid"
        fi
    fi
}