There is a timing window in which the command being stuffed into a
screen window can be lost because bash is still spawning in that window.
When that happens, loop around and retry stuffing the command into the
screen window.
Change-Id: I49247de06bbd59424cb10fb9a8db145907be5138
Related-Bug: #1331274
| ... | ... |
@@ -1058,44 +1058,100 @@ function run_process {
|
| 1058 | 1058 |
echo $! |
| 1059 | 1059 |
} |
| 1060 | 1060 |
|
| 1061 |
# Configure logging for, and launch a command in, a window of the screen
# session. The window is selected by name (``-p $service``); screen_it
# creates it before calling this helper.
# _start_in_screen service "command-line"
#
# Globals read:
#   SCREEN_NAME       - screen session name (default: stack)
#   SERVICE_DIR       - status directory (default: ${DEST}/status)
#   SCREEN_LOGDIR     - if non-empty, per-window screen logging is enabled
#   CURRENT_LOG_TIME  - timestamp component of the log file name
function _start_in_screen {
    local service=$1
    local cmd=$2
    local screen_name=${SCREEN_NAME:-stack}
    local status_dir=${SERVICE_DIR:-${DEST}/status}
    local service_dir="$status_dir/$screen_name"
    local pid="$service_dir/$service.pid"
    local failure="$service_dir/$service.failure"

    if [[ -n ${SCREEN_LOGDIR} ]]; then
        local logfile="${SCREEN_LOGDIR}/screen-${service}.${CURRENT_LOG_TIME}.log"
        local shortlog="${SCREEN_LOGDIR}/screen-${service}.log"
        # this whole dance is done because of slow nodes
        screen -S "$screen_name" -p "$service" -X logfile "$logfile"
        screen -S "$screen_name" -p "$service" -X log on
        # Keep a stable, timestamp-free name pointing at the current log
        ln -sf "$logfile" "$shortlog"
    fi

    # Carriage return: makes the shell in the window execute the stuffed
    # text. Kept local so it does not leak into the caller's environment.
    local nl=$'\r'

    # This fun command does the following:
    # - the passed server command is backgrounded
    # - the pid of the background process is saved in the usual place
    # - the server process is brought back to the foreground
    # - if the server process exits prematurely the fg command errors
    #   and a message is written to stdout and the service failure file
    # The pid saved can be used in screen_stop() as a process group
    # id to kill off all child processes
    local stuffed="$cmd & echo \$! >\"$pid\"; fg || echo \"$service failed to start\" | tee \"$failure\"$nl"

    echo "Running: $stuffed"
    screen -S "$screen_name" -p "$service" -X stuff "$stuffed"
}
|
| 1091 |
+ |
|
| 1092 |
+ |
|
| 1093 |
# Check whether a command stuffed into a screen window left evidence of
# having been executed.
# _is_running_in_screen service
#
# Globals read:
#   SCREEN_NAME       - screen session name (default: stack)
#   SERVICE_DIR       - status directory (default: ${DEST}/status)
#   SCREEN_LOGDIR     - if non-empty, a screen log file is also required
#   CURRENT_LOG_TIME  - timestamp component of the log file name
#
# Returns 0 when either the pid file or the failure file exists (and, if
# logging is enabled, the log file exists too); returns 1 otherwise.
# Warnings are written to stderr.
function _is_running_in_screen {
    local service=$1
    local screen_name=${SCREEN_NAME:-stack}
    local status_dir=${SERVICE_DIR:-${DEST}/status}
    local service_dir="$status_dir/$screen_name"
    local pid="$service_dir/$service.pid"
    local failure="$service_dir/$service.failure"

    # if we don't have a pid or a failure for why, the command may not
    # have stuffed in there
    if [[ ! -e "$pid" && ! -e "$failure" ]]; then
        echo "Warning: neither $pid nor $failure exist, $service didn't seem to start" >&2
        return 1
    fi

    # if we should be logging, but we don't have a log file, something is wrong
    if [[ -n "${SCREEN_LOGDIR}" ]]; then
        local logfile="${SCREEN_LOGDIR}/screen-${service}.${CURRENT_LOG_TIME}.log"
        if [[ ! -e "$logfile" ]]; then
            echo "Warning: expected logfile $logfile not found, something wrong with starting $service" >&2
            return 1
        fi
    fi

    return 0
}
|
| 1116 |
+ |
|
| 1061 | 1117 |
# Helper to launch a service in a named screen |
| 1062 | 1118 |
# screen_it service "command-line" |
| 1063 | 1119 |
# Launch an enabled service, either in its own window of the screen
# session or directly via run_process.
#
# Globals read:
#   SCREEN_NAME  - screen session name (default: stack)
#   SERVICE_DIR  - status directory (default: ${DEST}/status)
#   USE_SCREEN   - passed through trueorfalse (default True)
#
# Does nothing when the service is not enabled. In screen mode the start
# is retried up to 10 times, one second apart; the script exits with
# status 1 if every attempt fails.
function screen_it {
    local service=$1
    local cmd=$2
    local screen_name=${SCREEN_NAME:-stack}
    local status_dir=${SERVICE_DIR:-${DEST}/status}
    local service_dir="$status_dir/$screen_name"
    local use_screen=$(trueorfalse True $USE_SCREEN)
    local pid="$service_dir/$service.pid"
    local max_tries=10
    local attempt=1

    if ! is_service_enabled "$service"; then
        return 0
    fi

    # Append the service to the screen rc file
    screen_rc "$service" "$cmd"

    if [[ "$use_screen" != "True" ]]; then
        # Spawn directly without screen
        run_process "$service" "$cmd" >"$pid"
        return
    fi

    # Create the window that the command will be stuffed into
    screen -S "$screen_name" -X screen -t "$service"

    # this retry loop brought to you by slow compute nodes, screen raciness,
    # and frustration in upgrading.
    #
    # NOTE(review): the success check relies on the pid/failure files, so
    # files left over from an earlier run would make the first attempt look
    # successful - confirm the status directory is cleaned beforehand.
    # NOTE(review): each retry stuffs the command again; if an earlier
    # attempt was merely slow, the service may end up started twice.
    while [[ "$attempt" -le "$max_tries" ]]; do
        _start_in_screen "$service" "$cmd"
        if _is_running_in_screen "$service"; then
            return 0
        fi

        if [[ "$attempt" -eq "$max_tries" ]]; then
            echo "Failed to start $service after $attempt attempt(s)" >&2
            echo "Too many retries, giving up" >&2
            exit 1
        fi

        echo "Failed to start service after $attempt attempt(s), retrying"
        sleep 1
        attempt=$((attempt + 1))
    done
}