This reverts commit cd0fba6c3803bb79b37f09709378cf120a6824c6.
It appears that this isn't actually making things better, and
is possibly increasing failure rates. Revert to go back to the
tried-and-true model of just sleeping a lot.
Change-Id: Ie630f72d91f16160fa12b3892e5c74378d244cea
| ... | ... |
@@ -1044,100 +1044,44 @@ function run_process {
|
| 1044 | 1044 |
echo $! |
| 1045 | 1045 |
} |
| 1046 | 1046 |
|
| 1047 |
-function _start_in_screen {
|
|
| 1048 |
- local service=$1 |
|
| 1049 |
- local cmd=$2 |
|
| 1050 |
- local screen_name=${SCREEN_NAME:-stack}
|
|
| 1051 |
- local status_dir=${SERVICE_DIR:-${DEST}/status}
|
|
| 1052 |
- local service_dir="$status_dir/$screen_name" |
|
| 1053 |
- local pid="$service_dir/$service.pid" |
|
| 1054 |
- local failure="$service_dir/$service.failure" |
|
| 1055 |
- |
|
| 1056 |
- if [[ -n ${SCREEN_LOGDIR} ]]; then
|
|
| 1057 |
- local logfile=${SCREEN_LOGDIR}/screen-${service}.${CURRENT_LOG_TIME}.log
|
|
| 1058 |
- local shortlog=${SCREEN_LOGDIR}/screen-${service}.log
|
|
| 1059 |
- # this whole dance is done because of slow nodes |
|
| 1060 |
- screen -S $screen_name -p $service -X logfile ${logfile}
|
|
| 1061 |
- screen -S $screen_name -p $service -X log on |
|
| 1062 |
- ln -sf ${logfile} ${shortlog}
|
|
| 1063 |
- fi |
|
| 1064 |
- |
|
| 1065 |
- NL=`echo -ne '\015'` |
|
| 1066 |
- # This fun command does the following: |
|
| 1067 |
- # - the passed server command is backgrounded |
|
| 1068 |
- # - the pid of the background process is saved in the usual place |
|
| 1069 |
- # - the server process is brought back to the foreground |
|
| 1070 |
- # - if the server process exits prematurely the fg command errors |
|
| 1071 |
- # and a message is written to stdout and the service failure file |
|
| 1072 |
- # The pid saved can be used in screen_stop() as a process group |
|
| 1073 |
- # id to kill off all child processes |
|
| 1074 |
- echo "Running: $cmd & echo \$! >$pid; fg || echo \"$service failed to start\" | tee \"$failure\"$NL" |
|
| 1075 |
- screen -S $screen_name -p $service -X stuff "$cmd & echo \$! >$pid; fg || echo \"$service failed to start\" | tee \"$failure\"$NL" |
|
| 1076 |
-} |
|
| 1077 |
- |
|
| 1078 |
- |
|
| 1079 |
-function _is_running_in_screen {
|
|
| 1080 |
- local service=$1 |
|
| 1081 |
- local screen_name=${SCREEN_NAME:-stack}
|
|
| 1082 |
- local status_dir=${SERVICE_DIR:-${DEST}/status}
|
|
| 1083 |
- local service_dir="$status_dir/$screen_name" |
|
| 1084 |
- local pid="$service_dir/$service.pid" |
|
| 1085 |
- local failure="$service_dir/$service.failure" |
|
| 1086 |
- if [[ ! -e "$pid" && ! -e "$failure" ]]; then |
|
| 1087 |
- # if we don't have a pid or a failure for why, the command may not |
|
| 1088 |
- # have stuffed in there |
|
| 1089 |
- echo "Warning: neither $pid nor $failure exist, $service didn't seem to start" |
|
| 1090 |
- return 1 |
|
| 1091 |
- fi |
|
| 1092 |
- if [[ -n ${SCREEN_LOGDIR} ]]; then
|
|
| 1093 |
- # if we should be logging, but we don't have a log file, something is wrong |
|
| 1094 |
- local logfile=${SCREEN_LOGDIR}/screen-${service}.${CURRENT_LOG_TIME}.log
|
|
| 1095 |
- if [[ ! -e "$logfile" ]]; then |
|
| 1096 |
- echo "Warning: expected logfile $logfile not found, something wrong with starting $service" |
|
| 1097 |
- return 1 |
|
| 1098 |
- fi |
|
| 1099 |
- fi |
|
| 1100 |
- return 0 |
|
| 1101 |
-} |
|
| 1102 |
- |
|
| 1103 | 1047 |
# Helper to launch a service in a named screen |
| 1104 | 1048 |
# screen_it service "command-line" |
| 1105 | 1049 |
function screen_it {
|
| 1106 |
- local service=$1 |
|
| 1107 |
- local cmd=$2 |
|
| 1108 |
- local screen_name=${SCREEN_NAME:-stack}
|
|
| 1109 |
- local status_dir=${SERVICE_DIR:-${DEST}/status}
|
|
| 1110 |
- local service_dir="$status_dir/$screen_name" |
|
| 1111 |
- local use_screen=$(trueorfalse True $USE_SCREEN) |
|
| 1112 |
- local pid="$service_dir/$service.pid" |
|
| 1050 |
+ SCREEN_NAME=${SCREEN_NAME:-stack}
|
|
| 1051 |
+ SERVICE_DIR=${SERVICE_DIR:-${DEST}/status}
|
|
| 1052 |
+ USE_SCREEN=$(trueorfalse True $USE_SCREEN) |
|
| 1113 | 1053 |
|
| 1114 | 1054 |
if is_service_enabled $1; then |
| 1115 | 1055 |
# Append the service to the screen rc file |
| 1116 |
- screen_rc "$service" "$cmd" |
|
| 1117 |
- |
|
| 1118 |
- if [[ "$use_screen" = "True" ]]; then |
|
| 1119 |
- screen -S $screen_name -X screen -t $service |
|
| 1120 |
- |
|
| 1121 |
- # this retry loop brought to you by slow compute nodes, screen raciness, |
|
| 1122 |
- # and frustration in upgrading. |
|
| 1123 |
- local screen_tries=0 |
|
| 1124 |
- while [ "$screen_tries" -lt 10 ]; do |
|
| 1125 |
- _start_in_screen "$service" "$cmd" |
|
| 1126 |
- if _is_running_in_screen $service; then |
|
| 1127 |
- screen_tries=10 |
|
| 1128 |
- else |
|
| 1129 |
- screen_tries=$[screen_tries + 1] |
|
| 1130 |
- echo "Failed to start service after $screen_tries attempt(s), retrying" |
|
| 1131 |
- if [[ "$screen_tries" -eq 10 ]]; then |
|
| 1132 |
- echo "Too many retries, giving up" |
|
| 1133 |
- exit 1 |
|
| 1134 |
- fi |
|
| 1135 |
- sleep 1 |
|
| 1136 |
- fi |
|
| 1137 |
- done |
|
| 1056 |
+ screen_rc "$1" "$2" |
|
| 1057 |
+ |
|
| 1058 |
+ if [[ "$USE_SCREEN" = "True" ]]; then |
|
| 1059 |
+ screen -S $SCREEN_NAME -X screen -t $1 |
|
| 1060 |
+ |
|
| 1061 |
+ if [[ -n ${SCREEN_LOGDIR} ]]; then
|
|
| 1062 |
+ screen -S $SCREEN_NAME -p $1 -X logfile ${SCREEN_LOGDIR}/screen-${1}.${CURRENT_LOG_TIME}.log
|
|
| 1063 |
+ screen -S $SCREEN_NAME -p $1 -X log on |
|
| 1064 |
+ ln -sf ${SCREEN_LOGDIR}/screen-${1}.${CURRENT_LOG_TIME}.log ${SCREEN_LOGDIR}/screen-${1}.log
|
|
| 1065 |
+ fi |
|
| 1066 |
+ |
|
| 1067 |
+ # sleep to allow bash to be ready to be sent the command - we are |
|
| 1068 |
+ # creating a new window in screen and then sending characters, so if |
|
| 1069 |
+ # bash isn't running by the time we send the command, nothing happens |
|
| 1070 |
+ sleep 3 |
|
| 1071 |
+ |
|
| 1072 |
+ NL=`echo -ne '\015'` |
|
| 1073 |
+ # This fun command does the following: |
|
| 1074 |
+ # - the passed server command is backgrounded |
|
| 1075 |
+ # - the pid of the background process is saved in the usual place |
|
| 1076 |
+ # - the server process is brought back to the foreground |
|
| 1077 |
+ # - if the server process exits prematurely the fg command errors |
|
| 1078 |
+ # and a message is written to stdout and the service failure file |
|
| 1079 |
+ # The pid saved can be used in screen_stop() as a process group |
|
| 1080 |
+ # id to kill off all child processes |
|
| 1081 |
+ screen -S $SCREEN_NAME -p $1 -X stuff "$2 & echo \$! >$SERVICE_DIR/$SCREEN_NAME/$1.pid; fg || echo \"$1 failed to start\" | tee \"$SERVICE_DIR/$SCREEN_NAME/$1.failure\"$NL" |
|
| 1138 | 1082 |
else |
| 1139 | 1083 |
# Spawn directly without screen |
| 1140 |
- run_process "$service" "$cmd" >$pid |
|
| 1084 |
+ run_process "$1" "$2" >$SERVICE_DIR/$SCREEN_NAME/$1.pid |
|
| 1141 | 1085 |
fi |
| 1142 | 1086 |
fi |
| 1143 | 1087 |
} |