With cells v2, on initial bring-up, discover hosts can't run unless all
the compute nodes have checked in. The documentation says that you
should run ``nova service-list --binary nova-compute`` and see all
your hosts before running discover hosts. This isn't really viable in
a multinode devstack because of how things are brought up in parts.
We can, however, ensure that stack.sh does not complete before the
local compute node is up by waiting for that compute node to check in
before finishing. This happens quite late in the stack.sh run, so it
shouldn't add any extra time in most runs.
Cells v1 and XenServer don't use real hostnames in the service table
(they encode complex, hostname-like data there to provide more
topology information than just hostnames). They are exempted from this
check.
Related-Bug: #1708039
Change-Id: I32eb59b9d6c225a3e93992be3a3b9f4b251d7189
... | ... |
@@ -407,6 +407,26 @@ EOF |
407 | 407 |
return $rval |
408 | 408 |
} |
409 | 409 |
|
410 |
# Wait until this host's nova-compute service is listed by the compute API.
#
# Polls ``openstack compute service list`` (cloud ``devstack-admin``,
# region ``$REGION_NAME``), filtered to this machine's ``hostname`` and the
# ``nova-compute`` binary, until it prints a service ID or ``timeout``
# expires.  On failure the unfiltered service list is printed to help
# diagnose platforms where the hostname filter does not match.
#
# Uses globals ``REGION_NAME``
# wait_for_compute timeout
#
# Returns 0 once the service is found, otherwise the non-zero exit status
# of the ``timeout`` command.
function wait_for_compute {
    local timeout=$1
    local rval=0
    local host
    # Resolve the hostname once, up front, so the value can be quoted
    # inside the generated script below.
    host=$(hostname)
    # NOTE(review): the timer name is shared with wait_for_service; it is
    # kept as-is so timing reports do not change -- confirm this is intended.
    time_start "wait_for_service"
    # The heredoc is deliberately unquoted: REGION_NAME and host are
    # expanded here, while the backslash-escaped names are left for the
    # child bash to evaluate on every iteration.
    timeout "$timeout" bash -x <<EOF || rval=$?
        ID=""
        while true; do
            ID=\$(openstack --os-cloud devstack-admin --os-region "$REGION_NAME" compute service list --host "$host" --service nova-compute -c ID -f value)
            # Check before sleeping so an already registered service
            # costs no extra wait.
            if [[ "\$ID" != "" ]]; then
                break
            fi
            sleep 1
        done
EOF
    time_stop "wait_for_service"
    # Figure out what's happening on platforms where this doesn't work
    if [[ "$rval" != 0 ]]; then
        echo "Didn't find service registered by hostname after $timeout seconds"
        openstack --os-cloud devstack-admin --os-region "$REGION_NAME" compute service list
    fi
    return "$rval"
}
|
429 |
+ |
|
410 | 430 |
|
411 | 431 |
# ping check |
412 | 432 |
# Uses globals ``ENABLED_SERVICES``, ``TOP_DIR``, ``MULTI_HOST``, ``PRIVATE_NETWORK`` |
... | ... |
@@ -944,6 +944,28 @@ function start_nova_conductor { |
944 | 944 |
done |
945 | 945 |
} |
946 | 946 |
|
947 |
# Block until the local nova-compute service has registered, unless the
# deployment is one where that check cannot work.
#
# Uses globals ``NOVA_READY_TIMEOUT`` (optional, seconds, default 60),
# ``VIRT_DRIVER``
# is_nova_ready
#
# Returns 0 when the check is skipped, otherwise the status of
# ``wait_for_compute``.
function is_nova_ready {
    # NOTE(sdague): with cells v2 all the compute services must be up
    # and checked into the database before discover_hosts is run. This
    # happens in all in one installs by accident, because > 30 seconds
    # happen between here and the script ending. However, in multinode
    # tests this can very often not be the case. So ensure that the
    # compute is up before we move on.
    if is_service_enabled n-cell; then
        # cells v1 can't complete the check below because it munges
        # hostnames with cell information (grumble grumble).
        return 0
    fi
    # TODO(sdague): honestly, this probably should be a plug point for
    # an external system.
    if [[ "$VIRT_DRIVER" == 'xenserver' ]]; then
        # xenserver encodes information in the hostname of the compute
        # because of the dom0/domU split. Just ignore for now.
        return 0
    fi
    # The wait is tunable so slow environments can allow more time; the
    # default matches the previously hard-coded 60 seconds.
    wait_for_compute "${NOVA_READY_TIMEOUT:-60}"
}
|
968 |
+ |
|
947 | 969 |
function start_nova { |
948 | 970 |
# this catches the cells v1 case early |
949 | 971 |
_set_singleconductor |
... | ... |
@@ -1433,6 +1433,13 @@ fi |
1433 | 1433 |
# Sanity checks |
1434 | 1434 |
# ============= |
1435 | 1435 |
|
1436 |
# Compute readiness
#
# When this node runs nova-compute, hold here until is_nova_ready
# returns, so the checks that follow see the compute service.
#
# TODO(sdague): there should be some generic phase here.
if is_service_enabled n-cpu; then
    is_nova_ready
fi
|
1442 |
+ |
|
1436 | 1443 |
# Check the status of running services |
1437 | 1444 |
service_check |
1438 | 1445 |
|