Coverage for nova/conductor/manager.py: 89%
925 statements
coverage.py v7.6.12, created at 2025-04-24 11:16 +0000
1# Copyright 2013 IBM Corp.
2#
3# Licensed under the Apache License, Version 2.0 (the "License"); you may
4# not use this file except in compliance with the License. You may obtain
5# a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12# License for the specific language governing permissions and limitations
13# under the License.
15"""Handles database requests from other nova services."""
17import collections
18import contextlib
19import copy
20import eventlet
21import functools
22import sys
23import typing as ty
25from keystoneauth1 import exceptions as ks_exc
26from oslo_config import cfg
27from oslo_db import exception as db_exc
28from oslo_limit import exception as limit_exceptions
29from oslo_log import log as logging
30import oslo_messaging as messaging
31from oslo_serialization import jsonutils
32from oslo_utils import excutils
33from oslo_utils import timeutils
34from oslo_utils import versionutils
36from nova.accelerator import cyborg
37from nova import availability_zones
38from nova.compute import instance_actions
39from nova.compute import rpcapi as compute_rpcapi
40from nova.compute import task_states
41from nova.compute import utils as compute_utils
42from nova.compute.utils import wrap_instance_event
43from nova.compute import vm_states
44from nova.conductor.tasks import cross_cell_migrate
45from nova.conductor.tasks import live_migrate
46from nova.conductor.tasks import migrate
47from nova import context as nova_context
48from nova import exception
49from nova.i18n import _
50from nova.image import glance
51from nova.limit import placement as placement_limits
52from nova.limit import utils as limit_utils
53from nova import manager
54from nova.network import neutron
55from nova import notifications
56from nova import objects
57from nova.objects import base as nova_object
58from nova.objects import fields
59from nova import profiler
60from nova import rpc
61from nova.scheduler.client import query
62from nova.scheduler.client import report
63from nova.scheduler import utils as scheduler_utils
64from nova import servicegroup
65from nova import utils
66from nova.volume import cinder
68LOG = logging.getLogger(__name__)
69CONF = cfg.CONF
72def targets_cell(fn):
73 """Wrap a method and automatically target the instance's cell.
75 This decorates a method with signature func(self, context, instance, ...)
76 and automatically targets the context with the instance's cell
77 mapping. It does this by looking up the InstanceMapping.
78 """
79 @functools.wraps(fn)
80 def wrapper(self, context, *args, **kwargs):
81 instance = kwargs.get('instance') or args[0]
82 try:
83 im = objects.InstanceMapping.get_by_instance_uuid(
84 context, instance.uuid)
85 except exception.InstanceMappingNotFound:
86 LOG.error('InstanceMapping not found, unable to target cell',
87 instance=instance)
88 except db_exc.CantStartEngineError:
89 # Check to see if we can ignore API DB connection failures
90 # because we might already be in the cell conductor.
91 with excutils.save_and_reraise_exception() as err_ctxt:
92 if CONF.api_database.connection is None:
93 err_ctxt.reraise = False
94 else:
95 LOG.debug('Targeting cell %(cell)s for conductor method %(meth)s',
96 {'cell': im.cell_mapping.identity,
97 'meth': fn.__name__})
98 # NOTE(danms): Target our context to the cell for the rest of
99 # this request, so that none of the subsequent code needs to
100 # care about it.
101 nova_context.set_target_cell(context, im.cell_mapping)
102 return fn(self, context, *args, **kwargs)
103 return wrapper
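# Illustrative usage sketch (not part of manager.py): a hypothetical
# conductor task decorated with targets_cell. Only the
# (self, context, instance, ...) signature described in the docstring
# above is assumed; the class and method names here are made up.
class _ExampleCellTasks:
    @targets_cell
    def poke_instance(self, context, instance):
        # By the time this body runs, `context` has been targeted at the
        # cell that owns `instance`, so DB reads/writes below hit that
        # cell's database.
        return instance.uuid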
106class ConductorManager(manager.Manager):
107 """Mission: Conduct things.
109 The methods in the base API for nova-conductor are various proxy operations
110 performed on behalf of the nova-compute service running on compute nodes.
111 Compute nodes are not allowed to directly access the database, so this set
112 of methods allows them to get specific work done without locally accessing
113 the database.
115 The nova-conductor service also exposes an API in the 'compute_task'
116 namespace. See the ComputeTaskManager class for details.
117 """
119 target = messaging.Target(version='3.0')
121 def __init__(self, *args, **kwargs):
122 super(ConductorManager, self).__init__(service_name='conductor',
123 *args, **kwargs)
124 self.compute_task_mgr = ComputeTaskManager()
125 self.additional_endpoints.append(self.compute_task_mgr)
127 # NOTE(hanlind): This can be removed in version 4.0 of the RPC API
128 def provider_fw_rule_get_all(self, context):
129 # NOTE(hanlind): Simulate an empty db result for compat reasons.
130 return []
132 def _object_dispatch(self, target, method, args, kwargs):
133 """Dispatch a call to an object method.
135 This ensures that object methods get called and any exception
136 that is raised gets wrapped in an ExpectedException for forwarding
137 back to the caller (without spamming the conductor logs).
138 """
139 try:
140 # NOTE(danms): Keep the getattr inside the try block since
141 # a missing method is really a client problem
142 return getattr(target, method)(*args, **kwargs)
143 except Exception:
144 raise messaging.ExpectedException()
146 def object_class_action_versions(self, context, objname, objmethod,
147 object_versions, args, kwargs):
148 objclass = nova_object.NovaObject.obj_class_from_name(
149 objname, object_versions[objname])
150 args = tuple([context] + list(args))
151 result = self._object_dispatch(objclass, objmethod, args, kwargs)
152 # NOTE(danms): The RPC layer will convert to primitives for us,
153 # but in this case, we need to honor the version the client is
154 # asking for, so we do it before returning here.
155 # NOTE(hanlind): Do not convert older than requested objects,
156 # see bug #1596119.
157 if isinstance(result, nova_object.NovaObject):
158 target_version = object_versions[objname]
159 requested_version = versionutils.convert_version_to_tuple(
160 target_version)
161 actual_version = versionutils.convert_version_to_tuple(
162 result.VERSION)
163 do_backport = requested_version < actual_version
164 other_major_version = requested_version[0] != actual_version[0]
165 if do_backport or other_major_version:
166 result = result.obj_to_primitive(
167 target_version=target_version,
168 version_manifest=object_versions)
169 return result
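# Worked example of the backport decision above, using made-up version
# numbers: a client that pinned an object at 1.5 while the local code
# produces 1.7 gets a backport because (1, 5) < (1, 7); equal or newer
# pins are returned unconverted unless the major versions differ.
_requested = versionutils.convert_version_to_tuple('1.5')   # (1, 5)
_actual = versionutils.convert_version_to_tuple('1.7')      # (1, 7)
assert _requested < _actual            # do_backport would be True
assert _requested[0] == _actual[0]     # same major, no forced conversion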
171 def object_action(self, context, objinst, objmethod, args, kwargs):
172 """Perform an action on an object."""
173 oldobj = objinst.obj_clone()
174 result = self._object_dispatch(objinst, objmethod, args, kwargs)
175 updates = dict()
176 # NOTE(danms): Diff the object with the one passed to us and
177 # generate a list of changes to forward back
178 for name, field in objinst.fields.items():
179 if not objinst.obj_attr_is_set(name): 179 ↛ 181 (line 179 didn't jump to line 181 because the condition on line 179 was never true)
180 # Avoid demand-loading anything
181 continue
182 if (not oldobj.obj_attr_is_set(name) or 182 ↛ 178 (line 182 didn't jump to line 178 because the condition on line 182 was always true)
183 getattr(oldobj, name) != getattr(objinst, name)):
184 updates[name] = field.to_primitive(objinst, name,
185 getattr(objinst, name))
186 # This is safe since a field named this would conflict with the
187 # method anyway
188 updates['obj_what_changed'] = objinst.obj_what_changed()
189 return updates, result
191 def object_backport_versions(self, context, objinst, object_versions):
192 target = object_versions[objinst.obj_name()]
193 LOG.debug('Backporting %(obj)s to %(ver)s with versions %(manifest)s',
194 {'obj': objinst.obj_name(),
195 'ver': target,
196 'manifest': ','.join(
197 ['%s=%s' % (name, ver)
198 for name, ver in object_versions.items()])})
199 return objinst.obj_to_primitive(target_version=target,
200 version_manifest=object_versions)
202 def reset(self):
203 objects.Service.clear_min_version_cache()
206@contextlib.contextmanager
207def try_target_cell(context, cell):
208 """If cell is not None call func with context.target_cell.
210 This is a method to help during the transition period. Currently
211 various mappings may not exist if a deployment has not migrated to
212 cellsv2. If there is no mapping call the func as normal, otherwise
213 call it in a target_cell context.
214 """
215 if cell: 215 ↛ 219 (line 215 didn't jump to line 219 because the condition on line 215 was always true)
216 with nova_context.target_cell(context, cell) as cell_context:
217 yield cell_context
218 else:
219 yield context
222@contextlib.contextmanager
223def obj_target_cell(obj, cell):
224 """Run with object's context set to a specific cell"""
225 with try_target_cell(obj._context, cell) as target:
226 with obj.obj_alternate_context(target):
227 yield target
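# Illustrative sketch (hypothetical helper, not part of this module):
# how obj_target_cell is typically used to run a save against a
# specific cell's database. `instance` and `cell_mapping` stand in for
# objects obtained from the API database.
def _example_save_in_cell(instance, cell_mapping):
    with obj_target_cell(instance, cell_mapping):
        # Inside the block the instance's context points at the given
        # cell, so the save lands in that cell's database.
        instance.save()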
230@profiler.trace_cls("rpc")
231class ComputeTaskManager:
232 """Namespace for compute methods.
234 This class presents an rpc API for nova-conductor under the 'compute_task'
235 namespace. The methods here are compute operations that are invoked
236 by the API service. These methods see the operation to completion, which
237 may involve coordinating activities on multiple compute nodes.
238 """
240 target = messaging.Target(namespace='compute_task', version='1.25')
242 def __init__(self):
243 self.compute_rpcapi = compute_rpcapi.ComputeAPI()
244 self.volume_api = cinder.API()
245 self.image_api = glance.API()
246 self.network_api = neutron.API()
247 self.servicegroup_api = servicegroup.API()
248 self.query_client = query.SchedulerQueryClient()
249 self.notifier = rpc.get_notifier('compute')
250 # Helps us record the host in EventReporter
251 self.host = CONF.host
253 try:
254 # Test our placement client during initialization
255 self.report_client
256 except (ks_exc.EndpointNotFound,
257 ks_exc.DiscoveryFailure,
258 ks_exc.RequestTimeout,
259 ks_exc.GatewayTimeout,
260 ks_exc.ConnectFailure) as e:
261 # Non-fatal, likely transient (although not definitely);
262 # continue startup but log the warning so that when things
263 # fail later, it will be clear why we can not do certain
264 # things.
265 LOG.warning('Unable to initialize placement client (%s); '
266 'Continuing with startup, but some operations '
267 'will not be possible.', e)
268 except (ks_exc.MissingAuthPlugin,
269 ks_exc.Unauthorized) as e:
270 # This is almost definitely fatal mis-configuration. The
271 # Unauthorized error might be transient, but it is
272 # probably reasonable to consider it fatal.
273 LOG.error('Fatal error initializing placement client; '
274 'config is incorrect or incomplete: %s', e)
275 raise
276 except Exception as e:
277 # Unknown/unexpected errors here are fatal
278 LOG.error('Fatal error initializing placement client: %s', e)
279 raise
281 @property
282 def report_client(self):
283 return report.report_client_singleton()
285 def reset(self):
286 LOG.info('Reloading compute RPC API')
287 compute_rpcapi.LAST_VERSION = None
288 self.compute_rpcapi = compute_rpcapi.ComputeAPI()
290 # TODO(tdurakov): remove `live` parameter here on compute task api RPC
291 # version bump to 2.x
292 # TODO(danms): remove the `reservations` parameter here on compute task api
293 # RPC version bump to 2.x
294 @messaging.expected_exceptions(
295 exception.NoValidHost,
296 exception.ComputeServiceUnavailable,
297 exception.ComputeHostNotFound,
298 exception.InvalidHypervisorType,
299 exception.InvalidCPUInfo,
300 exception.UnableToMigrateToSelf,
301 exception.DestinationHypervisorTooOld,
302 exception.InvalidLocalStorage,
303 exception.InvalidSharedStorage,
304 exception.HypervisorUnavailable,
305 exception.InstanceInvalidState,
306 exception.MigrationPreCheckError,
307 exception.UnsupportedPolicyException)
308 @targets_cell
309 @wrap_instance_event(prefix='conductor')
310 def migrate_server(self, context, instance, scheduler_hint, live, rebuild,
311 flavor, block_migration, disk_over_commit, reservations=None,
312 clean_shutdown=True, request_spec=None, host_list=None):
313 if instance and not isinstance(instance, nova_object.NovaObject): 313 ↛ 316 (line 313 didn't jump to line 316 because the condition on line 313 was never true)
314 # NOTE(danms): Until v2 of the RPC API, we need to tolerate
315 # old-world instance objects here
316 attrs = ['metadata', 'system_metadata', 'info_cache',
317 'security_groups']
318 instance = objects.Instance._from_db_object(
319 context, objects.Instance(), instance,
320 expected_attrs=attrs)
321 # NOTE: Remove this when we drop support for v1 of the RPC API
322 if flavor and not isinstance(flavor, objects.Flavor): 322 ↛ 325 (line 322 didn't jump to line 325 because the condition on line 322 was never true)
323 # Code downstream may expect extra_specs to be populated since it
324 # is receiving an object, so lookup the flavor to ensure this.
325 flavor = objects.Flavor.get_by_id(context, flavor['id'])
326 if live and not rebuild and not flavor:
327 self._live_migrate(context, instance, scheduler_hint,
328 block_migration, disk_over_commit, request_spec)
329 elif not live and not rebuild and flavor:
330 instance_uuid = instance.uuid
331 with compute_utils.EventReporter(context, 'cold_migrate',
332 self.host, instance_uuid):
333 self._cold_migrate(context, instance, flavor,
334 scheduler_hint['filter_properties'],
335 clean_shutdown, request_spec,
336 host_list)
337 else:
338 raise NotImplementedError()
340 @staticmethod
341 def _get_request_spec_for_cold_migrate(context, instance, flavor,
342 filter_properties, request_spec):
343 # NOTE(sbauza): If a reschedule occurs during prep_resize(), then
344 # it only provides the legacy filter_properties dict back to the
345 # conductor with no RequestSpec part of the payload for <Stein
346 # computes.
347 # TODO(mriedem): We can remove this compat code for no request spec
348 # coming to conductor in ComputeTaskAPI RPC API version 2.0
349 if not request_spec:
350 image_meta = utils.get_image_from_system_metadata(
351 instance.system_metadata)
352 # Make sure we hydrate a new RequestSpec object with the new flavor
353 # and not the nested one from the instance
354 request_spec = objects.RequestSpec.from_components(
355 context, instance.uuid, image_meta,
356 flavor, instance.numa_topology, instance.pci_requests,
357 filter_properties, None, instance.availability_zone,
358 project_id=instance.project_id, user_id=instance.user_id)
359 elif not isinstance(request_spec, objects.RequestSpec):
360 # Prior to compute RPC API 5.1 conductor would pass a legacy dict
361 # version of the request spec to compute and Stein compute
362 # could be sending that back to conductor on reschedule, so if we
363 # got a dict convert it to an object.
364 # TODO(mriedem): We can drop this compat code when we only support
365 # compute RPC API >=6.0.
366 request_spec = objects.RequestSpec.from_primitives(
367 context, request_spec, filter_properties)
368 # We don't have to set the new flavor on the request spec because
369 # if we got here it was due to a reschedule from the compute and
370 # the request spec would already have the new flavor in it from the
371 # else block below.
372 else:
373 # NOTE(sbauza): A resize means a new flavor, so we need to update the
374 # original RequestSpec object to make sure the scheduler verifies
375 # the right one and not the original flavor
376 request_spec.flavor = flavor
377 return request_spec
379 def _cold_migrate(self, context, instance, flavor, filter_properties,
380 clean_shutdown, request_spec, host_list):
381 request_spec = self._get_request_spec_for_cold_migrate(
382 context, instance, flavor, filter_properties, request_spec)
384 task = self._build_cold_migrate_task(context, instance, flavor,
385 request_spec, clean_shutdown, host_list)
386 try:
387 task.execute()
388 except exception.NoValidHost as ex:
389 vm_state = instance.vm_state
390 if not vm_state: 390 ↛ 391 (line 390 didn't jump to line 391 because the condition on line 390 was never true)
391 vm_state = vm_states.ACTIVE
392 updates = {'vm_state': vm_state, 'task_state': None}
393 self._set_vm_state_and_notify(context, instance.uuid,
394 'migrate_server',
395 updates, ex, request_spec)
397 # if the flavor IDs match, it's migrate; otherwise resize
398 if flavor.id == instance.instance_type_id:
399 msg = _("No valid host found for cold migrate")
400 else:
401 msg = _("No valid host found for resize")
402 raise exception.NoValidHost(reason=msg)
403 except exception.UnsupportedPolicyException as ex:
404 with excutils.save_and_reraise_exception():
405 vm_state = instance.vm_state
406 if not vm_state: 406 ↛ 407 (line 406 didn't jump to line 407 because the condition on line 406 was never true)
407 vm_state = vm_states.ACTIVE
408 updates = {'vm_state': vm_state, 'task_state': None}
409 self._set_vm_state_and_notify(context, instance.uuid,
410 'migrate_server',
411 updates, ex, request_spec)
412 except Exception as ex:
413 with excutils.save_and_reraise_exception():
414 # Refresh the instance so we don't overwrite vm_state changes
415 # set after we executed the task.
416 try:
417 instance.refresh()
418 # Passing vm_state is kind of silly but it's expected in
419 # set_vm_state_and_notify.
420 updates = {'vm_state': instance.vm_state,
421 'task_state': None}
422 self._set_vm_state_and_notify(context, instance.uuid,
423 'migrate_server',
424 updates, ex, request_spec)
425 except exception.InstanceNotFound:
426 # We can't send the notification because the instance is
427 # gone so just log it.
428 LOG.info('During %s the instance was deleted.',
429 'resize' if instance.instance_type_id != flavor.id
430 else 'cold migrate', instance=instance)
431 # NOTE(sbauza): Make sure we persist the new flavor in case we had
432 # a successful scheduler call if and only if nothing bad happened
433 if request_spec.obj_what_changed(): 433 ↛ exit (line 433 didn't return from function '_cold_migrate' because the condition on line 433 was always true)
434 request_spec.save()
436 def _set_vm_state_and_notify(self, context, instance_uuid, method, updates,
437 ex, request_spec):
438 scheduler_utils.set_vm_state_and_notify(
439 context, instance_uuid, 'compute_task', method, updates,
440 ex, request_spec)
442 def _cleanup_allocated_networks(
443 self, context, instance, requested_networks):
444 try:
445 # If we were told not to allocate networks let's save ourselves
446 # the trouble of calling the network API.
447 if not (requested_networks and requested_networks.no_allocate):
448 self.network_api.deallocate_for_instance(
449 context, instance, requested_networks=requested_networks)
450 except Exception:
451 LOG.exception('Failed to deallocate networks', instance=instance)
452 return
454 instance.system_metadata['network_allocated'] = 'False'
455 try:
456 instance.save()
457 except exception.InstanceNotFound:
458 # NOTE: It's possible that we're cleaning up the networks
459 # because the instance was deleted. If that's the case then this
460 # exception will be raised by instance.save()
461 pass
463 @targets_cell
464 @wrap_instance_event(prefix='conductor')
465 def live_migrate_instance(self, context, instance, scheduler_hint,
466 block_migration, disk_over_commit, request_spec):
467 self._live_migrate(context, instance, scheduler_hint,
468 block_migration, disk_over_commit, request_spec)
470 def _live_migrate(self, context, instance, scheduler_hint,
471 block_migration, disk_over_commit, request_spec):
472 destination = scheduler_hint.get("host")
474 def _set_vm_state(context, instance, ex, vm_state=None,
475 task_state=None):
476 request_spec = {'instance_properties': {
477 'uuid': instance.uuid, },
478 }
479 scheduler_utils.set_vm_state_and_notify(context,
480 instance.uuid,
481 'compute_task', 'migrate_server',
482 dict(vm_state=vm_state,
483 task_state=task_state,
484 expected_task_state=task_states.MIGRATING,),
485 ex, request_spec)
487 migration = objects.Migration(context=context.elevated())
488 migration.dest_compute = destination
489 migration.status = 'accepted'
490 migration.instance_uuid = instance.uuid
491 migration.source_compute = instance.host
492 migration.migration_type = fields.MigrationType.LIVE_MIGRATION
493 if instance.obj_attr_is_set('flavor'):
494 migration.old_instance_type_id = instance.flavor.id
495 migration.new_instance_type_id = instance.flavor.id
496 else:
497 migration.old_instance_type_id = instance.instance_type_id
498 migration.new_instance_type_id = instance.instance_type_id
499 migration.create()
501 task = self._build_live_migrate_task(context, instance, destination,
502 block_migration, disk_over_commit,
503 migration, request_spec)
504 try:
505 task.execute()
506 except (exception.NoValidHost,
507 exception.ComputeHostNotFound,
508 exception.ComputeServiceUnavailable,
509 exception.InvalidHypervisorType,
510 exception.InvalidCPUInfo,
511 exception.UnableToMigrateToSelf,
512 exception.DestinationHypervisorTooOld,
513 exception.InvalidLocalStorage,
514 exception.InvalidSharedStorage,
515 exception.HypervisorUnavailable,
516 exception.InstanceInvalidState,
517 exception.MigrationPreCheckError,
518 exception.MigrationSchedulerRPCError) as ex:
519 with excutils.save_and_reraise_exception():
520 _set_vm_state(context, instance, ex, instance.vm_state)
521 migration.status = 'error'
522 migration.save()
523 except Exception as ex:
524 LOG.error('Migration of instance %(instance_id)s to host'
525 ' %(dest)s unexpectedly failed.',
526 {'instance_id': instance.uuid, 'dest': destination},
527 exc_info=True)
528 # Reset the task state to None to indicate completion of
529 # the operation as it is done in case of known exceptions.
530 _set_vm_state(context, instance, ex, vm_states.ERROR,
531 task_state=None)
532 migration.status = 'error'
533 migration.save()
534 raise exception.MigrationError(reason=str(ex))
536 def _build_live_migrate_task(self, context, instance, destination,
537 block_migration, disk_over_commit, migration,
538 request_spec=None):
539 return live_migrate.LiveMigrationTask(context, instance,
540 destination, block_migration,
541 disk_over_commit, migration,
542 self.compute_rpcapi,
543 self.servicegroup_api,
544 self.query_client,
545 self.report_client,
546 request_spec)
548 def _build_cold_migrate_task(self, context, instance, flavor, request_spec,
549 clean_shutdown, host_list):
550 return migrate.MigrationTask(context, instance, flavor,
551 request_spec, clean_shutdown,
552 self.compute_rpcapi,
553 self.query_client, self.report_client,
554 host_list, self.network_api)
556 def _destroy_build_request(self, context, instance):
557 # The BuildRequest needs to be stored until the instance is mapped to
558 # an instance table. At that point it will never be used again and
559 # should be deleted.
560 build_request = objects.BuildRequest.get_by_instance_uuid(
561 context, instance.uuid)
562 # TODO(alaski): Sync API updates of the build_request to the
563 # instance before it is destroyed. Right now only locked_by can
564 # be updated before this is destroyed.
565 build_request.destroy()
567 def _populate_instance_mapping(self, context, instance, host):
568 try:
569 inst_mapping = objects.InstanceMapping.get_by_instance_uuid(
570 context, instance.uuid)
571 except exception.InstanceMappingNotFound:
572 # NOTE(alaski): If nova-api is up to date this exception should
573 # never be hit. But during an upgrade it's possible that an old
574 # nova-api didn't create an instance_mapping during this boot
575 # request.
576 LOG.debug('Instance was not mapped to a cell, likely due '
577 'to an older nova-api service running.',
578 instance=instance)
579 return None
580 else:
581 try:
582 host_mapping = objects.HostMapping.get_by_host(context,
583 host.service_host)
584 except exception.HostMappingNotFound:
585 # NOTE(alaski): For now this exception means that a
586 # deployment has not migrated to cellsv2 and we should
587 # remove the instance_mapping that has been created.
588 # Eventually this will indicate a failure to properly map a
589 # host to a cell and we may want to reschedule.
590 inst_mapping.destroy()
591 return None
592 else:
593 inst_mapping.cell_mapping = host_mapping.cell_mapping
594 inst_mapping.save()
595 return inst_mapping
597 def _validate_existing_attachment_ids(self, context, instance, bdms):
598 """Ensure any attachment ids referenced by the bdms exist.
600 New attachments will only be created if the attachment ids referenced
601 by the bdms no longer exist. This can happen when an instance is
602 rescheduled after a failure to spawn as cleanup code on the previous
603 host will delete attachments before rescheduling.
604 """
605 for bdm in bdms:
606 if bdm.is_volume and bdm.attachment_id: 606 ↛ 605 (line 606 didn't jump to line 605 because the condition on line 606 was always true)
607 try:
608 self.volume_api.attachment_get(context, bdm.attachment_id)
609 except exception.VolumeAttachmentNotFound:
610 attachment = self.volume_api.attachment_create(
611 context, bdm.volume_id, instance.uuid)
612 bdm.attachment_id = attachment['id']
613 bdm.save()
615 def _cleanup_when_reschedule_fails(
616 self, context, instance, exception, legacy_request_spec,
617 requested_networks):
618 """Set the instance state and clean up.
620 It is only used in case build_instances fails while rescheduling the
621 instance.
622 """
624 updates = {'vm_state': vm_states.ERROR,
625 'task_state': None}
626 self._set_vm_state_and_notify(
627 context, instance.uuid, 'build_instances', updates, exception,
628 legacy_request_spec)
629 self._cleanup_allocated_networks(
630 context, instance, requested_networks)
632 arq_uuids = None
633 # arqs have not been bound to a port/instance yet
634 if requested_networks:
635 arq_uuids = [req.arq_uuid
636 for req in requested_networks if req.arq_uuid]
637 compute_utils.delete_arqs_if_needed(context, instance, arq_uuids)
639 # NOTE(danms): This is never cell-targeted because it is only used for
640 # n-cpu reschedules which go to the cell conductor and thus are always
641 # cell-specific.
642 def build_instances(self, context, instances, image, filter_properties,
643 admin_password, injected_files, requested_networks,
644 security_groups, block_device_mapping=None, legacy_bdm=True,
645 request_spec=None, host_lists=None):
646 # TODO(ndipanov): Remove block_device_mapping and legacy_bdm in version
647 # 2.0 of the RPC API.
648 # TODO(danms): Remove this in version 2.0 of the RPC API
649 if (requested_networks and 649 ↛ 652 (line 649 didn't jump to line 652 because the condition on line 649 was never true)
650 not isinstance(requested_networks,
651 objects.NetworkRequestList)):
652 requested_networks = objects.NetworkRequestList.from_tuples(
653 requested_networks)
654 # TODO(melwitt): Remove this in version 2.0 of the RPC API
655 flavor = filter_properties.get('instance_type')
656 if flavor and not isinstance(flavor, objects.Flavor): 656 ↛ 659 (line 656 didn't jump to line 659 because the condition on line 656 was never true)
657 # Code downstream may expect extra_specs to be populated since it
658 # is receiving an object, so lookup the flavor to ensure this.
659 flavor = objects.Flavor.get_by_id(context, flavor['id'])
660 filter_properties = dict(filter_properties, instance_type=flavor)
662 # Older computes will not send a request_spec during reschedules so we
663 # need to check and build our own if one is not provided.
664 if request_spec is None:
665 legacy_request_spec = scheduler_utils.build_request_spec(
666 image, instances)
667 else:
668 # TODO(mriedem): This is annoying but to populate the local
669 # request spec below using the filter_properties, we have to pass
670 # in a primitive version of the request spec. Yes it's inefficient
671 # and we can remove it once the populate_retry and
672 # populate_filter_properties utility methods are converted to
673 # work on a RequestSpec object rather than filter_properties.
674 # NOTE(gibi): we have to keep a reference to the original
675 # RequestSpec object passed to this function as we lose information
676 # during the below legacy conversion
677 legacy_request_spec = request_spec.to_legacy_request_spec_dict()
679 # 'host_lists' will be None during a reschedule from a pre-Queens
680 # compute. In all other cases, it will be a list of lists, though the
681 # lists may be empty if there are no more hosts left in a rescheduling
682 # situation.
683 is_reschedule = host_lists is not None
684 try:
685 # check retry policy. Rather ugly use of instances[0]...
686 # but if we've exceeded max retries... then we really only
687 # have a single instance.
688 # TODO(sbauza): Provide directly the RequestSpec object
689 # when populate_retry() accepts it
690 scheduler_utils.populate_retry(
691 filter_properties, instances[0].uuid)
692 instance_uuids = [instance.uuid for instance in instances]
693 spec_obj = objects.RequestSpec.from_primitives(
694 context, legacy_request_spec, filter_properties)
695 LOG.debug("Rescheduling: %s", is_reschedule)
696 if is_reschedule:
697 # Make sure that we have a host, as we may have exhausted all
698 # our alternates
699 if not host_lists[0]: 699 ↛ 702 (line 699 didn't jump to line 702 because the condition on line 699 was never true)
700 # We have an empty list of hosts, so this instance has
701 # failed to build.
702 msg = ("Exhausted all hosts available for retrying build "
703 "failures for instance %(instance_uuid)s." %
704 {"instance_uuid": instances[0].uuid})
705 raise exception.MaxRetriesExceeded(reason=msg)
706 else:
707 # This is not a reschedule, so we need to call the scheduler to
708 # get appropriate hosts for the request.
709 # NOTE(gibi): We only call the scheduler if we are rescheduling
710 # from a really old compute. In that case we do not support
711 # externally-defined resource requests, like port QoS. So no
712 # requested_resources are set on the RequestSpec here.
713 host_lists = self._schedule_instances(context, spec_obj,
714 instance_uuids, return_alternates=True)
715 except Exception as exc:
716 # NOTE(mriedem): If we're rescheduling from a failed build on a
717 # compute, "retry" will be set and num_attempts will be >1 because
718 # populate_retry above will increment it. If the server build was
719 # forced onto a host/node or [scheduler]/max_attempts=1, "retry"
720 # won't be in filter_properties and we won't get here because
721 # nova-compute will just abort the build since reschedules are
722 # disabled in those cases.
723 num_attempts = filter_properties.get(
724 'retry', {}).get('num_attempts', 1)
725 for instance in instances:
726 # If num_attempts > 1, we're in a reschedule and probably
727 # either hit NoValidHost or MaxRetriesExceeded. Either way,
728 # the build request should already be gone and we probably
729 # can't reach the API DB from the cell conductor.
730 if num_attempts <= 1:
731 try:
732 # If the BuildRequest stays around then instance
733 # show/lists will pull from it rather than the errored
734 # instance.
735 self._destroy_build_request(context, instance)
736 except exception.BuildRequestNotFound:
737 pass
738 self._cleanup_when_reschedule_fails(
739 context, instance, exc, legacy_request_spec,
740 requested_networks)
741 return
743 elevated = context.elevated()
744 for (instance, host_list) in zip(instances, host_lists):
745 host = host_list.pop(0)
746 if is_reschedule:
747 # If this runs in the superconductor, the first instance will
748 # already have its resources claimed in placement. If this is a
749 # retry, though, this is running in the cell conductor, and we
750 # need to claim first to ensure that the alternate host still
751 # has its resources available. Note that there are schedulers
752 # that don't support Placement, so we must assume that the host is
753 # still available.
754 host_available = False
755 while host and not host_available:
756 if host.allocation_request: 756 ↛ 759 (line 756 didn't jump to line 759 because the condition on line 756 was always true)
757 alloc_req = jsonutils.loads(host.allocation_request)
758 else:
759 alloc_req = None
760 if alloc_req: 760 ↛ 784 (line 760 didn't jump to line 784 because the condition on line 760 was always true)
761 try:
762 host_available = scheduler_utils.claim_resources(
763 elevated, self.report_client, spec_obj,
764 instance.uuid, alloc_req,
765 host.allocation_request_version)
766 if request_spec and host_available:
767 # NOTE(gibi): redo the request group - resource
768 # provider mapping as the above claim call
769 # moves the allocation of the instance to
770 # another host
771 scheduler_utils.fill_provider_mapping(
772 request_spec, host)
773 except Exception as exc:
774 self._cleanup_when_reschedule_fails(
775 context, instance, exc, legacy_request_spec,
776 requested_networks)
777 return
778 else:
779 # Some deployments use different schedulers that do not
780 # use Placement, so they will not have an
781 # allocation_request to claim with. For those cases,
782 # there is no concept of claiming, so just assume that
783 # the host is valid.
784 host_available = True
785 if not host_available:
786 # Insufficient resources remain on that host, so
787 # discard it and try the next.
788 host = host_list.pop(0) if host_list else None
789 if not host_available:
790 # No more available hosts for retrying the build.
791 msg = ("Exhausted all hosts available for retrying build "
792 "failures for instance %(instance_uuid)s." %
793 {"instance_uuid": instance.uuid})
794 exc = exception.MaxRetriesExceeded(reason=msg)
795 self._cleanup_when_reschedule_fails(
796 context, instance, exc, legacy_request_spec,
797 requested_networks)
798 return
800 # The availability_zone field was added in v1.1 of the Selection
801 # object so make sure to handle the case where it is missing.
802 if 'availability_zone' in host: 802 ↛ 803 (line 802 didn't jump to line 803 because the condition on line 802 was never true)
803 instance.availability_zone = host.availability_zone
804 else:
805 try:
806 instance.availability_zone = (
807 availability_zones.get_host_availability_zone(context,
808 host.service_host))
809 except Exception as exc:
810 # Put the instance into ERROR state, set task_state to
811 # None, inject a fault, etc.
812 self._cleanup_when_reschedule_fails(
813 context, instance, exc, legacy_request_spec,
814 requested_networks)
815 continue
817 try:
818 # NOTE(danms): This saves the az change above, refreshes our
819 # instance, and tells us if it has been deleted underneath us
820 instance.save()
821 except (exception.InstanceNotFound,
822 exception.InstanceInfoCacheNotFound):
823 LOG.debug('Instance deleted during build', instance=instance)
824 continue
825 local_filter_props = copy.deepcopy(filter_properties)
826 scheduler_utils.populate_filter_properties(local_filter_props,
827 host)
828 # Populate the request_spec with the local_filter_props information
829 # like retries and limits. Note that at this point the request_spec
830 # could have come from a compute via reschedule and it would
831 # already have some things set, like scheduler_hints.
832 local_reqspec = objects.RequestSpec.from_primitives(
833 context, legacy_request_spec, local_filter_props)
835 # NOTE(gibi): at this point the request spec already got converted
836 # to a legacy dict and then back to an object so we lost the non
837 # legacy part of the spec. Re-populate the requested_resources
838 # field based on the original request spec object passed to this
839 # function.
840 if request_spec:
841 local_reqspec.requested_resources = (
842 request_spec.requested_resources)
844 # The block_device_mapping passed from the api doesn't contain
845 # instance specific information
846 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
847 context, instance.uuid)
849 # This is populated in scheduler_utils.populate_retry
850 num_attempts = local_filter_props.get('retry',
851 {}).get('num_attempts', 1)
852 if num_attempts <= 1:
853 # If this is a reschedule the instance is already mapped to
854 # this cell and the BuildRequest is already deleted so ignore
855 # the logic below.
856 inst_mapping = self._populate_instance_mapping(context,
857 instance,
858 host)
859 try:
860 self._destroy_build_request(context, instance)
861 except exception.BuildRequestNotFound:
862 # This indicates an instance delete has been requested in
863 # the API. Stop the build, cleanup the instance_mapping and
864 # potentially the block_device_mappings
865 # TODO(alaski): Handle block_device_mapping cleanup
866 if inst_mapping:
867 inst_mapping.destroy()
868 return
869 else:
870 # NOTE(lyarwood): If this is a reschedule then recreate any
871 # attachments that were previously removed when cleaning up
872 # after failures to spawn etc.
873 self._validate_existing_attachment_ids(context, instance, bdms)
875 alts = [(alt.service_host, alt.nodename) for alt in host_list]
876 LOG.debug("Selected host: %s; Selected node: %s; Alternates: %s",
877 host.service_host, host.nodename, alts, instance=instance)
879 try:
880 accel_uuids = self._create_and_bind_arq_for_instance(
881 context, instance, host.nodename, local_reqspec,
882 requested_networks)
883 except Exception as exc:
884 LOG.exception('Failed to reschedule. Reason: %s', exc)
885 self._cleanup_when_reschedule_fails(
886 context, instance, exc, legacy_request_spec,
887 requested_networks)
888 continue
890 self.compute_rpcapi.build_and_run_instance(context,
891 instance=instance, host=host.service_host, image=image,
892 request_spec=local_reqspec,
893 filter_properties=local_filter_props,
894 admin_password=admin_password,
895 injected_files=injected_files,
896 requested_networks=requested_networks,
897 security_groups=security_groups,
898 block_device_mapping=bdms, node=host.nodename,
899 limits=host.limits, host_list=host_list,
900 accel_uuids=accel_uuids)
902 def _create_and_bind_arq_for_instance(
903 self, context, instance, hostname,
904 request_spec, requested_networks=None):
905 try:
906 resource_provider_mapping = (
907 request_spec.get_request_group_mapping())
908 # Using nodename instead of hostname. See:
909 # http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa
910 cyclient = cyborg.get_client(context)
911 bindings = {}
912 port_bindings = {}
914 # Create ARQs that come from extra specs.
915 bindings = self._create_and_bind_arqs(
916 cyclient, instance.uuid, instance.flavor.extra_specs,
917 hostname, resource_provider_mapping)
919 if requested_networks:
920 # Create ARQs that come from the port device profile
921 port_bindings = self._create_arqs_for_ports(
922 cyclient, instance.uuid, requested_networks,
923 hostname, resource_provider_mapping)
925 # Initiate Cyborg binding asynchronously
926 bindings.update(port_bindings)
927 if bindings:
928 cyclient.bind_arqs(bindings)
930 return list(bindings.keys())
932 except exception.AcceleratorRequestBindingFailed as exc:
933 # If anything failed here we need to cleanup and bail out.
934 cyclient = cyborg.get_client(context)
935 cyclient.delete_arqs_by_uuid(exc.arqs)
936 raise
938 def _schedule_instances(self, context, request_spec,
939 instance_uuids=None, return_alternates=False):
940 scheduler_utils.setup_instance_group(context, request_spec)
941 with timeutils.StopWatch() as timer:
942 host_lists = self.query_client.select_destinations(
943 context, request_spec, instance_uuids, return_objects=True,
944 return_alternates=return_alternates)
945 LOG.debug('Took %0.2f seconds to select destinations for %s '
946 'instance(s).', timer.elapsed(), len(instance_uuids))
947 return host_lists
949 @staticmethod
950 def _restrict_request_spec_to_cell(context, instance, request_spec):
951 """Sets RequestSpec.requested_destination.cell for the move operation
953 Move operations, e.g. evacuate and unshelve, must be restricted to the
954 cell in which the instance already exists, so this method is used to
955 target the RequestSpec, which is sent to the scheduler via the
956 _schedule_instances method, to the instance's current cell.
958 :param context: nova auth RequestContext
959 """
960 instance_mapping = \
961 objects.InstanceMapping.get_by_instance_uuid(
962 context, instance.uuid)
963 LOG.debug('Requesting cell %(cell)s during scheduling',
964 {'cell': instance_mapping.cell_mapping.identity},
965 instance=instance)
966 if ('requested_destination' in request_spec and
967 request_spec.requested_destination):
968 request_spec.requested_destination.cell = (
969 instance_mapping.cell_mapping)
970 else:
971 request_spec.requested_destination = (
972 objects.Destination(
973 cell=instance_mapping.cell_mapping))
975 def _recheck_quota(
976 self,
977 context: nova_context.RequestContext,
978 flavor: 'objects.Flavor',
979 request_spec: 'objects.RequestSpec',
980 orig_num_req: int,
981 project_id: ty.Optional[str] = None,
982 user_id: ty.Optional[str] = None
983 ) -> None:
984 # A quota "recheck" is a quota check that is performed *after* quota
985 # limited resources are consumed. It is meant to address race
986 # conditions where a request that was not over quota at the start of
987 # the request (before resources were allocated) becomes over quota
988 # after resources (like database rows or placement allocations) are
989 # created. An example of this would be a large number of requests for
990 # the same resource for the same project sent simultaneously.
991 if CONF.quota.recheck_quota:
992 # The orig_num_req is the number of instances requested, which is
993 # the delta that was quota checked before resources were allocated.
994 # This is only used for the exception message if the recheck fails
995 # for lack of enough quota.
996 compute_utils.check_num_instances_quota(
997 context, flavor, 0, 0, project_id=project_id,
998 user_id=user_id, orig_num_req=orig_num_req)
999 placement_limits.enforce_num_instances_and_flavor(
1000 context, project_id, flavor, request_spec.is_bfv, 0, 0)
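# Minimal sketch of the "recheck" idea described above, with made-up
# numbers and a plain helper standing in for the real quota checks: the
# initial check uses the requested delta, the recheck uses a delta of
# zero once the resources exist, which catches races between requests
# that each passed the initial check.
def _example_over_limit(current_usage, delta, limit):
    return current_usage + delta > limit

# usage=8, limit=10: two concurrent requests for 2 instances each can
# both pass the initial check (8 + 2 <= 10), but once both have created
# their rows usage is 12, so the recheck with delta=0 flags the overrun.
assert not _example_over_limit(8, 2, 10)    # initial check passes
assert _example_over_limit(12, 0, 10)       # recheck detects the race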
1002 # TODO(mriedem): Make request_spec required in ComputeTaskAPI RPC v2.0.
1003 @targets_cell
1004 def unshelve_instance(self, context, instance, request_spec=None):
1005 sys_meta = instance.system_metadata
1007 def safe_image_show(ctx, image_id):
1008 if image_id: 1008 ↛ 1011 (line 1008 didn't jump to line 1011 because the condition on line 1008 was always true)
1009 return self.image_api.get(ctx, image_id, show_deleted=False)
1010 else:
1011 raise exception.ImageNotFound(image_id='')
1013 if instance.vm_state == vm_states.SHELVED:
1014 instance.task_state = task_states.POWERING_ON
1015 instance.save(expected_task_state=task_states.UNSHELVING)
1016 self.compute_rpcapi.start_instance(context, instance)
1017 elif instance.vm_state == vm_states.SHELVED_OFFLOADED: 1017 ↛ 1155 (line 1017 didn't jump to line 1155 because the condition on line 1017 was always true)
1018 image = None
1019 image_id = sys_meta.get('shelved_image_id')
1020 # No need to check for image if image_id is None as
1021 # "shelved_image_id" key is not set for volume backed
1022 # instance during the shelve process
1023 if image_id:
1024 with compute_utils.EventReporter(
1025 context, 'get_image_info', self.host, instance.uuid):
1026 try:
1027 image = safe_image_show(context, image_id)
1028 except exception.ImageNotFound as error:
1029 instance.vm_state = vm_states.ERROR
1030 instance.save()
1032 reason = _('Unshelve attempted but the image %s '
1033 'cannot be found.') % image_id
1035 LOG.error(reason, instance=instance)
1036 compute_utils.add_instance_fault_from_exc(
1037 context, instance, error, sys.exc_info(),
1038 fault_message=reason)
1039 raise exception.UnshelveException(
1040 instance_id=instance.uuid, reason=reason)
1042 try:
1043 with compute_utils.EventReporter(context, 'schedule_instances',
1044 self.host, instance.uuid):
1045 # NOTE(sbauza): Force_hosts/nodes needs to be reset
1046 # if we want to make sure that the next destination
1047 # is not forced to be the original host
1048 request_spec.reset_forced_destinations()
1049 # TODO(sbauza): Provide directly the RequestSpec object
1050 # when populate_filter_properties accepts it
1051 filter_properties = request_spec.\
1052 to_legacy_filter_properties_dict()
1053 res_req, req_lvl_params = (
1054 self.network_api.get_requested_resource_for_instance(
1055 context, instance.uuid)
1056 )
1057 extra_specs = request_spec.flavor.extra_specs
1058 device_profile = extra_specs.get('accel:device_profile')
1059 res_req.extend(
1060 cyborg.get_device_profile_request_groups(
1061 context, device_profile) if device_profile else [])
1062 # NOTE(gibi): When other modules want to handle similar
1063 # non-nova resources then here we have to collect all
1064 # the external resource requests in a single list and
1065 # add them to the RequestSpec.
1066 request_spec.requested_resources = res_req
1067 request_spec.request_level_params = req_lvl_params
1069 # NOTE(gibi): as PCI devices are tracked in placement we
1070 # need to generate request groups from InstancePCIRequests.
1071 # This will append new RequestGroup objects to the
1072 # request_spec.requested_resources list if needed
1073 request_spec.generate_request_groups_from_pci_requests()
1075 # NOTE(cfriesen): Ensure that we restrict the scheduler to
1076 # the cell specified by the instance mapping.
1077 self._restrict_request_spec_to_cell(
1078 context, instance, request_spec)
1080 request_spec.ensure_project_and_user_id(instance)
1081 request_spec.ensure_network_information(instance)
1082 compute_utils.heal_reqspec_is_bfv(
1083 context, request_spec, instance)
1084 host_lists = self._schedule_instances(context,
1085 request_spec, [instance.uuid],
1086 return_alternates=False)
1088 # NOTE(melwitt): We recheck the quota after allocating the
1089 # resources in placement, to prevent users from allocating
1090 # more resources than their allowed quota in the event of a
1091 # race. This is configurable because it can be expensive if
1092 # strict quota limits are not required in a deployment.
1093 try:
1094 # Quota should only be checked for unshelve if
1095 # resources are being counted in placement. Legacy
1096 # quotas continue to consume resources while
1097 # SHELVED_OFFLOADED and will not allocate any new
1098 # resources during unshelve.
1099 if (CONF.quota.count_usage_from_placement or
1100 limit_utils.use_unified_limits()):
1101 self._recheck_quota(
1102 context, instance.flavor, request_spec, 0,
1103 project_id=instance.project_id,
1104 user_id=instance.user_id)
1105 except (exception.TooManyInstances,
1106 limit_exceptions.ProjectOverLimit):
1107 with excutils.save_and_reraise_exception():
1108 self.report_client.delete_allocation_for_instance(
1109 context, instance.uuid, force=True)
1111 host_list = host_lists[0]
1112 selection = host_list[0]
1113 scheduler_utils.populate_filter_properties(
1114 filter_properties, selection)
1115 (host, node) = (selection.service_host, selection.nodename)
1116 LOG.debug(
1117 "Scheduler selected host: %s, node:%s",
1118 host,
1119 node,
1120 instance=instance
1121 )
1122 instance.availability_zone = (
1123 availability_zones.get_host_availability_zone(
1124 context, host))
1126 scheduler_utils.fill_provider_mapping(
1127 request_spec, selection)
1129 # NOTE(brinzhang): For the unshelve operation we should
1130 # re-create and re-bind the arqs for the instance.
1131 accel_uuids = self._create_and_bind_arq_for_instance(
1132 context, instance, node, request_spec)
1133 self.compute_rpcapi.unshelve_instance(
1134 context, instance, host, request_spec, image=image,
1135 filter_properties=filter_properties, node=node,
1136 accel_uuids=accel_uuids)
1137 except (exception.NoValidHost,
1138 exception.UnsupportedPolicyException):
1139 instance.task_state = None
1140 instance.save()
1141 LOG.warning("No valid host found for unshelve instance",
1142 instance=instance)
1143 return
1144 except Exception as exc:
1145 if isinstance(exc, exception.AcceleratorRequestBindingFailed):
1146 cyclient = cyborg.get_client(context)
1147 cyclient.delete_arqs_by_uuid(exc.arqs)
1148 LOG.exception('Failed to unshelve. Reason: %s', exc)
1149 with excutils.save_and_reraise_exception():
1150 instance.task_state = None
1151 instance.save()
1152 LOG.error("Unshelve attempted but an error "
1153 "has occurred", instance=instance)
1154 else:
1155 LOG.error('Unshelve attempted but vm_state not SHELVED or '
1156 'SHELVED_OFFLOADED', instance=instance)
1157 instance.vm_state = vm_states.ERROR
1158 instance.save()
1159 return
1161 def _allocate_for_evacuate_dest_host(self, context, instance, host,
1162 request_spec=None):
1163 # The user is forcing the destination host and bypassing the
1164 # scheduler. We need to copy the source compute node
1165 # allocations in Placement to the destination compute node.
1166 # Normally select_destinations() in the scheduler would do this
1167 # for us, but when forcing the target host we don't call the
1168 # scheduler.
1169 source_node = None # This is used for error handling below.
1170 try:
1171 source_node = objects.ComputeNode.get_by_host_and_nodename(
1172 context, instance.host, instance.node)
1173 dest_node = (
1174 objects.ComputeNode.get_first_node_by_host_for_old_compat(
1175 context, host, use_slave=True))
1176 except exception.ComputeHostNotFound as ex:
1177 with excutils.save_and_reraise_exception():
1178 self._set_vm_state_and_notify(
1179 context, instance.uuid, 'rebuild_server',
1180 {'vm_state': instance.vm_state,
1181 'task_state': None}, ex, request_spec)
1182 if source_node:
1183 LOG.warning('Specified host %s for evacuate was not '
1184 'found.', host, instance=instance)
1185 else:
1186 LOG.warning('Source host %s and node %s for evacuate was '
1187 'not found.', instance.host, instance.node,
1188 instance=instance)
1190 try:
1191 scheduler_utils.claim_resources_on_destination(
1192 context, self.report_client, instance, source_node, dest_node)
1193 except exception.NoValidHost as ex:
1194 with excutils.save_and_reraise_exception():
1195 self._set_vm_state_and_notify(
1196 context, instance.uuid, 'rebuild_server',
1197 {'vm_state': instance.vm_state,
1198 'task_state': None}, ex, request_spec)
1199 LOG.warning('Specified host %s for evacuate is '
1200 'invalid.', host, instance=instance)
1202 # TODO(mriedem): Make request_spec required in ComputeTaskAPI RPC v2.0.
1203 @targets_cell
1204 def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
1205 injected_files, new_pass, orig_sys_metadata,
1206 bdms, recreate, on_shared_storage,
1207 preserve_ephemeral=False, host=None,
1208 request_spec=None, reimage_boot_volume=False,
1209 target_state=None):
1210 # recreate=True means the instance is being evacuated from a failed
1211 # host to a new destination host. The 'recreate' variable name is
1212 # confusing, so rename it to evacuate here at the top, which is simpler
1213 # than renaming a parameter in an RPC versioned method.
1214 evacuate = recreate
1216 # NOTE(efried): It would be nice if this were two separate events, one
1217 # for 'rebuild' and one for 'evacuate', but this is part of the API
1218 # now, so it would be nontrivial to change.
1219 with compute_utils.EventReporter(context, 'rebuild_server',
1220 self.host, instance.uuid):
1221 node = limits = None
1223 try:
1224 migration = objects.Migration.get_by_instance_and_status(
1225 context, instance.uuid, 'accepted')
1226 except exception.MigrationNotFoundByStatus:
1227 LOG.debug("No migration record for the rebuild/evacuate "
1228 "request.", instance=instance)
1229 migration = None
1231 # The host variable is passed in two cases:
1232 # 1. rebuild - the instance.host is passed to rebuild on the
1233 # same host and bypass the scheduler *unless* a new image
1234 # was specified
1235 # 2. evacuate with specified host and force=True - the specified
1236 # host is passed and is meant to bypass the scheduler.
1237 # NOTE(mriedem): This could be a lot more straight-forward if we
1238 # had separate methods for rebuild and evacuate...
1239 if host:
1240 # We only create a new allocation on the specified host if
1241 # we're doing an evacuate since that is a move operation.
1242 if host != instance.host: 1242 ↛ 1245 (line 1242 didn't jump to line 1245 because the condition on line 1242 was never true)
1243 # If a destination host is forced for evacuate, create
1244 # allocations against it in Placement.
1245 try:
1246 self._allocate_for_evacuate_dest_host(
1247 context, instance, host, request_spec)
1248 except exception.AllocationUpdateFailed as ex:
1249 with excutils.save_and_reraise_exception():
1250 if migration:
1251 migration.status = 'error'
1252 migration.save()
1253 # NOTE(efried): It would be nice if this were two
1254 # separate events, one for 'rebuild' and one for
1255 # 'evacuate', but this is part of the API now, so
1256 # it would be nontrivial to change.
1257 self._set_vm_state_and_notify(
1258 context,
1259 instance.uuid,
1260 'rebuild_server',
1261 {'vm_state': vm_states.ERROR,
1262 'task_state': None}, ex, request_spec)
1263 LOG.warning('Rebuild failed: %s',
1264 str(ex), instance=instance)
1265 except exception.NoValidHost:
1266 with excutils.save_and_reraise_exception():
1267 if migration:
1268 migration.status = 'error'
1269 migration.save()
1270 else:
1271 # At this point, the user is either:
1272 #
1273 # 1. Doing a rebuild on the same host (not evacuate) and
1274 # specified a new image.
1275 # 2. Evacuating and specified a host but are not forcing it.
1276 #
1277 # In either case, the API passes host=None but sets up the
1278 # RequestSpec.requested_destination field for the specified
1279 # host.
1280 if evacuate:
1281 # NOTE(sbauza): Augment the RequestSpec object by excluding
1282 # the source host to avoid the scheduler picking it
1283 request_spec.ignore_hosts = [instance.host]
1284 # NOTE(sbauza): Force_hosts/nodes needs to be reset
1285 # if we want to make sure that the next destination
1286 # is not forced to be the original host
1287 request_spec.reset_forced_destinations()
1288 res_req, req_lvl_params = (
1289 self.network_api.get_requested_resource_for_instance(
1290 context, instance.uuid)
1291 )
1292 extra_specs = request_spec.flavor.extra_specs
1293 device_profile = extra_specs.get('accel:device_profile')
1294 res_req.extend(
1295 cyborg.get_device_profile_request_groups(
1296 context, device_profile)
1297 if device_profile else [])
1298 # NOTE(gibi): When other modules want to handle similar
1299 # non-nova resources then here we have to collect all
1300 # the external resource requests in a single list and
1301 # add them to the RequestSpec.
1302 request_spec.requested_resources = res_req
1303 request_spec.request_level_params = req_lvl_params
1305 # NOTE(gibi): as PCI devices are tracked in placement we
1306 # need to generate request groups from InstancePCIRequests.
1307 # This will append new RequestGroup objects to the
1308 # request_spec.requested_resources list if needed
1309 request_spec.generate_request_groups_from_pci_requests()
1311 try:
1312 # if this is a rebuild of the instance on the same host with a
1313 # new image.
1314 if not evacuate and orig_image_ref != image_ref:
1315 self._validate_image_traits_for_rebuild(context,
1316 instance,
1317 image_ref)
1318 self._restrict_request_spec_to_cell(
1319 context, instance, request_spec)
1320 request_spec.ensure_project_and_user_id(instance)
1321 request_spec.ensure_network_information(instance)
1322 compute_utils.heal_reqspec_is_bfv(
1323 context, request_spec, instance)
1325 host_lists = self._schedule_instances(context,
1326 request_spec, [instance.uuid],
1327 return_alternates=False)
1328 host_list = host_lists[0]
1329 selection = host_list[0]
1330 host, node, limits = (selection.service_host,
1331 selection.nodename, selection.limits)
1333 if recreate:
1334 scheduler_utils.fill_provider_mapping(
1335 request_spec, selection)
1337 except (exception.NoValidHost,
1338 exception.UnsupportedPolicyException,
1339 exception.AllocationUpdateFailed,
1340 # the next two can come from fill_provider_mapping and
1341 # signal a software error.
1342 NotImplementedError,
1343 ValueError) as ex:
1344 if migration:
1345 migration.status = 'error'
1346 migration.save()
1347 # Rollback the image_ref if a new one was provided (this
1348 # only happens in the rebuild case, not evacuate).
1349 if orig_image_ref and orig_image_ref != image_ref: 1349 ↛ 1352 (line 1349 didn't jump to line 1352 because the condition on line 1349 was always true)
1350 instance.image_ref = orig_image_ref
1351 instance.save()
1352 with excutils.save_and_reraise_exception():
1353 # NOTE(efried): It would be nice if this were two
1354 # separate events, one for 'rebuild' and one for
1355 # 'evacuate', but this is part of the API now, so it
1356 # would be nontrivial to change.
1357 self._set_vm_state_and_notify(context, instance.uuid,
1358 'rebuild_server',
1359 {'vm_state': vm_states.ERROR,
1360 'task_state': None}, ex, request_spec)
1361 LOG.warning('Rebuild failed: %s',
1362 str(ex), instance=instance)
1364 compute_utils.notify_about_instance_usage(
1365 self.notifier, context, instance, "rebuild.scheduled")
1366 compute_utils.notify_about_instance_rebuild(
1367 context, instance, host,
1368 action=fields.NotificationAction.REBUILD_SCHEDULED,
1369 source=fields.NotificationSource.CONDUCTOR)
1371 instance.availability_zone = (
1372 availability_zones.get_host_availability_zone(
1373 context, host))
1374 accel_uuids = []
1375 try:
1376 if instance.flavor.extra_specs.get('accel:device_profile'):
1377 cyclient = cyborg.get_client(context)
1378 if evacuate:
1379 # NOTE(brinzhang): For evacuate operation we should
1380 # delete the bound arqs, then re-create and bind the
1381 # arqs for the instance.
1382 cyclient.delete_arqs_for_instance(instance.uuid)
1383 accel_uuids = self._create_and_bind_arq_for_instance(
1384 context, instance, node, request_spec)
1385 else:
1386 accel_uuids = cyclient.get_arq_uuids_for_instance(
1387 instance)
1388 except Exception as exc:
1389 if isinstance(exc, exception.AcceleratorRequestBindingFailed):
1390 cyclient = cyborg.get_client(context)
1391 cyclient.delete_arqs_by_uuid(exc.arqs)
1392 LOG.exception('Failed to rebuild. Reason: %s', exc)
1393 raise exc
1395 self.compute_rpcapi.rebuild_instance(
1396 context,
1397 instance=instance,
1398 new_pass=new_pass,
1399 injected_files=injected_files,
1400 image_ref=image_ref,
1401 orig_image_ref=orig_image_ref,
1402 orig_sys_metadata=orig_sys_metadata,
1403 bdms=bdms,
1404 recreate=evacuate,
1405 on_shared_storage=on_shared_storage,
1406 preserve_ephemeral=preserve_ephemeral,
1407 migration=migration,
1408 host=host,
1409 node=node,
1410 limits=limits,
1411 request_spec=request_spec,
1412 accel_uuids=accel_uuids,
1413 reimage_boot_volume=reimage_boot_volume,
1414 target_state=target_state)
1416 def _validate_image_traits_for_rebuild(self, context, instance, image_ref):
1417 """Validates that the traits specified in the image can be satisfied
1418 by the providers of the current allocations for the instance during
1419 rebuild of the instance. If the traits cannot be
1420 satisfied, fails the action by raising a NoValidHost exception.
1422 :raises: NoValidHost exception in case the traits on the providers
1423 of the allocated resources for the instance do not match
1424 the required traits on the image.
1425 """
1426 image_meta = objects.ImageMeta.from_image_ref(
1427 context, self.image_api, image_ref)
1428 if ('properties' not in image_meta or
1429 'traits_required' not in image_meta.properties or not
1430 image_meta.properties.traits_required):
1431 return
1433 image_traits = set(image_meta.properties.traits_required)
1435 # Check whether any of the image traits are forbidden by the flavor
1436 # traits; if so, raise an exception.
1437 extra_specs = instance.flavor.extra_specs
1438 forbidden_flavor_traits = set()
1439 for key, val in extra_specs.items():
1440 if key.startswith('trait'):
1441 # get the actual key.
1442 prefix, parsed_key = key.split(':', 1)
1443 if val == 'forbidden':
1444 forbidden_flavor_traits.add(parsed_key)
1446 forbidden_traits = image_traits & forbidden_flavor_traits
1448 if forbidden_traits:
1449 raise exception.NoValidHost(
1450 reason=_("Image traits are part of forbidden "
1451 "traits in flavor associated with the server. "
1452 "Either specify a different image during rebuild "
1453 "or create a new server with the specified image "
1454 "and a compatible flavor."))
1456 # If image traits are present, then validate against allocations.
1457 allocations = self.report_client.get_allocations_for_consumer(
1458 context, instance.uuid)
1459 instance_rp_uuids = list(allocations)
1461 # Get provider tree for the instance. We use the uuid of the host
1462 # on which the instance is rebuilding to get the provider tree.
1463 compute_node = objects.ComputeNode.get_by_host_and_nodename(
1464 context, instance.host, instance.node)
1466 # TODO(karimull): Call with a read-only version, when available.
1467 instance_rp_tree = (
1468 self.report_client.get_provider_tree_and_ensure_root(
1469 context, compute_node.uuid))
1471 traits_in_instance_rps = set()
1473 for rp_uuid in instance_rp_uuids:
1474 traits_in_instance_rps.update(
1475 instance_rp_tree.data(rp_uuid).traits)
1477 missing_traits = image_traits - traits_in_instance_rps
1479 if missing_traits:
1480 raise exception.NoValidHost(
1481 reason=_("Image traits cannot be "
1482 "satisfied by the current resource providers. "
1483 "Either specify a different image during rebuild "
1484 "or create a new server with the specified image."))
1486 # TODO(avolkov): move method to bdm
1487 @staticmethod
1488 def _volume_size(flavor, bdm):
1489 size = bdm.get('volume_size')
1490 # NOTE (ndipanov): inherit flavor size only for swap and ephemeral
1491 if (size is None and bdm.get('source_type') == 'blank' and
1492 bdm.get('destination_type') == 'local'):
1493 if bdm.get('guest_format') == 'swap':
1494 size = flavor.get('swap', 0)
1495 else:
1496 size = flavor.get('ephemeral_gb', 0)
1497 return size
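# [Illustrative sketch, editor addition, not part of the nova source] The
# size-inheritance rule above, restated with plain dicts standing in for the
# flavor and BDM objects; the values are invented for illustration:
#
#     flavor = {'swap': 1024, 'ephemeral_gb': 10}
#     bdm = {'volume_size': None, 'source_type': 'blank',
#            'destination_type': 'local', 'guest_format': 'swap'}
#     size = bdm.get('volume_size')
#     if (size is None and bdm.get('source_type') == 'blank' and
#             bdm.get('destination_type') == 'local'):
#         if bdm.get('guest_format') == 'swap':
#             size = flavor.get('swap', 0)          # -> 1024
#         else:
#             size = flavor.get('ephemeral_gb', 0)
#     assert size == 1024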
1499 def _create_block_device_mapping(self, cell, flavor, instance_uuid,
1500 block_device_mapping):
1501 """Create the BlockDeviceMapping objects in the db.
1503 This method makes a copy of the list in order to avoid using the same
1504 id field in case this is called for multiple instances.
1505 """
1506 LOG.debug("block_device_mapping %s", list(block_device_mapping),
1507 instance_uuid=instance_uuid)
1508 instance_block_device_mapping = copy.deepcopy(block_device_mapping)
1509 for bdm in instance_block_device_mapping:
1510 bdm.volume_size = self._volume_size(flavor, bdm)
1511 bdm.instance_uuid = instance_uuid
1512 with obj_target_cell(bdm, cell):
1513 bdm.update_or_create()
1514 return instance_block_device_mapping
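# [Illustrative sketch, editor addition, not part of the nova source] Why the
# deepcopy above matters: if the same BDM list were reused for several
# instances without copying, each pass would overwrite the shared objects'
# instance_uuid (and their db ids on create). A tiny stand-alone illustration
# with dicts rather than BlockDeviceMapping objects:
#
#     import copy
#     template = [{'volume_size': None, 'instance_uuid': None}]
#     per_instance = copy.deepcopy(template)
#     per_instance[0]['instance_uuid'] = 'uuid-1'
#     assert template[0]['instance_uuid'] is None   # template stays pristine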
1516 def _create_tags(self, context, instance_uuid, tags):
1517 """Create the Tags objects in the db."""
1518 if tags:
1519 tag_list = [tag.tag for tag in tags]
1520 instance_tags = objects.TagList.create(
1521 context, instance_uuid, tag_list)
1522 return instance_tags
1523 else:
1524 return tags
1526 def _create_instance_action_for_cell0(self, context, instance, exc):
1527 """Create a failed "create" instance action for the instance in cell0.
1529 :param context: nova auth RequestContext targeted at cell0
1530 :param instance: Instance object being buried in cell0
1531 :param exc: Exception that occurred which resulted in burial
1532 """
1533 # First create the action record.
1534 objects.InstanceAction.action_start(
1535 context, instance.uuid, instance_actions.CREATE, want_result=False)
1536 # Now create an event for that action record.
1537 event_name = 'conductor_schedule_and_build_instances'
1538 objects.InstanceActionEvent.event_start(
1539 context, instance.uuid, event_name, want_result=False,
1540 host=self.host)
1541 # And finish the event with the exception. Note that we expect this
1542 # method to be called from _bury_in_cell0, which is called from within
1543 # an exception handler, so sys.exc_info should return values; if not,
1544 # it's not the end of the world - this is best effort.
1545 objects.InstanceActionEvent.event_finish_with_failure(
1546 context, instance.uuid, event_name, exc_val=exc,
1547 exc_tb=sys.exc_info()[2], want_result=False)
1549 def _bury_in_cell0(self, context, request_spec, exc,
1550 build_requests=None, instances=None,
1551 block_device_mapping=None,
1552 tags=None):
1553 """Ensure all provided build_requests and instances end up in cell0.
1555 Cell0 is the fake cell we schedule dead instances to when we can't
1556 schedule them somewhere real. Requests that don't yet have instances
1557 will get a new instance, created in cell0. Instances that have not yet
1558 been created will be created in cell0. All build requests are destroyed
1559 after we're done. Failure to delete a build request will trigger the
1560 instance deletion, just like the happy path in
1561 schedule_and_build_instances() below.
1562 """
1563 try:
1564 cell0 = objects.CellMapping.get_by_uuid(
1565 context, objects.CellMapping.CELL0_UUID)
1566 except exception.CellMappingNotFound:
1567 # Not yet set up for cellsv2. Instances will need to be written
1568 # to the configured database. This will become a deployment
1569 # error in Ocata.
1570 LOG.error('No cell mapping found for cell0 while '
1571 'trying to record scheduling failure. '
1572 'Setup is incomplete.')
1573 return
1575 build_requests = build_requests or []
1576 instances = instances or []
1577 instances_by_uuid = {inst.uuid: inst for inst in instances}
1578 for build_request in build_requests:
1579 if build_request.instance_uuid not in instances_by_uuid:
1580 # This is an instance object with no matching db entry.
1581 instance = build_request.get_new_instance(context)
1582 instances_by_uuid[instance.uuid] = instance
1584 updates = {'vm_state': vm_states.ERROR, 'task_state': None}
1585 for instance in instances_by_uuid.values():
1587 inst_mapping = None
1588 try:
1589 # We don't need the cell0-targeted context here because the
1590 # instance mapping is in the API DB.
1591 inst_mapping = \
1592 objects.InstanceMapping.get_by_instance_uuid(
1593 context, instance.uuid)
1594 except exception.InstanceMappingNotFound:
1595 # The API created the instance mapping record so it should
1596 # definitely be here. Log an error but continue to create the
1597 # instance in the cell0 database.
1598 LOG.error('While burying instance in cell0, no instance '
1599 'mapping was found.', instance=instance)
1601 # Perform a final sanity check that the instance is not mapped
1602 # to some other cell already because of maybe some crazy
1603 # clustered message queue weirdness.
1604 if inst_mapping and inst_mapping.cell_mapping is not None:
1605 LOG.error('When attempting to bury instance in cell0, the '
1606 'instance is already mapped to cell %s. Ignoring '
1607 'bury in cell0 attempt.',
1608 inst_mapping.cell_mapping.identity,
1609 instance=instance)
1610 continue
1612 with obj_target_cell(instance, cell0) as cctxt:
1613 instance.create()
1614 if inst_mapping:
1615 inst_mapping.cell_mapping = cell0
1616 inst_mapping.save()
1618 # Record an instance action with a failed event.
1619 self._create_instance_action_for_cell0(
1620 cctxt, instance, exc)
1622 # NOTE(mnaser): In order to properly clean up volumes after
1623 # being buried in cell0, we need to store BDMs.
1624 if block_device_mapping:
1625 self._create_block_device_mapping(
1626 cell0, instance.flavor, instance.uuid,
1627 block_device_mapping)
1629 self._create_tags(cctxt, instance.uuid, tags)
1631 # Use the context targeted to cell0 here since the instance is
1632 # now in cell0.
1633 self._set_vm_state_and_notify(
1634 cctxt, instance.uuid, 'build_instances', updates,
1635 exc, request_spec)
1637 for build_request in build_requests:
1638 try:
1639 build_request.destroy()
1640 except exception.BuildRequestNotFound:
1641 # Instance was deleted before we finished scheduling
1642 inst = instances_by_uuid[build_request.instance_uuid]
1643 with obj_target_cell(inst, cell0):
1644 inst.destroy()
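# [Illustrative sketch, editor addition, not part of the nova source] The
# first half of _bury_in_cell0 merges build requests and already-created
# instances into one dict keyed by uuid, so every request ends up with exactly
# one instance to bury. A hedged stand-in using SimpleNamespace instead of
# nova objects (uuids are fake):
#
#     from types import SimpleNamespace
#     instances = [SimpleNamespace(uuid='a')]
#     build_requests = [SimpleNamespace(instance_uuid='a'),
#                       SimpleNamespace(instance_uuid='b')]
#     instances_by_uuid = {inst.uuid: inst for inst in instances}
#     for br in build_requests:
#         if br.instance_uuid not in instances_by_uuid:
#             # the real code calls build_request.get_new_instance(context)
#             instances_by_uuid[br.instance_uuid] = SimpleNamespace(
#                 uuid=br.instance_uuid)
#     assert set(instances_by_uuid) == {'a', 'b'}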
1646 def schedule_and_build_instances(self, context, build_requests,
1647 request_specs, image,
1648 admin_password, injected_files,
1649 requested_networks, block_device_mapping,
1650 tags=None):
1651 # Add all the UUIDs for the instances
1652 instance_uuids = [spec.instance_uuid for spec in request_specs]
1653 try:
1654 host_lists = self._schedule_instances(context, request_specs[0],
1655 instance_uuids, return_alternates=True)
1656 except Exception as exc:
1657 LOG.exception('Failed to schedule instances')
1658 self._bury_in_cell0(context, request_specs[0], exc,
1659 build_requests=build_requests,
1660 block_device_mapping=block_device_mapping,
1661 tags=tags)
1662 return
1664 host_mapping_cache = {}
1665 cell_mapping_cache = {}
1666 instances = []
1667 host_az = {} # host=az cache to optimize multi-create
1669 for (build_request, request_spec, host_list) in zip(
1670 build_requests, request_specs, host_lists):
1671 instance = build_request.get_new_instance(context)
1672 # host_list is a list of one or more Selection objects, the first
1673 # of which has been selected and its resources claimed.
1674 host = host_list[0]
1675 # Convert host from the scheduler into a cell record
1676 if host.service_host not in host_mapping_cache:
1677 try:
1678 host_mapping = objects.HostMapping.get_by_host(
1679 context, host.service_host)
1680 host_mapping_cache[host.service_host] = host_mapping
1681 except exception.HostMappingNotFound as exc:
1682 LOG.error('No host-to-cell mapping found for selected '
1683 'host %(host)s. Setup is incomplete.',
1684 {'host': host.service_host})
1685 self._bury_in_cell0(
1686 context, request_spec, exc,
1687 build_requests=[build_request], instances=[instance],
1688 block_device_mapping=block_device_mapping,
1689 tags=tags)
1690 # This is a placeholder in case the quota recheck fails.
1691 instances.append(None)
1692 continue
1693 else:
1694 host_mapping = host_mapping_cache[host.service_host]
1696 cell = host_mapping.cell_mapping
1698 # Before we create the instance, let's make one final check that
1699 # the build request is still around and wasn't deleted by the user
1700 # already.
1701 try:
1702 objects.BuildRequest.get_by_instance_uuid(
1703 context, instance.uuid)
1704 except exception.BuildRequestNotFound:
1705 # the build request is gone so we're done for this instance
1706 LOG.debug('While scheduling instance, the build request '
1707 'was already deleted.', instance=instance)
1708 # This is a placeholder in case the quota recheck fails.
1709 instances.append(None)
1710 # If the build request was deleted and the instance is not
1711 # going to be created, there is no point in leaving an orphan
1712 # instance mapping, so delete it.
1713 try:
1714 im = objects.InstanceMapping.get_by_instance_uuid(
1715 context, instance.uuid)
1716 im.destroy()
1717 except exception.InstanceMappingNotFound:
1718 pass
1719 self.report_client.delete_allocation_for_instance(
1720 context, instance.uuid, force=True)
1721 continue
1722 else:
1723 if host.service_host not in host_az:
1724 host_az[host.service_host] = (
1725 availability_zones.get_host_availability_zone(
1726 context, host.service_host))
1727 instance.availability_zone = host_az[host.service_host]
1728 with obj_target_cell(instance, cell):
1729 instance.create()
1730 instances.append(instance)
1731 cell_mapping_cache[instance.uuid] = cell
1733 # NOTE(melwitt): We recheck the quota after allocating the
1734 # resources to prevent users from allocating more resources
1735 # than their allowed quota in the event of a race. This is
1736 # configurable because it can be expensive if strict quota
1737 # limits are not required in a deployment.
1738 try:
1739 self._recheck_quota(context, instance.flavor, request_specs[0],
1740 len(build_requests), project_id=instance.project_id,
1741 user_id=instance.user_id
1742 )
1743 except (exception.TooManyInstances,
1744 limit_exceptions.ProjectOverLimit) as exc:
1745 with excutils.save_and_reraise_exception():
1746 self._cleanup_build_artifacts(
1747 context, exc, instances, build_requests, request_specs,
1748 block_device_mapping, tags, cell_mapping_cache)
1750 zipped = zip(build_requests, request_specs, host_lists, instances)
1751 for (build_request, request_spec, host_list, instance) in zipped:
1752 if instance is None:
1753 # Skip placeholders that were buried in cell0 or had their
1754 # build requests deleted by the user before instance create.
1755 continue
1756 cell = cell_mapping_cache[instance.uuid]
1757 # host_list is a list of one or more Selection objects, the first
1758 # of which has been selected and its resources claimed.
1759 host = host_list.pop(0)
1760 alts = [(alt.service_host, alt.nodename) for alt in host_list]
1761 LOG.debug("Selected host: %s; Selected node: %s; Alternates: %s",
1762 host.service_host, host.nodename, alts, instance=instance)
1763 filter_props = request_spec.to_legacy_filter_properties_dict()
1764 scheduler_utils.populate_retry(filter_props, instance.uuid)
1765 scheduler_utils.populate_filter_properties(filter_props,
1766 host)
1768 # Now that we have a selected host (which has claimed resource
1769 # allocations in the scheduler) for this instance, we may need to
1770 # map allocations to resource providers in the request spec.
1771 try:
1772 scheduler_utils.fill_provider_mapping(request_spec, host)
1773 except Exception as exc:
1774 # If anything failed here we need to cleanup and bail out.
1775 with excutils.save_and_reraise_exception():
1776 self._cleanup_build_artifacts(
1777 context, exc, instances, build_requests, request_specs,
1778 block_device_mapping, tags, cell_mapping_cache)
1780 # TODO(melwitt): Maybe we should set_target_cell on the contexts
1781 # once we map to a cell, and remove these separate with statements.
1782 with obj_target_cell(instance, cell) as cctxt:
1783 # send a state update notification for the initial create to
1784 # show it going from non-existent to BUILDING
1785 # This can lazy-load attributes on instance.
1786 notifications.send_update_with_states(cctxt, instance, None,
1787 vm_states.BUILDING, None, None, service="conductor")
1788 objects.InstanceAction.action_start(
1789 cctxt, instance.uuid, instance_actions.CREATE,
1790 want_result=False)
1791 instance_bdms = self._create_block_device_mapping(
1792 cell, instance.flavor, instance.uuid, block_device_mapping)
1793 instance_tags = self._create_tags(cctxt, instance.uuid, tags)
1795 # TODO(Kevin Zheng): clean this up once instance.create() handles
1796 # tags; we do this so the instance.create notification in
1797 # build_and_run_instance in nova-compute doesn't lazy-load tags
1798 instance.tags = instance_tags if instance_tags \
1799 else objects.TagList()
1801 # Update mapping for instance.
1802 self._map_instance_to_cell(context, instance, cell)
1804 if not self._delete_build_request(
1805 context, build_request, instance, cell, instance_bdms,
1806 instance_tags):
1807 # The build request was deleted before/during scheduling so
1808 # the instance is gone and we don't have anything to build for
1809 # this one.
1810 continue
1812 try:
1813 accel_uuids = self._create_and_bind_arq_for_instance(
1814 context, instance, host.nodename, request_spec,
1815 requested_networks)
1816 except Exception as exc:
1817 with excutils.save_and_reraise_exception():
1818 self._cleanup_build_artifacts(
1819 context, exc, instances, build_requests, request_specs,
1820 block_device_mapping, tags, cell_mapping_cache)
1822 # NOTE(danms): Compute RPC expects security group names or ids
1823 # not objects, so convert this to a list of names until we can
1824 # pass the objects.
1825 legacy_secgroups = [s.identifier
1826 for s in request_spec.security_groups]
1827 with obj_target_cell(instance, cell) as cctxt:
1828 self.compute_rpcapi.build_and_run_instance(
1829 cctxt, instance=instance, image=image,
1830 request_spec=request_spec,
1831 filter_properties=filter_props,
1832 admin_password=admin_password,
1833 injected_files=injected_files,
1834 requested_networks=requested_networks,
1835 security_groups=legacy_secgroups,
1836 block_device_mapping=instance_bdms,
1837 host=host.service_host, node=host.nodename,
1838 limits=host.limits, host_list=host_list,
1839 accel_uuids=accel_uuids)
1841 def _create_and_bind_arqs(
1842 self, cyclient, instance_uuid, extra_specs,
1843 hostname, resource_provider_mapping):
1844 """Create ARQs comes from extra specs, determine their RPs.
1846 The binding is asynchronous; Cyborg will notify on completion.
1847 The notification will be handled in the compute manager.
1848 """
1849 arqs = []
1850 bindings = {}
1851 dp_name = extra_specs.get('accel:device_profile')
1853 # profiles from request spec: Create ARQ and binding
1854 if not dp_name:
1855 # empty arq list and binding info
1856 return bindings
1858 LOG.debug('Calling Cyborg to get ARQs. dp_name=%s instance=%s',
1859 dp_name, instance_uuid)
1860 arqs = cyclient.create_arqs_and_match_resource_providers(
1861 dp_name, resource_provider_mapping)
1862 LOG.debug('Got ARQs with resource provider mapping %s', arqs)
1863 bindings = {
1864 arq['uuid']:
1865 {"hostname": hostname,
1866 "device_rp_uuid": arq['device_rp_uuid'],
1867 "instance_uuid": instance_uuid
1868 }
1869 for arq in arqs}
1871 return bindings
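# [Illustrative sketch, editor addition, not part of the nova source] Shape of
# the bindings dict returned above: each ARQ uuid maps to the host, device
# resource provider and instance it should be bound to. The literals below
# are invented placeholders, not real uuids:
#
#     arqs = [{'uuid': 'arq-1', 'device_rp_uuid': 'rp-1'}]
#     hostname, instance_uuid = 'compute-0', 'inst-1'
#     bindings = {
#         arq['uuid']: {'hostname': hostname,
#                       'device_rp_uuid': arq['device_rp_uuid'],
#                       'instance_uuid': instance_uuid}
#         for arq in arqs}
#     # -> {'arq-1': {'hostname': 'compute-0', 'device_rp_uuid': 'rp-1',
#     #               'instance_uuid': 'inst-1'}}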
1873 def _create_arqs_for_ports(self, cyclient, instance_uuid,
1874 requested_networks,
1875 hostname, resource_provider_mapping):
1876 """Create ARQs for port with backend device profile.
1878 The binding is asynchronous; Cyborg will notify on completion.
1879 The notification will be handled in the compute manager.
1880 """
1881 bindings = {}
1883 for request_net in requested_networks:
1884 if request_net.port_id and request_net.device_profile:
1885 device_profile = request_net.device_profile
1886 # the port doesn't support multiple devices
1887 arqs = cyclient.create_arqs(device_profile)
1888 if len(arqs) > 1:
1889 raise exception.AcceleratorRequestOpFailed(
1890 op=_('create'),
1891 msg='the port does not support multiple devices.')
1892 arq = arqs[0]
1894 LOG.debug("Create ARQ %s for port %s of instance %s",
1895 arq["uuid"], request_net.port_id, instance_uuid)
1896 request_net.arq_uuid = arq["uuid"]
1898 rp_uuid = cyclient.get_arq_device_rp_uuid(
1899 arq,
1900 resource_provider_mapping,
1901 request_net.port_id)
1903 arq_binding = {request_net.arq_uuid:
1904 {"hostname": hostname,
1905 "device_rp_uuid": rp_uuid,
1906 "instance_uuid": instance_uuid}
1907 }
1908 LOG.debug("ARQ %s binding: %s", request_net.arq_uuid,
1909 arq_binding)
1910 bindings.update(arq_binding)
1912 return bindings
1914 @staticmethod
1915 def _map_instance_to_cell(context, instance, cell):
1916 """Update the instance mapping to point at the given cell.
1918 During initial scheduling, once a host and cell are selected in which
1919 to build the instance, this method is used to update the instance
1920 mapping to point at that cell.
1922 :param context: nova auth RequestContext
1923 :param instance: Instance object being built
1924 :param cell: CellMapping representing the cell in which the instance
1925 was created and is being built.
1926 :returns: InstanceMapping object that was updated.
1927 """
1928 inst_mapping = objects.InstanceMapping.get_by_instance_uuid(
1929 context, instance.uuid)
1930 # Perform a final sanity check that the instance is not mapped
1931 # to some other cell already because of maybe some crazy
1932 # clustered message queue weirdness.
1933 if inst_mapping.cell_mapping is not None:
1934 LOG.error('During scheduling instance is already mapped to '
1935 'another cell: %s. This should not happen and is an '
1936 'indication of bigger problems. If you see this you '
1937 'should report it to the nova team. Overwriting '
1938 'the mapping to point at cell %s.',
1939 inst_mapping.cell_mapping.identity, cell.identity,
1940 instance=instance)
1941 inst_mapping.cell_mapping = cell
1942 inst_mapping.save()
1943 return inst_mapping
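# [Illustrative sketch, editor addition, not part of the nova source] The
# mapping update above is effectively "point the API-DB record for this
# instance at the chosen cell, warning if it was already pointed somewhere
# else". A toy stand-in using a dict as the 'API DB' (names are hypothetical):
#
#     instance_mappings = {'inst-1': None}     # uuid -> cell identity
#     def map_instance_to_cell(uuid, cell):
#         if instance_mappings[uuid] is not None:
#             print('already mapped to %s, overwriting' %
#                   instance_mappings[uuid])
#         instance_mappings[uuid] = cell
#     map_instance_to_cell('inst-1', 'cell1')
#     assert instance_mappings['inst-1'] == 'cell1'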
1945 def _cleanup_build_artifacts(self, context, exc, instances, build_requests,
1946 request_specs, block_device_mappings, tags,
1947 cell_mapping_cache):
1948 for (instance, build_request, request_spec) in zip(
1949 instances, build_requests, request_specs):
1950 # Skip placeholders that were buried in cell0 or had their
1951 # build requests deleted by the user before instance create.
1952 if instance is None:
1953 continue
1954 updates = {'vm_state': vm_states.ERROR, 'task_state': None}
1955 cell = cell_mapping_cache[instance.uuid]
1956 with try_target_cell(context, cell) as cctxt:
1957 self._set_vm_state_and_notify(cctxt, instance.uuid,
1958 'build_instances', updates, exc,
1959 request_spec)
1961 # In order to properly clean up volumes when deleting a server in
1962 # ERROR status with no host, we need to store BDMs in the same
1963 # cell.
1964 if block_device_mappings:
1965 self._create_block_device_mapping(
1966 cell, instance.flavor, instance.uuid,
1967 block_device_mappings)
1969 # Like BDMs, the server tags provided by the user when creating the
1970 # server should be persisted in the same cell so they can be shown
1971 # from the API.
1972 if tags:
1973 with nova_context.target_cell(context, cell) as cctxt:
1974 self._create_tags(cctxt, instance.uuid, tags)
1976 # NOTE(mdbooth): To avoid an incomplete instance record being
1977 # returned by the API, the instance mapping must be
1978 # created after the instance record is complete in
1979 # the cell, and before the build request is
1980 # destroyed.
1981 # TODO(mnaser): The cell mapping should already be populated by
1982 # this point to avoid setting it below here.
1983 inst_mapping = objects.InstanceMapping.get_by_instance_uuid(
1984 context, instance.uuid)
1985 inst_mapping.cell_mapping = cell
1986 inst_mapping.save()
1988 # Be paranoid about artifacts being deleted underneath us.
1989 try:
1990 build_request.destroy()
1991 except exception.BuildRequestNotFound:
1992 pass
1993 try:
1994 request_spec.destroy()
1995 except exception.RequestSpecNotFound:
1996 pass
1998 def _delete_build_request(self, context, build_request, instance, cell,
1999 instance_bdms, instance_tags):
2000 """Delete a build request after creating the instance in the cell.
2002 This method handles cleaning up the instance in case the build request
2003 is already deleted by the time we try to delete it.
2005 :param context: the context of the request being handled
2006 :type context: nova.context.RequestContext
2007 :param build_request: the build request to delete
2008 :type build_request: nova.objects.BuildRequest
2009 :param instance: the instance created from the build_request
2010 :type instance: nova.objects.Instance
2011 :param cell: the cell in which the instance was created
2012 :type cell: nova.objects.CellMapping
2013 :param instance_bdms: list of block device mappings for the instance
2014 :type instance_bdms: nova.objects.BlockDeviceMappingList
2015 :param instance_tags: list of tags for the instance
2016 :type instance_tags: nova.objects.TagList
2017 :returns: True if the build request was successfully deleted, False if
2018 the build request was already deleted and the instance is now gone.
2019 """
2020 try:
2021 build_request.destroy()
2022 except exception.BuildRequestNotFound:
2023 # This indicates an instance deletion request has been
2024 # processed, and the build should halt here. Clean up the
2025 # bdm, tags and instance record.
2026 with obj_target_cell(instance, cell) as cctxt:
2027 with compute_utils.notify_about_instance_delete(
2028 self.notifier, cctxt, instance,
2029 source=fields.NotificationSource.CONDUCTOR):
2030 try:
2031 instance.destroy()
2032 except exception.InstanceNotFound:
2033 pass
2034 except exception.ObjectActionError:
2035 # NOTE(melwitt): Instance became scheduled during
2036 # the destroy, "host changed". Refresh and re-destroy.
2037 try:
2038 instance.refresh()
2039 instance.destroy()
2040 except exception.InstanceNotFound:
2041 pass
2042 for bdm in instance_bdms:
2043 with obj_target_cell(bdm, cell):
2044 try:
2045 bdm.destroy()
2046 except exception.ObjectActionError:
2047 pass
2048 if instance_tags:
2049 with try_target_cell(context, cell) as target_ctxt:
2050 try:
2051 objects.TagList.destroy(target_ctxt, instance.uuid)
2052 except exception.InstanceNotFound:
2053 pass
2054 return False
2055 return True
2057 def cache_images(self, context, aggregate, image_ids):
2058 """Cache a set of images on the set of hosts in an aggregate.
2060 :param context: The RequestContext
2061 :param aggregate: The Aggregate object from the request to constrain
2062 the host list
2063 :param image_ids: The IDs of the images to cache
2064 """
2066 # TODO(mriedem): Consider including the list of images in the
2067 # notification payload.
2068 compute_utils.notify_about_aggregate_action(
2069 context, aggregate,
2070 fields.NotificationAction.IMAGE_CACHE,
2071 fields.NotificationPhase.START)
2073 clock = timeutils.StopWatch()
2074 threads = CONF.image_cache.precache_concurrency
2075 fetch_pool = eventlet.GreenPool(size=threads)
2077 hosts_by_cell = {}
2078 cells_by_uuid = {}
2079 # TODO(danms): Make this a much more efficient bulk query
2080 for hostname in aggregate.hosts:
2081 hmap = objects.HostMapping.get_by_host(context, hostname)
2082 cells_by_uuid.setdefault(hmap.cell_mapping.uuid, hmap.cell_mapping)
2083 hosts_by_cell.setdefault(hmap.cell_mapping.uuid, [])
2084 hosts_by_cell[hmap.cell_mapping.uuid].append(hostname)
2086 LOG.info('Preparing to request pre-caching of image(s) %(image_ids)s '
2087 'on %(hosts)i hosts across %(cells)i cells.',
2088 {'image_ids': ','.join(image_ids),
2089 'hosts': len(aggregate.hosts),
2090 'cells': len(hosts_by_cell)})
2091 clock.start()
2093 stats = collections.defaultdict(lambda: (0, 0, 0, 0))
2094 failed_images = collections.defaultdict(int)
2095 down_hosts = set()
2096 host_stats = {
2097 'completed': 0,
2098 'total': len(aggregate.hosts),
2099 }
2101 def host_completed(context, host, result):
2102 for image_id, status in result.items():
2103 cached, existing, error, unsupported = stats[image_id]
2104 if status == 'error':
2105 failed_images[image_id] += 1
2106 error += 1
2107 elif status == 'cached':
2108 cached += 1
2109 elif status == 'existing':
2110 existing += 1
2111 elif status == 'unsupported':
2112 unsupported += 1
2113 stats[image_id] = (cached, existing, error, unsupported)
2115 host_stats['completed'] += 1
2116 compute_utils.notify_about_aggregate_cache(context, aggregate,
2117 host, result,
2118 host_stats['completed'],
2119 host_stats['total'])
2121 def wrap_cache_images(ctxt, host, image_ids):
2122 result = self.compute_rpcapi.cache_images(
2123 ctxt,
2124 host=host,
2125 image_ids=image_ids)
2126 host_completed(context, host, result)
2128 def skipped_host(context, host, image_ids):
2129 result = {image: 'skipped' for image in image_ids}
2130 host_completed(context, host, result)
2132 for cell_uuid, hosts in hosts_by_cell.items():
2133 cell = cells_by_uuid[cell_uuid]
2134 with nova_context.target_cell(context, cell) as target_ctxt:
2135 for host in hosts:
2136 service = objects.Service.get_by_compute_host(target_ctxt,
2137 host)
2138 if not self.servicegroup_api.service_is_up(service):
2139 down_hosts.add(host)
2140 LOG.info(
2141 'Skipping image pre-cache request to compute '
2142 '%(host)r because it is not up',
2143 {'host': host})
2144 skipped_host(target_ctxt, host, image_ids)
2145 continue
2147 utils.pass_context(fetch_pool.spawn_n, wrap_cache_images,
2148 target_ctxt, host, image_ids)
2150 # Wait until all those things finish
2151 fetch_pool.waitall()
2153 overall_stats = {'cached': 0, 'existing': 0, 'error': 0,
2154 'unsupported': 0}
2155 for cached, existing, error, unsupported in stats.values():
2156 overall_stats['cached'] += cached
2157 overall_stats['existing'] += existing
2158 overall_stats['error'] += error
2159 overall_stats['unsupported'] += unsupported
2161 clock.stop()
2162 LOG.info('Image pre-cache operation for image(s) %(image_ids)s '
2163 'completed in %(time).2f seconds; '
2164 '%(cached)i cached, %(existing)i existing, %(error)i errors, '
2165 '%(unsupported)i unsupported, %(skipped)i skipped (down) '
2166 'hosts',
2167 {'image_ids': ','.join(image_ids),
2168 'time': clock.elapsed(),
2169 'cached': overall_stats['cached'],
2170 'existing': overall_stats['existing'],
2171 'error': overall_stats['error'],
2172 'unsupported': overall_stats['unsupported'],
2173 'skipped': len(down_hosts),
2174 })
2175 # Log error'd images specifically at warning level
2176 for image_id, fails in failed_images.items():
2177 LOG.warning('Image pre-cache operation for image %(image)s '
2178 'failed %(fails)i times',
2179 {'image': image_id,
2180 'fails': fails})
2182 compute_utils.notify_about_aggregate_action(
2183 context, aggregate,
2184 fields.NotificationAction.IMAGE_CACHE,
2185 fields.NotificationPhase.END)
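# [Illustrative sketch, editor addition, not part of the nova source] A
# condensed, standalone version of the per-image statistics aggregation used
# by cache_images above; the host results are invented and there is no RPC or
# green-thread fan-out here, only the counting logic:
#
#     import collections
#     stats = collections.defaultdict(lambda: (0, 0, 0, 0))
#     host_results = [{'img-1': 'cached'}, {'img-1': 'existing'},
#                     {'img-1': 'error'}]
#     for result in host_results:
#         for image_id, status in result.items():
#             cached, existing, error, unsupported = stats[image_id]
#             if status == 'cached':
#                 cached += 1
#             elif status == 'existing':
#                 existing += 1
#             elif status == 'error':
#                 error += 1
#             elif status == 'unsupported':
#                 unsupported += 1
#             stats[image_id] = (cached, existing, error, unsupported)
#     assert stats['img-1'] == (1, 1, 1, 0)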
2187 @targets_cell
2188 @wrap_instance_event(prefix='conductor')
2189 def confirm_snapshot_based_resize(self, context, instance, migration):
2190 """Executes the ConfirmResizeTask
2192 :param context: nova auth request context targeted at the target cell
2193 :param instance: Instance object in "resized" status from the target
2194 cell
2195 :param migration: Migration object from the target cell for the resize
2196 operation expected to have status "confirming"
2197 """
2198 task = cross_cell_migrate.ConfirmResizeTask(
2199 context, instance, migration, self.notifier, self.compute_rpcapi)
2200 task.execute()
2202 @targets_cell
2203 # NOTE(mriedem): Upon successful completion of RevertResizeTask the
2204 # instance is hard-deleted, along with its instance action record(s), from
2205 # the target cell database so EventReporter hits InstanceActionNotFound on
2206 # __exit__. Pass graceful_exit=True to avoid an ugly traceback.
2207 @wrap_instance_event(prefix='conductor', graceful_exit=True)
2208 def revert_snapshot_based_resize(self, context, instance, migration):
2209 """Executes the RevertResizeTask
2211 :param context: nova auth request context targeted at the target cell
2212 :param instance: Instance object in "resized" status from the target
2213 cell
2214 :param migration: Migration object from the target cell for the resize
2215 operation expected to have status "reverting"
2216 """
2217 task = cross_cell_migrate.RevertResizeTask(
2218 context, instance, migration, self.notifier, self.compute_rpcapi)
2219 task.execute()