Coverage for nova/compute/manager.py: 93%
4979 statements
coverage.py v7.6.12, created at 2025-04-24 11:16 +0000
1# Copyright 2010 United States Government as represented by the
2# Administrator of the National Aeronautics and Space Administration.
3# Copyright 2011 Justin Santa Barbara
4# All Rights Reserved.
5#
6# Licensed under the Apache License, Version 2.0 (the "License"); you may
7# not use this file except in compliance with the License. You may obtain
8# a copy of the License at
9#
10# http://www.apache.org/licenses/LICENSE-2.0
11#
12# Unless required by applicable law or agreed to in writing, software
13# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
15# License for the specific language governing permissions and limitations
16# under the License.
18"""Handles all processes relating to instances (guest vms).
20The :py:class:`ComputeManager` class is a :py:class:`nova.manager.Manager` that
21handles RPC calls relating to creating instances. It is responsible for
22building a disk image, launching it via the underlying virtualization driver,
23responding to calls to check its state, attaching persistent storage, and
24terminating it.
26"""
28import base64
29import binascii
30import contextlib
31import copy
32import functools
33import inspect
34import math
35import sys
36import time
37import traceback
38import typing as ty
40from cinderclient import exceptions as cinder_exception
41from cursive import exception as cursive_exception
42import eventlet.event
43from eventlet import greenthread
44import eventlet.semaphore
45import eventlet.timeout
46import futurist
47from keystoneauth1 import exceptions as keystone_exception
48from openstack import exceptions as sdk_exc
49import os_traits
50from oslo_log import log as logging
51import oslo_messaging as messaging
52from oslo_serialization import jsonutils
53from oslo_service import loopingcall
54from oslo_service import periodic_task
55from oslo_utils import excutils
56from oslo_utils import strutils
57from oslo_utils import timeutils
58from oslo_utils import units
60from nova.accelerator import cyborg
61from nova import block_device
62from nova.compute import api as compute
63from nova.compute import build_results
64from nova.compute import claims
65from nova.compute import power_state
66from nova.compute import resource_tracker
67from nova.compute import rpcapi as compute_rpcapi
68from nova.compute import task_states
69from nova.compute import utils as compute_utils
70from nova.compute.utils import wrap_instance_event
71from nova.compute import vm_states
72from nova import conductor
73import nova.conf
74import nova.context
75from nova import exception
76from nova import exception_wrapper
77from nova.i18n import _
78from nova.image import glance
79from nova import manager
80from nova.network import model as network_model
81from nova.network import neutron
82from nova import objects
83from nova.objects import base as obj_base
84from nova.objects import external_event as external_event_obj
85from nova.objects import fields
86from nova.objects import instance as obj_instance
87from nova.objects import migrate_data as migrate_data_obj
88from nova.objects import service as service_obj
89from nova.pci import request as pci_req_module
90from nova.pci import whitelist
91from nova import safe_utils
92from nova.scheduler.client import query
93from nova.scheduler.client import report
94from nova.scheduler import utils as scheduler_utils
95from nova.share import manila
96from nova import utils
97from nova.virt import block_device as driver_block_device
98from nova.virt import configdrive
99from nova.virt import driver
100from nova.virt import event as virtevent
101from nova.virt import hardware
102import nova.virt.node
103from nova.virt import storage_users
104from nova.virt import virtapi
105from nova.volume import cinder
107CONF = nova.conf.CONF
109LOG = logging.getLogger(__name__)
111wrap_exception = functools.partial(
112 exception_wrapper.wrap_exception, service='compute', binary='nova-compute')
115@contextlib.contextmanager
116def errors_out_migration_ctxt(migration):
117 """Context manager to error out migration on failure."""
119 try:
120 yield
121 except Exception:
122 with excutils.save_and_reraise_exception():
123 if migration:
124 # We may have been passed None for our migration if we're
125 # receiving from an older client. The migration will be
126 # errored via the legacy path.
127 migration.status = 'error'
128 try:
129 migration.save()
130 except Exception:
131 LOG.debug(
132 'Error setting migration status for instance %s.',
133 migration.instance_uuid, exc_info=True)
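# Illustrative usage sketch (the wrapped call below is a placeholder, not a
# method of this module): migration-phase work is run inside the context
# manager so any failure marks the migration as errored before re-raising.
#
#     with errors_out_migration_ctxt(migration):
#         do_migration_phase_work(context, instance, migration)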
136@utils.expects_func_args('migration')
137def errors_out_migration(function):
138 """Decorator to error out migration on failure."""
140 @functools.wraps(function)
141 def decorated_function(self, context, *args, **kwargs):
142 wrapped_func = safe_utils.get_wrapped_function(function)
143 keyed_args = inspect.getcallargs(wrapped_func, self, context,
144 *args, **kwargs)
145 migration = keyed_args['migration']
146 with errors_out_migration_ctxt(migration):
147 return function(self, context, *args, **kwargs)
149 return decorated_function
152@utils.expects_func_args('instance')
153def reverts_task_state(function):
154 """Decorator to revert task_state on failure."""
156 @functools.wraps(function)
157 def decorated_function(self, context, *args, **kwargs):
158 try:
159 return function(self, context, *args, **kwargs)
160 except exception.UnexpectedTaskStateError as e:
161 # Note(maoy): unexpected task state means the current
162 # task is preempted. Do not clear task state in this
163 # case.
164 with excutils.save_and_reraise_exception():
165 LOG.info("Task possibly preempted: %s",
166 e.format_message())
167 except Exception:
168 with excutils.save_and_reraise_exception():
169 wrapped_func = safe_utils.get_wrapped_function(function)
170 keyed_args = inspect.getcallargs(wrapped_func, self, context,
171 *args, **kwargs)
172 # NOTE(mriedem): 'instance' must be in keyed_args because we
173 # have utils.expects_func_args('instance') decorating this
174 # method.
175 instance = keyed_args['instance']
176 original_task_state = instance.task_state
177 try:
178 self._instance_update(context, instance, task_state=None)
179 LOG.info("Successfully reverted task state from %s on "
180 "failure for instance.",
181 original_task_state, instance=instance)
182 except exception.InstanceNotFound:
183 # We might delete an instance that failed to build shortly
 184 # after it errored out; this is an expected case and we
185 # should not trace on it.
186 pass
187 except Exception as e:
188 LOG.warning("Failed to revert task state for instance. "
189 "Error: %s", e, instance=instance)
191 return decorated_function
194@utils.expects_func_args('instance')
195def wrap_instance_fault(function):
196 """Wraps a method to catch exceptions related to instances.
 198 This decorator wraps a method to catch any instance-related exceptions
 199 that may be thrown. It then logs an instance fault in the DB.
200 """
202 @functools.wraps(function)
203 def decorated_function(self, context, *args, **kwargs):
204 try:
205 return function(self, context, *args, **kwargs)
206 except exception.InstanceNotFound:
207 raise
208 except Exception as e:
209 # NOTE(gtt): If argument 'instance' is in args rather than kwargs,
210 # we will get a KeyError exception which will cover up the real
211 # exception. So, we update kwargs with the values from args first.
212 # then, we can get 'instance' from kwargs easily.
213 kwargs.update(dict(zip(function.__code__.co_varnames[2:], args)))
215 with excutils.save_and_reraise_exception():
216 compute_utils.add_instance_fault_from_exc(context,
217 kwargs['instance'], e, sys.exc_info())
219 return decorated_function
222@utils.expects_func_args('image_id', 'instance')
223def delete_image_on_error(function):
224 """Used for snapshot related method to ensure the image created in
225 compute.api is deleted when an error occurs.
226 """
228 @functools.wraps(function)
229 def decorated_function(self, context, image_id, instance,
230 *args, **kwargs):
231 try:
232 return function(self, context, image_id, instance,
233 *args, **kwargs)
234 except Exception:
235 with excutils.save_and_reraise_exception():
236 compute_utils.delete_image(
237 context, instance, self.image_api, image_id,
238 log_exc_info=True)
240 return decorated_function
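# Illustrative sketch of how the decorators above are typically stacked on a
# manager RPC method; the method name and body are placeholders only.
#
#     @wrap_exception()
#     @reverts_task_state
#     @wrap_instance_fault
#     def example_rpc_method(self, context, instance):
#         ...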
243# Each collection of events is a dict of eventlet Events keyed by a tuple of
244# event name and associated tag
245_InstanceEvents = ty.Dict[ty.Tuple[str, str], eventlet.event.Event]
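# For illustration, the InstanceEvents class below keys this alias by
# instance UUID, e.g. (values are placeholders):
#
#     {instance_uuid: {('network-vif-plugged', port_id): eventlet.event.Event()}}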
248class InstanceEvents(object):
249 def __init__(self):
250 self._events: ty.Optional[ty.Dict[str, _InstanceEvents]] = {}
252 @staticmethod
253 def _lock_name(instance) -> str:
254 return '%s-%s' % (instance.uuid, 'events')
256 def prepare_for_instance_event(
257 self,
258 instance: 'objects.Instance',
259 name: str,
260 tag: str,
261 ) -> eventlet.event.Event:
262 """Prepare to receive an event for an instance.
264 This will register an event for the given instance that we will
265 wait on later. This should be called before initiating whatever
266 action will trigger the event. The resulting eventlet.event.Event
267 object should be wait()'d on to ensure completion.
269 :param instance: the instance for which the event will be generated
270 :param name: the name of the event we're expecting
271 :param tag: the tag associated with the event we're expecting
272 :returns: an event object that should be wait()'d on
273 """
274 @utils.synchronized(self._lock_name(instance))
275 def _create_or_get_event():
276 if self._events is None:
277 # NOTE(danms): We really should have a more specific error
278 # here, but this is what we use for our default error case
279 raise exception.NovaException(
280 'In shutdown, no new events can be scheduled')
282 instance_events = self._events.setdefault(instance.uuid, {})
283 return instance_events.setdefault((name, tag),
284 eventlet.event.Event())
285 LOG.debug('Preparing to wait for external event %(name)s-%(tag)s',
286 {'name': name, 'tag': tag}, instance=instance)
287 return _create_or_get_event()
289 def pop_instance_event(self, instance, event):
290 """Remove a pending event from the wait list.
292 This will remove a pending event from the wait list so that it
293 can be used to signal the waiters to wake up.
295 :param instance: the instance for which the event was generated
296 :param event: the nova.objects.external_event.InstanceExternalEvent
297 that describes the event
298 :returns: the eventlet.event.Event object on which the waiters
299 are blocked
300 """
301 no_events_sentinel = object()
302 no_matching_event_sentinel = object()
304 @utils.synchronized(self._lock_name(instance))
305 def _pop_event():
306 if self._events is None:
307 LOG.debug('Unexpected attempt to pop events during shutdown',
308 instance=instance)
309 return no_events_sentinel
310 events = self._events.get(instance.uuid)
 311 if not events:  [311 ↛ 312: condition on line 311 was never true]
312 return no_events_sentinel
313 _event = events.pop((event.name, event.tag), None)
 314 if not events:  [314 ↛ 316: condition on line 314 was always true]
315 del self._events[instance.uuid]
 316 if _event is None:  [316 ↛ 317: condition on line 316 was never true]
317 return no_matching_event_sentinel
318 return _event
320 result = _pop_event()
321 if result is no_events_sentinel:
322 LOG.debug('No waiting events found dispatching %(event)s',
323 {'event': event.key},
324 instance=instance)
325 return None
 326 elif result is no_matching_event_sentinel:  [326 ↛ 327: condition on line 326 was never true]
327 LOG.debug(
328 'No event matching %(event)s in %(events)s',
329 {
330 'event': event.key,
331 # mypy can't identify the none check in _pop_event
332 'events': self._events.get( # type: ignore
333 instance.uuid, {}).keys(),
334 },
335 instance=instance,
336 )
337 return None
338 else:
339 return result
341 def clear_events_for_instance(self, instance):
342 """Remove all pending events for an instance.
344 This will remove all events currently pending for an instance
345 and return them (indexed by event name).
347 :param instance: the instance for which events should be purged
348 :returns: a dictionary of {event_name: eventlet.event.Event}
349 """
350 @utils.synchronized(self._lock_name(instance))
351 def _clear_events():
352 if self._events is None:
353 LOG.debug('Unexpected attempt to clear events during shutdown',
354 instance=instance)
355 return dict()
356 # NOTE(danms): We have historically returned the raw internal
 357 # format here, which is {event.key: [events, ...]} so just
358 # trivially convert it here.
359 return {'%s-%s' % k: e
360 for k, e in self._events.pop(instance.uuid, {}).items()}
361 return _clear_events()
363 def cancel_all_events(self):
364 if self._events is None:
365 LOG.debug('Unexpected attempt to cancel events during shutdown.')
366 return
367 our_events = self._events
368 # NOTE(danms): Block new events
369 self._events = None
371 for instance_uuid, events in our_events.items():
372 for (name, tag), eventlet_event in events.items():
373 LOG.debug('Canceling in-flight event %(name)s-%(tag)s for '
374 'instance %(instance_uuid)s',
375 {'name': name,
376 'tag': tag,
377 'instance_uuid': instance_uuid})
378 event = objects.InstanceExternalEvent(
379 instance_uuid=instance_uuid,
380 name=name, status='failed',
381 tag=tag, data={})
382 eventlet_event.send(event)
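# Minimal sketch of the prepare/pop flow (event name and tag are example
# values): a waiter registers interest before triggering the action, and the
# external-event handler later pops and sends on the same Event object.
#
#     ev = self.instance_events.prepare_for_instance_event(
#         instance, 'network-vif-plugged', port_id)
#     trigger_action_that_emits_the_event()   # placeholder
#     ev.wait()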
385class ComputeVirtAPI(virtapi.VirtAPI):
386 def __init__(self, compute):
387 super(ComputeVirtAPI, self).__init__()
388 self._compute = compute
389 self.reportclient = compute.reportclient
391 class ExitEarly(Exception):
392 def __init__(self, events):
393 super(Exception, self).__init__()
394 self.events = events
396 self._exit_early_exc = ExitEarly
398 def exit_wait_early(self, events):
399 """Exit a wait_for_instance_event() immediately and avoid
400 waiting for some events.
 402 :param events: A list of (name, tag) tuples for events that we should
403 skip waiting for during a wait_for_instance_event().
404 """
405 raise self._exit_early_exc(events=events)
407 def _default_error_callback(self, event_name, instance):
408 raise exception.NovaException(_('Instance event failed'))
410 class _InstanceEvent:
411 EXPECTED = "expected"
412 WAITING = "waiting"
413 RECEIVED = "received"
414 RECEIVED_EARLY = "received early"
415 TIMED_OUT = "timed out"
416 RECEIVED_NOT_PROCESSED = "received but not processed"
418 def __init__(self, name: str, event: eventlet.event.Event) -> None:
419 self.name = name
420 self.event = event
421 self.status = self.EXPECTED
422 self.wait_time = None
424 def mark_as_received_early(self) -> None:
425 self.status = self.RECEIVED_EARLY
427 def is_received_early(self) -> bool:
428 return self.status == self.RECEIVED_EARLY
430 def _update_status_no_wait(self):
431 if self.status == self.EXPECTED and self.event.ready():
432 self.status = self.RECEIVED_NOT_PROCESSED
434 def wait(self) -> 'objects.InstanceExternalEvent':
435 self.status = self.WAITING
436 try:
437 with timeutils.StopWatch() as sw:
438 instance_event = self.event.wait()
439 except eventlet.timeout.Timeout:
440 self.status = self.TIMED_OUT
441 self.wait_time = sw.elapsed()
443 raise
445 self.status = self.RECEIVED
446 self.wait_time = sw.elapsed()
447 return instance_event
449 def __str__(self) -> str:
450 self._update_status_no_wait()
451 if self.status == self.EXPECTED:
452 return f"{self.name}: expected but not received"
453 if self.status == self.RECEIVED:
454 return (
455 f"{self.name}: received after waiting "
456 f"{self.wait_time:.2f} seconds")
457 if self.status == self.TIMED_OUT:
458 return (
459 f"{self.name}: timed out after "
460 f"{self.wait_time:.2f} seconds")
461 return f"{self.name}: {self.status}"
463 @staticmethod
464 def _wait_for_instance_events(
465 instance: 'objects.Instance',
466 events: dict,
467 error_callback: ty.Callable,
468 ) -> None:
469 for event_name, event in events.items():
470 if event.is_received_early():
471 continue
472 else:
473 actual_event = event.wait()
474 if actual_event.status == 'completed':
475 continue
476 # If we get here, we have an event that was not completed,
477 # nor skipped via exit_wait_early(). Decide whether to
478 # keep waiting by calling the error_callback() hook.
479 decision = error_callback(event_name, instance)
 480 if decision is False:  [480 ↛ 481: condition on line 480 was never true]
481 break
483 @contextlib.contextmanager
484 def wait_for_instance_event(self, instance, event_names, deadline=300,
485 error_callback=None):
486 """Plan to wait for some events, run some code, then wait.
488 This context manager will first create plans to wait for the
489 provided event_names, yield, and then wait for all the scheduled
490 events to complete.
492 Note that this uses an eventlet.timeout.Timeout to bound the
493 operation, so callers should be prepared to catch that
494 failure and handle that situation appropriately.
496 If the event is not received by the specified timeout deadline,
497 eventlet.timeout.Timeout is raised.
499 If the event is received but did not have a 'completed'
500 status, a NovaException is raised. If an error_callback is
501 provided, instead of raising an exception as detailed above
502 for the failure case, the callback will be called with the
503 event_name and instance, and can return True to continue
504 waiting for the rest of the events, False to stop processing,
505 or raise an exception which will bubble up to the waiter.
507 If the inner code wishes to abort waiting for one or more
508 events because it knows some state to be finished or condition
509 to be satisfied, it can use VirtAPI.exit_wait_early() with a
510 list of event (name,tag) items to avoid waiting for those
511 events upon context exit. Note that exit_wait_early() exits
512 the context immediately and should be used to signal that all
513 work has been completed and provide the unified list of events
514 that need not be waited for. Waiting for the remaining events
515 will begin immediately upon early exit as if the context was
516 exited normally.
518 :param instance: The instance for which an event is expected
519 :param event_names: A list of event names. Each element is a
520 tuple of strings to indicate (name, tag),
521 where name is required, but tag may be None.
522 :param deadline: Maximum number of seconds we should wait for all
523 of the specified events to arrive.
524 :param error_callback: A function to be called if an event arrives
526 """
528 if error_callback is None:
529 error_callback = self._default_error_callback
530 events = {}
531 for event_name in event_names:
532 name, tag = event_name
533 event_name = objects.InstanceExternalEvent.make_key(name, tag)
534 try:
535 event = (
536 self._compute.instance_events.prepare_for_instance_event(
537 instance, name, tag))
538 events[event_name] = self._InstanceEvent(event_name, event)
539 except exception.NovaException:
540 error_callback(event_name, instance)
541 # NOTE(danms): Don't wait for any of the events. They
542 # should all be canceled and fired immediately below,
543 # but don't stick around if not.
544 deadline = 0
545 try:
546 yield
547 except self._exit_early_exc as e:
548 early_events = set([objects.InstanceExternalEvent.make_key(n, t)
549 for n, t in e.events])
 551 # If there are expected events that were received early, mark them,
552 # so they won't be waited for later
553 for early_event_name in early_events:
 554 if early_event_name in events:  [554 ↛ 553: condition on line 554 was always true]
555 events[early_event_name].mark_as_received_early()
557 sw = timeutils.StopWatch()
558 sw.start()
559 try:
560 with eventlet.timeout.Timeout(deadline):
561 self._wait_for_instance_events(
562 instance, events, error_callback)
563 except eventlet.timeout.Timeout:
564 LOG.warning(
565 'Timeout waiting for %(events)s for instance with '
566 'vm_state %(vm_state)s and task_state %(task_state)s. '
567 'Event states are: %(event_states)s',
568 {
569 'events': list(events.keys()),
570 'vm_state': instance.vm_state,
571 'task_state': instance.task_state,
572 'event_states':
573 ', '.join([str(event) for event in events.values()]),
574 },
575 instance=instance)
577 raise
579 LOG.debug('Instance event wait completed in %i seconds for %s',
580 sw.elapsed(),
581 ','.join(x[0] for x in event_names),
582 instance=instance)
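# Minimal usage sketch for wait_for_instance_event(); the event tuple and the
# triggering call are placeholders. Waiting for the registered events starts
# when the with-block exits (or when exit_wait_early() is raised inside it).
#
#     with self.virtapi.wait_for_instance_event(
#             instance, [('network-vif-plugged', port_id)], deadline=300):
#         trigger_action_that_emits_the_event()   # placeholder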
584 def update_compute_provider_status(self, context, rp_uuid, enabled):
585 """Used to add/remove the COMPUTE_STATUS_DISABLED trait on the provider
587 :param context: nova auth RequestContext
588 :param rp_uuid: UUID of a compute node resource provider in Placement
589 :param enabled: True if the node is enabled in which case the trait
590 would be removed, False if the node is disabled in which case
591 the trait would be added.
592 :raises: ResourceProviderTraitRetrievalFailed
593 :raises: ResourceProviderUpdateConflict
594 :raises: ResourceProviderUpdateFailed
595 :raises: TraitRetrievalFailed
596 :raises: keystoneauth1.exceptions.ClientException
597 """
598 trait_name = os_traits.COMPUTE_STATUS_DISABLED
599 # Get the current traits (and generation) for the provider.
600 # TODO(mriedem): Leverage the ProviderTree cache in get_provider_traits
601 trait_info = self.reportclient.get_provider_traits(context, rp_uuid)
602 # If the host is enabled, remove the trait (if set), else add
603 # the trait if it doesn't already exist.
604 original_traits = trait_info.traits
605 new_traits = None
606 if enabled and trait_name in original_traits:
607 new_traits = original_traits - {trait_name}
608 LOG.debug('Removing trait %s from compute node resource '
609 'provider %s in placement.', trait_name, rp_uuid)
610 elif not enabled and trait_name not in original_traits:
611 new_traits = original_traits | {trait_name}
612 LOG.debug('Adding trait %s to compute node resource '
613 'provider %s in placement.', trait_name, rp_uuid)
615 if new_traits is not None:
616 self.reportclient.set_traits_for_provider(
617 context, rp_uuid, new_traits, generation=trait_info.generation)
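# Illustrative call (rp_uuid is a compute node provider UUID): disabling a
# compute service adds the COMPUTE_STATUS_DISABLED trait to its provider,
# while re-enabling the service removes the trait.
#
#     virtapi.update_compute_provider_status(context, rp_uuid, enabled=False)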
620class ComputeManager(manager.Manager):
621 """Manages the running instances from creation to destruction."""
623 target = messaging.Target(version='6.4')
625 def __init__(self, compute_driver=None, *args, **kwargs):
626 """Load configuration options and connect to the hypervisor."""
627 # We want the ComputeManager, ResourceTracker and ComputeVirtAPI all
628 # using the same instance of SchedulerReportClient which has the
629 # ProviderTree cache for this compute service.
630 # NOTE(danms): We do not use the global placement client
631 # singleton here, because the above-mentioned stack of objects
632 # maintain local state in the client. Thus, keeping our own
633 # private object for that stack avoids any potential conflict
634 # with other users in our process outside of the above.
635 self.reportclient = report.SchedulerReportClient()
636 self.virtapi = ComputeVirtAPI(self)
637 self.network_api = neutron.API()
638 self.volume_api = cinder.API()
639 self.manila_api = manila.API()
640 self.image_api = glance.API()
641 self._last_bw_usage_poll = 0.0
642 self.compute_api = compute.API()
643 self.compute_rpcapi = compute_rpcapi.ComputeAPI()
644 self.compute_task_api = conductor.ComputeTaskAPI()
645 self.query_client = query.SchedulerQueryClient()
646 self.instance_events = InstanceEvents()
647 self._sync_power_pool = eventlet.GreenPool(
648 size=CONF.sync_power_state_pool_size)
649 self._syncs_in_progress = {}
650 self.send_instance_updates = (
651 CONF.filter_scheduler.track_instance_changes)
652 if CONF.max_concurrent_builds != 0:
653 self._build_semaphore = eventlet.semaphore.Semaphore(
654 CONF.max_concurrent_builds)
655 else:
656 self._build_semaphore = compute_utils.UnlimitedSemaphore()
657 if CONF.max_concurrent_snapshots > 0:
658 self._snapshot_semaphore = eventlet.semaphore.Semaphore(
659 CONF.max_concurrent_snapshots)
660 else:
661 self._snapshot_semaphore = compute_utils.UnlimitedSemaphore()
662 if CONF.max_concurrent_live_migrations > 0:
663 self._live_migration_executor = futurist.GreenThreadPoolExecutor(
664 max_workers=CONF.max_concurrent_live_migrations)
665 else:
666 # CONF.max_concurrent_live_migrations is 0 (unlimited)
667 self._live_migration_executor = futurist.GreenThreadPoolExecutor()
668 # This is a dict, keyed by instance uuid, to a two-item tuple of
669 # migration object and Future for the queued live migration.
670 self._waiting_live_migrations = {}
672 super(ComputeManager, self).__init__(service_name="compute",
673 *args, **kwargs)
675 # TODO(sbauza): Remove this call once we delete the V5Proxy class
676 self.additional_endpoints.append(_ComputeV5Proxy(self))
678 # NOTE(russellb) Load the driver last. It may call back into the
679 # compute manager via the virtapi, so we want it to be fully
680 # initialized before that happens.
681 self.service_ref = None
682 self.driver = driver.load_compute_driver(self.virtapi, compute_driver)
683 self.rt = resource_tracker.ResourceTracker(
684 self.host, self.driver, reportclient=self.reportclient)
686 def reset(self):
687 LOG.info('Reloading compute RPC API')
688 compute_rpcapi.reset_globals()
689 self.compute_rpcapi = compute_rpcapi.ComputeAPI()
690 self.reportclient.clear_provider_cache()
692 def _update_resource_tracker(self, context, instance):
693 """Let the resource tracker know that an instance has changed state."""
695 if instance.host == self.host:
696 self.rt.update_usage(context, instance, instance.node)
698 def _instance_update(self, context, instance, **kwargs):
699 """Update an instance in the database using kwargs as value."""
701 for k, v in kwargs.items():
702 setattr(instance, k, v)
703 instance.save()
704 self._update_resource_tracker(context, instance)
706 def _nil_out_instance_obj_host_and_node(self, instance):
707 # NOTE(jwcroppe): We don't do instance.save() here for performance
708 # reasons; a call to this is expected to be immediately followed by
709 # another call that does instance.save(), thus avoiding two writes
710 # to the database layer.
711 instance.host = None
712 instance.node = None
713 instance.compute_id = None
714 # ResourceTracker._set_instance_host_and_node also sets launched_on
715 # to the same value as host and is really only ever used by legacy
716 # nova-network code, but we should also null it out to avoid confusion
717 # if there is an instance in the database with no host set but
718 # launched_on is set. Note that we do not care about using launched_on
719 # as some kind of debug helper if diagnosing a build failure, that is
720 # what instance action events are for.
721 instance.launched_on = None
722 # If the instance is not on a host, it's not in an aggregate and
723 # therefore is not in an availability zone.
724 instance.availability_zone = None
726 def _set_instance_obj_error_state(self, instance, clean_task_state=False):
727 try:
728 instance.vm_state = vm_states.ERROR
729 if clean_task_state:
730 instance.task_state = None
731 instance.save()
732 except exception.InstanceNotFound:
733 LOG.debug('Instance has been destroyed from under us while '
734 'trying to set it to ERROR', instance=instance)
736 def _get_instances_on_driver(self, context, filters=None):
737 """Return a list of instance records for the instances found
738 on the hypervisor which satisfy the specified filters. If filters=None
739 return a list of instance records for all the instances found on the
740 hypervisor.
741 """
742 if not filters:
743 filters = {}
744 try:
745 driver_uuids = self.driver.list_instance_uuids()
746 if len(driver_uuids) == 0:
747 # Short circuit, don't waste a DB call
748 return objects.InstanceList()
749 filters['uuid'] = driver_uuids
750 local_instances = objects.InstanceList.get_by_filters(
751 context, filters, use_slave=True)
752 return local_instances
753 except NotImplementedError:
754 pass
756 # The driver doesn't support uuids listing, so we'll have
757 # to brute force.
758 driver_instances = self.driver.list_instances()
759 # NOTE(mjozefcz): In this case we need to apply host filter.
760 # Without this all instance data would be fetched from db.
761 filters['host'] = self.host
762 instances = objects.InstanceList.get_by_filters(context, filters,
763 use_slave=True)
764 name_map = {instance.name: instance for instance in instances}
765 local_instances = []
766 for driver_instance in driver_instances:
767 instance = name_map.get(driver_instance)
 768 if not instance:  [768 ↛ 769: condition on line 768 was never true]
769 continue
770 local_instances.append(instance)
771 return local_instances
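# Example filter usage (illustrative): restrict the lookup to non-deleted
# instances reported by the hypervisor.
#
#     instances = self._get_instances_on_driver(context, {'deleted': False})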
773 def _destroy_evacuated_instances(self, context, node_cache):
774 """Destroys evacuated instances.
776 While nova-compute was down, the instances running on it could be
777 evacuated to another host. This method looks for evacuation migration
778 records where this is the source host and which were either started
779 (accepted), in-progress (pre-migrating) or migrated (done). From those
780 migration records, local instances reported by the hypervisor are
781 compared to the instances for the migration records and those local
782 guests are destroyed, along with instance allocation records in
783 Placement for this node.
784 Then allocations are removed from Placement for every instance that is
 785 evacuated from this host, regardless of whether the instance is reported by
786 hypervisor or not.
788 :param context: The request context
789 :param node_cache: A dict of ComputeNode objects keyed by the UUID of
790 the compute node
791 :return: A dict keyed by instance uuid mapped to Migration objects
792 for instances that were migrated away from this host
793 """
794 filters = {
795 'source_compute': self.host,
796 # NOTE(mriedem): Migration records that have been accepted are
797 # included in case the source node comes back up while instances
798 # are being evacuated to another host. We don't want the same
799 # instance being reported from multiple hosts.
800 # NOTE(lyarwood): pre-migrating is also included here as the
801 # source compute can come back online shortly after the RT
802 # claims on the destination that in-turn moves the migration to
803 # pre-migrating. If the evacuate fails on the destination host,
804 # the user can rebuild the instance (in ERROR state) on the source
805 # host.
806 'status': ['accepted', 'pre-migrating', 'done'],
807 'migration_type': fields.MigrationType.EVACUATION,
808 }
809 with utils.temporary_mutation(context, read_deleted='yes'):
810 evacuations = objects.MigrationList.get_by_filters(context,
811 filters)
812 if not evacuations:
813 return {}
814 evacuations = {mig.instance_uuid: mig for mig in evacuations}
816 # TODO(mriedem): We could optimize by pre-loading the joined fields
817 # we know we'll use, like info_cache and flavor.
818 local_instances = self._get_instances_on_driver(context)
819 evacuated_local_instances = {inst.uuid: inst
820 for inst in local_instances
821 if inst.uuid in evacuations}
823 for instance in evacuated_local_instances.values():
824 LOG.info('Destroying instance as it has been evacuated from '
825 'this host but still exists in the hypervisor',
826 instance=instance)
827 try:
828 network_info = self.network_api.get_instance_nw_info(
829 context, instance)
830 bdi = self._get_instance_block_device_info(context,
831 instance)
832 evac = evacuations[instance.uuid]
833 destroy_disks = not (self._is_instance_storage_shared(
834 context, instance, host=evac.dest_compute))
835 except exception.InstanceNotFound:
836 network_info = network_model.NetworkInfo()
837 bdi = {}
838 LOG.info('Instance has been marked deleted already, '
839 'removing it from the hypervisor.',
840 instance=instance)
841 # always destroy disks if the instance was deleted
842 destroy_disks = True
843 self.driver.destroy(context, instance,
844 network_info,
845 bdi, destroy_disks)
847 hostname_to_cn_uuid = {
848 cn.hypervisor_hostname: cn.uuid
849 for cn in node_cache.values()}
851 for instance_uuid, migration in evacuations.items():
852 try:
853 if instance_uuid in evacuated_local_instances:
854 # Avoid the db call if we already have the instance loaded
855 # above
856 instance = evacuated_local_instances[instance_uuid]
857 else:
858 instance = objects.Instance.get_by_uuid(
859 context, instance_uuid)
860 except exception.InstanceNotFound:
 861 # The instance was already deleted, so we expect that every
862 # allocation of that instance has already been cleaned up
863 continue
865 LOG.info('Cleaning up allocations of the instance as it has been '
866 'evacuated from this host',
867 instance=instance)
868 if migration.source_node not in hostname_to_cn_uuid:
869 LOG.error("Failed to clean allocation of evacuated "
870 "instance as the source node %s is not found",
871 migration.source_node, instance=instance)
872 continue
873 cn_uuid = hostname_to_cn_uuid[migration.source_node]
875 # If the instance was deleted in the interim, assume its
876 # allocations were properly cleaned up (either by its hosting
877 # compute service or the API).
 878 if (not instance.deleted and  [878 ↛ 882: condition on line 878 was never true]
879 not self.reportclient.
880 remove_provider_tree_from_instance_allocation(
881 context, instance.uuid, cn_uuid)):
882 LOG.error("Failed to clean allocation of evacuated instance "
883 "on the source node %s",
884 cn_uuid, instance=instance)
886 migration.status = 'completed'
887 migration.save()
888 return evacuations
890 def _is_instance_storage_shared(self, context, instance, host=None):
891 shared_storage = True
892 data = None
893 try:
894 data = self.driver.check_instance_shared_storage_local(context,
895 instance)
 896 if data:  [896 ↛ 910: condition on line 896 was always true]
897 shared_storage = (self.compute_rpcapi.
898 check_instance_shared_storage(context,
899 data, instance=instance, host=host))
900 except NotImplementedError:
901 LOG.debug('Hypervisor driver does not support '
902 'instance shared storage check, '
903 'assuming it\'s not on shared storage',
904 instance=instance)
905 shared_storage = False
906 except Exception:
907 LOG.exception('Failed to check if instance shared',
908 instance=instance)
909 finally:
910 if data:
911 self.driver.check_instance_shared_storage_cleanup(context,
912 data)
913 return shared_storage
915 def _complete_partial_deletion(self, context, instance):
916 """Complete deletion for instances in DELETED status but not marked as
917 deleted in the DB
918 """
919 instance.destroy()
920 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
921 context, instance.uuid)
922 self._complete_deletion(context,
923 instance)
924 self._notify_about_instance_usage(context, instance, "delete.end")
925 compute_utils.notify_about_instance_action(context, instance,
926 self.host, action=fields.NotificationAction.DELETE,
927 phase=fields.NotificationPhase.END, bdms=bdms)
929 def _complete_deletion(self, context, instance):
930 self._update_resource_tracker(context, instance)
932 # If we're configured to do deferred deletes, don't force deletion of
933 # allocations if there's a conflict.
934 force = False if CONF.reclaim_instance_interval > 0 else True
936 self.reportclient.delete_allocation_for_instance(context,
937 instance.uuid,
938 force=force)
940 self._clean_instance_console_tokens(context, instance)
941 self._delete_scheduler_instance_info(context, instance.uuid)
943 def _validate_pinning_configuration(self, instances):
944 if not self.driver.capabilities.get('supports_pcpus', False):
945 return
947 for instance in instances:
948 # ignore deleted instances
949 if instance.deleted:
950 continue
952 # if this is an unpinned instance and the host only has
953 # 'cpu_dedicated_set' configured, we need to tell the operator to
954 # correct their configuration
955 if not instance.numa_topology or (
956 instance.numa_topology.cpu_policy in (
957 None, fields.CPUAllocationPolicy.SHARED
958 )
959 ):
960 # we don't need to check 'vcpu_pin_set' since it can't coexist
961 # alongside 'cpu_dedicated_set'
962 if (CONF.compute.cpu_dedicated_set and
963 not CONF.compute.cpu_shared_set):
964 msg = _("This host has unpinned instances but has no CPUs "
965 "set aside for this purpose; configure '[compute] "
966 "cpu_shared_set' instead of, or in addition to, "
967 "'[compute] cpu_dedicated_set'")
968 raise exception.InvalidConfiguration(msg)
970 continue
972 # ditto for pinned instances if only 'cpu_shared_set' is configured
973 if (CONF.compute.cpu_shared_set and
974 not CONF.compute.cpu_dedicated_set and
975 not CONF.vcpu_pin_set):
976 msg = _("This host has pinned instances but has no CPUs "
977 "set aside for this purpose; configure '[compute] "
978 "cpu_dedicated_set' instead of, or in addition to, "
979 "'[compute] cpu_shared_set'.")
980 raise exception.InvalidConfiguration(msg)
982 # if this is a mixed instance with both pinned and unpinned CPUs,
983 # the host must have both 'cpu_dedicated_set' and 'cpu_shared_set'
 984 # configured. Check if 'cpu_shared_set' is set.
985 if (instance.numa_topology.cpu_policy ==
986 fields.CPUAllocationPolicy.MIXED and
987 not CONF.compute.cpu_shared_set):
988 msg = _("This host has mixed instance requesting both pinned "
989 "and unpinned CPUs but hasn't set aside unpinned CPUs "
990 "for this purpose; Configure "
991 "'[compute] cpu_shared_set'.")
992 raise exception.InvalidConfiguration(msg)
994 # for mixed instance check if 'cpu_dedicated_set' is set.
 995 if (instance.numa_topology.cpu_policy ==  [995 ↛ 998: condition on line 995 was never true]
996 fields.CPUAllocationPolicy.MIXED and
997 not CONF.compute.cpu_dedicated_set):
998 msg = _("This host has mixed instance requesting both pinned "
999 "and unpinned CPUs but hasn't set aside pinned CPUs "
1000 "for this purpose; Configure "
1001 "'[compute] cpu_dedicated_set'")
1002 raise exception.InvalidConfiguration(msg)
1004 # also check to make sure the operator hasn't accidentally
1005 # dropped some cores that instances are currently using
1006 available_dedicated_cpus = (hardware.get_vcpu_pin_set() or
1007 hardware.get_cpu_dedicated_set())
1008 pinned_cpus = instance.numa_topology.cpu_pinning
1009 if available_dedicated_cpus and (
1010 pinned_cpus - available_dedicated_cpus):
1011 # we can't raise an exception because of bug #1289064,
1012 # which meant we didn't recalculate CPU pinning information
1013 # when we live migrated a pinned instance
1014 LOG.warning(
1015 "Instance is pinned to host CPUs %(cpus)s "
1016 "but one or more of these CPUs are not included in "
1017 "either '[compute] cpu_dedicated_set' or "
1018 "'vcpu_pin_set'; you should update these "
1019 "configuration options to include the missing CPUs "
1020 "or rebuild or cold migrate this instance.",
1021 {'cpus': list(pinned_cpus)},
1022 instance=instance)
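# Illustrative nova.conf layout satisfying the checks above on a host that
# runs both pinned and unpinned (or mixed) instances; the CPU ranges are
# examples only.
#
#     [compute]
#     cpu_shared_set = 0-3
#     cpu_dedicated_set = 4-15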
1024 def _validate_vtpm_configuration(self, instances):
1025 if self.driver.capabilities.get('supports_vtpm', False):
1026 return
1028 for instance in instances:
 1029 if instance.deleted:  [1029 ↛ 1030: condition on line 1029 was never true]
1030 continue
1032 # NOTE(stephenfin): We don't have an attribute on the instance to
1033 # check for this, so we need to inspect the flavor/image metadata
1034 if hardware.get_vtpm_constraint(
1035 instance.flavor, instance.image_meta,
1036 ):
1037 msg = _(
1038 'This host has instances with the vTPM feature enabled, '
1039 'but the host is not correctly configured; enable '
1040 'vTPM support.'
1041 )
1042 raise exception.InvalidConfiguration(msg)
1044 def _reset_live_migration(self, context, instance):
1045 migration = None
1046 try:
1047 migration = objects.Migration.get_by_instance_and_status(
1048 context, instance.uuid, 'running')
 1049 if migration:  [1049 ↛ 1055: condition on line 1049 was always true]
1050 self.live_migration_abort(context, instance, migration.id)
1051 except Exception:
1052 LOG.exception('Failed to abort live-migration',
1053 instance=instance)
1054 finally:
 1055 if migration:  [1055 ↛ 1057: condition on line 1055 was always true]
1056 self._set_migration_status(migration, 'error')
1057 LOG.info('Instance found in migrating state during '
1058 'startup. Resetting task_state',
1059 instance=instance)
1060 instance.task_state = None
1061 instance.save(expected_task_state=[task_states.MIGRATING])
1063 def _init_instance(self, context, instance):
1064 """Initialize this instance during service init."""
1066 # NOTE(danms): If the instance appears to not be owned by this
1067 # host, it may have been evacuated away, but skipped by the
1068 # evacuation cleanup code due to configuration. Thus, if that
1069 # is a possibility, don't touch the instance in any way, but
1070 # log the concern. This will help avoid potential issues on
1071 # startup due to misconfiguration.
1072 if instance.host != self.host:
1073 LOG.warning('Instance %(uuid)s appears to not be owned '
1074 'by this host, but by %(host)s. Startup '
1075 'processing is being skipped.',
1076 {'uuid': instance.uuid,
1077 'host': instance.host})
1078 return
 1080 # Instances that are shut down or in an error state cannot be
1081 # initialized and are not attempted to be recovered. The exception
1082 # to this are instances that are in RESIZE_MIGRATING or DELETING,
1083 # which are dealt with further down.
 1084 if (instance.vm_state == vm_states.SOFT_DELETED or  [1084 ↛ 1088: condition on line 1084 was never true]
1085 (instance.vm_state == vm_states.ERROR and
1086 instance.task_state not in
1087 (task_states.RESIZE_MIGRATING, task_states.DELETING))):
1088 LOG.debug("Instance is in %s state.",
1089 instance.vm_state, instance=instance)
1090 return
1092 if instance.vm_state == vm_states.DELETED:
1093 try:
1094 self._complete_partial_deletion(context, instance)
1095 except Exception:
 1096 # we don't want an exception to block init_host
1097 LOG.exception('Failed to complete a deletion',
1098 instance=instance)
1099 return
1101 if (instance.vm_state == vm_states.BUILDING or
1102 instance.task_state in [task_states.SCHEDULING,
1103 task_states.BLOCK_DEVICE_MAPPING,
1104 task_states.NETWORKING,
1105 task_states.SPAWNING]):
1106 # NOTE(dave-mcnally) compute stopped before instance was fully
1107 # spawned so set to ERROR state. This is safe to do as the state
1108 # may be set by the api but the host is not so if we get here the
1109 # instance has already been scheduled to this particular host.
1110 LOG.debug("Instance failed to spawn correctly, "
1111 "setting to ERROR state", instance=instance)
1112 self._set_instance_obj_error_state(instance, clean_task_state=True)
1113 return
1115 if (instance.vm_state in [vm_states.ACTIVE, vm_states.STOPPED] and
1116 instance.task_state in [task_states.REBUILDING,
1117 task_states.REBUILD_BLOCK_DEVICE_MAPPING,
1118 task_states.REBUILD_SPAWNING]):
1119 # NOTE(jichenjc) compute stopped before instance was fully
1120 # spawned so set to ERROR state. This is consistent to BUILD
1121 LOG.debug("Instance failed to rebuild correctly, "
1122 "setting to ERROR state", instance=instance)
1123 self._set_instance_obj_error_state(instance, clean_task_state=True)
1124 return
1126 if (instance.vm_state != vm_states.ERROR and
1127 instance.task_state in [task_states.IMAGE_SNAPSHOT_PENDING,
1128 task_states.IMAGE_PENDING_UPLOAD,
1129 task_states.IMAGE_UPLOADING,
1130 task_states.IMAGE_SNAPSHOT]):
1131 LOG.debug("Instance in transitional state %s at start-up "
1132 "clearing task state",
1133 instance.task_state, instance=instance)
1134 instance.task_state = None
1135 instance.save()
1137 if (instance.vm_state != vm_states.ERROR and
1138 instance.task_state in [task_states.RESIZE_PREP]):
1139 LOG.debug("Instance in transitional state %s at start-up "
1140 "clearing task state",
1141 instance['task_state'], instance=instance)
1142 instance.task_state = None
1143 instance.save()
1145 if instance.task_state == task_states.DELETING:
1146 try:
1147 LOG.info('Service started deleting the instance during '
1148 'the previous run, but did not finish. Restarting'
1149 ' the deletion now.', instance=instance)
1150 instance.obj_load_attr('metadata')
1151 instance.obj_load_attr('system_metadata')
1152 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
1153 context, instance.uuid)
1154 self._delete_instance(context, instance, bdms)
1155 except Exception:
 1156 # we don't want an exception to block init_host
1157 LOG.exception('Failed to complete a deletion',
1158 instance=instance)
1159 self._set_instance_obj_error_state(instance)
1160 return
1162 current_power_state = self._get_power_state(instance)
1163 try_reboot, reboot_type = self._retry_reboot(
1164 instance, current_power_state)
1166 # NOTE(amorin)
1167 # If the instance is in power_state_SHUTDOWN, we will try_reboot
1168 if try_reboot:
1169 LOG.debug("Instance in transitional state (%(task_state)s) at "
1170 "start-up and power state is (%(power_state)s), "
1171 "triggering reboot",
1172 {'task_state': instance.task_state,
1173 'power_state': current_power_state},
1174 instance=instance)
1176 # NOTE(mikal): if the instance was doing a soft reboot that got as
1177 # far as shutting down the instance but not as far as starting it
1178 # again, then we've just become a hard reboot. That means the
1179 # task state for the instance needs to change so that we're in one
1180 # of the expected task states for a hard reboot.
1181 if (instance.task_state in task_states.soft_reboot_states and
1182 reboot_type == 'HARD'):
1183 instance.task_state = task_states.REBOOT_PENDING_HARD
1184 instance.save()
1186 self.reboot_instance(context, instance, block_device_info=None,
1187 reboot_type=reboot_type)
1188 return
1190 # NOTE(plestang): an instance might be in power_state.RUNNING with a
1191 # transient state when a host is brutally shutdown or rebooted while a
1192 # reboot/pause/unpause is scheduled on client side
1193 elif (current_power_state == power_state.RUNNING and
1194 instance.task_state in [task_states.REBOOT_STARTED,
1195 task_states.REBOOT_STARTED_HARD,
1196 task_states.REBOOTING_HARD,
1197 task_states.REBOOTING,
1198 task_states.PAUSING,
1199 task_states.UNPAUSING]):
1200 LOG.warning("Instance in transitional state "
1201 "(%(task_state)s) at start-up and power state "
1202 "is (%(power_state)s), clearing task state",
1203 {'task_state': instance.task_state,
1204 'power_state': current_power_state},
1205 instance=instance)
1206 instance.task_state = None
1207 instance.vm_state = vm_states.ACTIVE
1208 instance.save()
1209 elif (current_power_state == power_state.PAUSED and
1210 instance.task_state == task_states.UNPAUSING):
1211 LOG.warning("Instance in transitional state "
1212 "(%(task_state)s) at start-up and power state "
1213 "is (%(power_state)s), clearing task state "
1214 "and unpausing the instance",
1215 {'task_state': instance.task_state,
1216 'power_state': current_power_state},
1217 instance=instance)
1218 try:
1219 self.unpause_instance(context, instance)
1220 except NotImplementedError:
 1221 # Some virt drivers don't support pause and unpause
1222 pass
1223 except Exception:
1224 LOG.exception('Failed to unpause instance', instance=instance)
1225 return
1227 if instance.task_state == task_states.POWERING_OFF:
1228 try:
1229 LOG.debug("Instance in transitional state %s at start-up "
1230 "retrying stop request",
1231 instance.task_state, instance=instance)
1232 self.stop_instance(context, instance, True)
1233 except Exception:
 1234 # we don't want an exception to block init_host
1235 LOG.exception('Failed to stop instance', instance=instance)
1236 return
1238 if instance.task_state == task_states.POWERING_ON:
1239 try:
1240 LOG.debug("Instance in transitional state %s at start-up "
1241 "retrying start request",
1242 instance.task_state, instance=instance)
1243 self.start_instance(context, instance)
1244 except Exception:
 1245 # we don't want an exception to block init_host
1246 LOG.exception('Failed to start instance', instance=instance)
1247 return
1249 net_info = instance.get_network_info()
1250 try:
1251 self.driver.plug_vifs(instance, net_info)
1252 except NotImplementedError as e:
1253 LOG.debug(e, instance=instance)
1254 except exception.VirtualInterfacePlugException:
1255 # NOTE(mriedem): If we get here, it could be because the vif_type
1256 # in the cache is "binding_failed" or "unbound".
1257 # The periodic task _heal_instance_info_cache checks for this
1258 # condition. It should fix this by binding the ports again when
1259 # it gets to this instance.
1260 LOG.exception('Virtual interface plugging failed for instance. '
1261 'The port binding:host_id may need to be manually '
1262 'updated.', instance=instance)
1263 self._set_instance_obj_error_state(instance)
1264 return
1265 except exception.PciDeviceNotFoundById:
1266 # This is bug 1981813 where the bound port vnic_type has changed
1267 # from direct to macvtap. Nova does not support that and it
1268 # already printed an ERROR when the change is detected during
1269 # _heal_instance_info_cache. Now we print an ERROR again and skip
1270 # plugging the vifs but let the service startup continue to init
1271 # the other instances
1272 LOG.exception(
1273 'Virtual interface plugging failed for instance. Probably the '
1274 'vnic_type of the bound port has been changed. Nova does not '
1275 'support such change.',
1276 instance=instance
1277 )
1278 return
1280 if instance.task_state == task_states.RESIZE_MIGRATING:
1281 # We crashed during resize/migration, so roll back for safety
1282 try:
1283 # NOTE(mriedem): check old_vm_state for STOPPED here, if it's
1284 # not in system_metadata we default to True for backwards
1285 # compatibility
1286 power_on = (instance.system_metadata.get('old_vm_state') !=
1287 vm_states.STOPPED)
1289 block_dev_info = self._get_instance_block_device_info(context,
1290 instance)
1292 migration = objects.Migration.get_by_id_and_instance(
1293 context, instance.migration_context.migration_id,
1294 instance.uuid)
1295 self.driver.finish_revert_migration(context, instance,
1296 net_info, migration, block_dev_info, power_on)
1298 except Exception:
1299 LOG.exception('Failed to revert crashed migration',
1300 instance=instance)
1301 finally:
1302 LOG.info('Instance found in migrating state during '
1303 'startup. Resetting task_state',
1304 instance=instance)
1305 instance.task_state = None
1306 instance.save()
1307 if instance.task_state == task_states.MIGRATING:
1308 # Live migration did not complete, but instance is on this
1309 # host. Abort ongoing migration if still running and reset state.
1310 self._reset_live_migration(context, instance)
1312 db_state = instance.power_state
1313 drv_state = self._get_power_state(instance)
1314 expect_running = (db_state == power_state.RUNNING and
1315 drv_state != db_state)
1317 LOG.debug('Current state is %(drv_state)s, state in DB is '
1318 '%(db_state)s.',
1319 {'drv_state': drv_state, 'db_state': db_state},
1320 instance=instance)
1322 if expect_running and CONF.resume_guests_state_on_host_boot:
1323 self._resume_guests_state(context, instance, net_info)
1325 def _resume_guests_state(self, context, instance, net_info):
1326 LOG.info('Rebooting instance after nova-compute restart.',
1327 instance=instance)
1328 block_device_info = \
1329 self._get_instance_block_device_info(context, instance)
1331 share_info = self._get_share_info(context, instance)
1332 self._mount_all_shares(context, instance, share_info)
1334 try:
1335 self.driver.resume_state_on_host_boot(
1336 context, instance, net_info, share_info, block_device_info)
1337 except NotImplementedError:
1338 LOG.warning('Hypervisor driver does not support '
1339 'resume guests', instance=instance)
1340 except Exception:
1341 # NOTE(vish): The instance failed to resume, so we set the
1342 # instance to error and attempt to continue.
1343 LOG.warning('Failed to resume instance',
1344 instance=instance)
1345 self._set_instance_obj_error_state(instance)
1347 def _retry_reboot(self, instance, current_power_state):
1348 current_task_state = instance.task_state
1349 retry_reboot = False
1350 reboot_type = compute_utils.get_reboot_type(current_task_state,
1351 current_power_state)
1353 pending_soft = (
1354 current_task_state == task_states.REBOOT_PENDING and
1355 instance.vm_state in vm_states.ALLOW_SOFT_REBOOT)
1356 pending_hard = (
1357 current_task_state == task_states.REBOOT_PENDING_HARD and
1358 instance.vm_state in vm_states.ALLOW_HARD_REBOOT)
1359 started_not_running = (current_task_state in
1360 [task_states.REBOOTING,
1361 task_states.REBOOTING_HARD,
1362 task_states.REBOOT_STARTED,
1363 task_states.REBOOT_STARTED_HARD] and
1364 current_power_state != power_state.RUNNING)
1366 if pending_soft or pending_hard or started_not_running:
1367 retry_reboot = True
1369 return retry_reboot, reboot_type
1371 def handle_lifecycle_event(self, event):
1372 LOG.info("VM %(state)s (Lifecycle Event)",
1373 {'state': event.get_name()},
1374 instance_uuid=event.get_instance_uuid())
1375 context = nova.context.get_admin_context(read_deleted='yes')
1376 vm_power_state = None
1377 event_transition = event.get_transition()
1378 if event_transition == virtevent.EVENT_LIFECYCLE_STOPPED:
1379 vm_power_state = power_state.SHUTDOWN
1380 elif event_transition == virtevent.EVENT_LIFECYCLE_STARTED:
1381 vm_power_state = power_state.RUNNING
1382 elif event_transition in (
1383 virtevent.EVENT_LIFECYCLE_PAUSED,
1384 virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED,
1385 virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED):
1386 vm_power_state = power_state.PAUSED
1387 elif event_transition == virtevent.EVENT_LIFECYCLE_RESUMED:
1388 vm_power_state = power_state.RUNNING
1389 elif event_transition == virtevent.EVENT_LIFECYCLE_SUSPENDED:
1390 vm_power_state = power_state.SUSPENDED
1391 else:
1392 LOG.warning("Unexpected lifecycle event: %d", event_transition)
1394 migrate_finish_statuses = {
1395 # This happens on the source node and indicates live migration
1396 # entered post-copy mode.
1397 virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED: 'running (post-copy)',
1398 # Suspended for offline migration.
1399 virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED: 'running'
1400 }
1402 expected_attrs = []
1403 if event_transition in migrate_finish_statuses:
1404 # Join on info_cache since that's needed in migrate_instance_start.
1405 expected_attrs.append('info_cache')
1406 instance = objects.Instance.get_by_uuid(context,
1407 event.get_instance_uuid(),
1408 expected_attrs=expected_attrs)
1410 # Note(lpetrut): The event may be delayed, thus not reflecting
1411 # the current instance power state. In that case, ignore the event.
1412 current_power_state = self._get_power_state(instance)
1413 if current_power_state == vm_power_state:
1414 LOG.debug('Synchronizing instance power state after lifecycle '
1415 'event "%(event)s"; current vm_state: %(vm_state)s, '
1416 'current task_state: %(task_state)s, current DB '
1417 'power_state: %(db_power_state)s, VM power_state: '
1418 '%(vm_power_state)s',
1419 {'event': event.get_name(),
1420 'vm_state': instance.vm_state,
1421 'task_state': instance.task_state,
1422 'db_power_state': instance.power_state,
1423 'vm_power_state': vm_power_state},
1424 instance_uuid=instance.uuid)
1425 self._sync_instance_power_state(context,
1426 instance,
1427 vm_power_state)
1429 # The following checks are for live migration. We want to activate
1430 # the port binding for the destination host before the live migration
1431 # is resumed on the destination host in order to reduce network
1432 # downtime. Otherwise the ports are bound to the destination host
1433 # in post_live_migration_at_destination.
1434 # TODO(danms): Explore options for using a different live migration
1435 # specific callback for this instead of piggy-backing on the
1436 # handle_lifecycle_event callback.
1437 if (instance.task_state == task_states.MIGRATING and
1438 event_transition in migrate_finish_statuses):
1439 status = migrate_finish_statuses[event_transition]
1440 try:
1441 migration = objects.Migration.get_by_instance_and_status(
1442 context, instance.uuid, status)
1443 LOG.debug('Binding ports to destination host: %s',
1444 migration.dest_compute, instance=instance)
1445 # For neutron, migrate_instance_start will activate the
1446 # destination host port bindings, if there are any created by
1447 # conductor before live migration started.
1448 self.network_api.migrate_instance_start(
1449 context, instance, migration)
1450 except exception.MigrationNotFoundByStatus:
1451 LOG.warning("Unable to find migration record with status "
1452 "'%s' for instance. Port binding will happen in "
1453 "post live migration.", status, instance=instance)
1455 def handle_events(self, event):
1456 if isinstance(event, virtevent.LifecycleEvent):  # 1456 ↛ 1463: line 1456 didn't jump to line 1463 because the condition on line 1456 was always true
1457 try:
1458 self.handle_lifecycle_event(event)
1459 except exception.InstanceNotFound:
1460 LOG.debug("Event %s arrived for non-existent instance. The "
1461 "instance was probably deleted.", event)
1462 else:
1463 LOG.debug("Ignoring event %s", event)
1465 def init_virt_events(self):
1466 if CONF.workarounds.handle_virt_lifecycle_events:
1467 self.driver.register_event_listener(self.handle_events)
1468 else:
1469 # NOTE(mriedem): If the _sync_power_states periodic task is
1470 # disabled we should emit a warning in the logs.
1471 if CONF.sync_power_state_interval < 0:  # 1471 ↛ 1472: line 1471 didn't jump to line 1472 because the condition on line 1471 was never true
1472 LOG.warning('Instance lifecycle events from the compute '
1473 'driver have been disabled. Note that lifecycle '
1474 'changes to an instance outside of the compute '
1475 'service will not be synchronized '
1476 'automatically since the _sync_power_states '
1477 'periodic task is also disabled.')
1478 else:
1479 LOG.info('Instance lifecycle events from the compute '
1480 'driver have been disabled. Note that lifecycle '
1481 'changes to an instance outside of the compute '
1482 'service will only be synchronized by the '
1483 '_sync_power_states periodic task.')
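# NOTE(editor): a minimal sketch (not part of this module) of how a driver
# event flows through the listener registered above; the instance uuid is
# made up and `manager` stands in for a ComputeManager instance.
#
#     event = virtevent.LifecycleEvent(
#         'c1b2-example-uuid', virtevent.EVENT_LIFECYCLE_STOPPED)
#     manager.handle_events(event)
#     # -> handle_lifecycle_event() logs the event and, if the hypervisor
#     #    confirms the guest is down, syncs the DB power_state to SHUTDOWN.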
1485 def _get_nodes(self, context):
1486 """Queried the ComputeNode objects from the DB that are reported by the
1487 hypervisor.
1489 :param context: the request context
1490 :return: a dict of ComputeNode objects keyed by the UUID of the given
1491 node.
1492 """
1493 try:
1494 node_ids = self.driver.get_nodenames_by_uuid()
1495 except exception.VirtDriverNotReady:
1496 LOG.warning(
1497 "Virt driver is not ready. If this is the first time this "
1498 "service is starting on this host, then you can ignore "
1499 "this warning.")
1500 return {}
1502 nodes = objects.ComputeNodeList.get_all_by_uuids(context,
1503 list(node_ids.keys()))
1504 if not nodes:
1505 # NOTE(danms): This should only happen if the compute_id is
1506 # pre-provisioned on a host that has never started.
1507 LOG.warning('Compute nodes %s for host %s were not found in the '
1508 'database. If this is the first time this service is '
1509 'starting on this host, then you can ignore this '
1510 'warning.',
1511 list(node_ids.keys()), self.host)
1512 return {}
1514 for node in nodes:
1515 if node.hypervisor_hostname != node_ids.get(node.uuid):
1516 raise exception.InvalidConfiguration(
1517 ('My compute node %s has hypervisor_hostname %s '
1518 'but virt driver reports it should be %s. Possible '
1519 'rename detected, refusing to start!') % (
1520 node.uuid, node.hypervisor_hostname,
1521 node_ids.get(node.uuid)))
1523 return {n.uuid: n for n in nodes}
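# NOTE(editor): a minimal sketch (not part of this module), assuming a
# single-node libvirt host, of the shapes flowing through _get_nodes():
#
#     node_ids = {'3f1e-example-uuid': 'compute-1.example.org'}  # from driver
#     nodes = {'3f1e-example-uuid': ComputeNode(
#         hypervisor_hostname='compute-1.example.org')}          # from the DB
#     # Any mismatch between the driver-reported hostname and the DB record is
#     # treated as a probable rename and aborts startup (InvalidConfiguration).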
1525 def _ensure_existing_node_identity(self, service_ref):
1526 """If we are upgrading from an older service version, we need
1527 to write our node identity uuid (if not already done) based on
1528 nodes assigned to us in the database.
1529 """
1530 if 'ironic' in CONF.compute_driver.lower():
1531 # We do not persist a single local node identity for
1532 # ironic
1533 return
1535 if service_ref.version >= service_obj.NODE_IDENTITY_VERSION:
1536 # Already new enough, nothing to do here, but make sure that we
1537 # have a UUID file already, as this is not our first time starting.
1538 if nova.virt.node.read_local_node_uuid() is None:
1539 raise exception.InvalidConfiguration(
1540 ('No local node identity found, but this is not our '
1541 'first startup on this host. Refusing to start after '
1542 'potentially having lost that state!'))
1543 return
1545 if nova.virt.node.read_local_node_uuid():
1546 # We already have a local node identity, no migration needed
1547 return
1549 context = nova.context.get_admin_context()
1550 db_nodes = objects.ComputeNodeList.get_all_by_host(context, self.host)
1551 if not db_nodes:
1552 # This means we have no nodes in the database (that we
1553 # know of) and thus have no existing UUID to record. That
1554 # likely indicates a rename or misconfiguration, so refuse to start.
1555 raise exception.InvalidConfiguration(
1556 ('Upgrading from service version %i but found no '
1557 'nodes in the database for host %s to persist '
1558 'locally; Possible rename detected, '
1559 'refusing to start!') % (
1560 service_ref.version, self.host))
1562 if len(db_nodes) > 1:
1563 # If this happens we can't do the right thing, so raise an
1564 # exception to abort host startup
1565 LOG.warning('Multiple nodes found in the database for host %s; '
1566 'unable to persist local node identity automatically', self.host)
1567 raise exception.InvalidConfiguration(
1568 'Multiple nodes found in database, manual node uuid '
1569 'configuration required')
1571 nova.virt.node.write_local_node_uuid(db_nodes[0].uuid)
1573 def _check_for_host_rename(self, nodes_by_uuid):
1574 if 'ironic' in CONF.compute_driver.lower():
1575 # Ironic (currently) rebalances nodes at various times, and as
1576 # such, nodes being discovered as assigned to this host with a
1577 # different hostname is not surprising. Skip this check for
1578 # ironic.
1579 return
1580 for node in nodes_by_uuid.values():
1581 if node.host != self.host:
1582 raise exception.InvalidConfiguration(
1583 'My node %s has host %r but my host is %r; '
1584 'Possible rename detected, refusing to start!' % (
1585 node.uuid, node.host, self.host))
1586 LOG.debug('Verified node %s matches my host %s',
1587 node.uuid, self.host)
1589 def _sanity_check_new_host(self):
1590 instances_on_hv = self.driver.list_instance_uuids()
1591 if len(instances_on_hv) > 0:
1592 # This means we have instances on our hypervisor, but we think
1593 # we are a new host (i.e. we created a new service record). That
1594 # likely means we're pointed at an empty database or the wrong
1595 # cell.
1596 raise exception.InvalidConfiguration(
1597 'My hypervisor has existing instances, but I appear to be '
1598 'a new service in this database. Possible database '
1599 'configuration error, refusing to start!')
1601 def init_host(self, service_ref):
1602 """Initialization for a standalone compute service."""
1604 if service_ref:
1605 # If we are an existing service, check to see if we need
1606 # to record a locally-persistent node identity because
1607 # we have upgraded from a previous version.
1608 self._ensure_existing_node_identity(service_ref)
1609 else:
1610 # If we are a new service (in the database), make sure we have no
1611 # instances on our hypervisor as we would expect.
1612 self._sanity_check_new_host()
1614 if CONF.pci.device_spec:
1615 # Simply loading the PCI passthrough spec will do a bunch of
1616 # validation that would otherwise wait until the PciDevTracker is
1617 # constructed when updating available resources for the compute
1618 # node(s) in the resource tracker, effectively killing that task.
1619 # So load up the spec when starting the compute service to
1620 # flush any invalid configuration early, so we can kill the service
1621 # if the configuration is wrong.
1622 whitelist.Whitelist(CONF.pci.device_spec)
1624 nova.conf.neutron.register_dynamic_opts(CONF)
1625 # Even if only libvirt uses them, make it available for all drivers
1626 nova.conf.devices.register_dynamic_opts(CONF)
1628 # Override the number of concurrent disk operations allowed if the
1629 # user has specified a limit.
1630 if CONF.compute.max_concurrent_disk_ops != 0:  # 1630 ↛ 1631: line 1630 didn't jump to line 1631 because the condition on line 1630 was never true
1631 compute_utils.disk_ops_semaphore = \
1632 eventlet.semaphore.BoundedSemaphore(
1633 CONF.compute.max_concurrent_disk_ops)
1635 if CONF.compute.max_disk_devices_to_attach == 0:
1636 msg = _('[compute]max_disk_devices_to_attach has been set to 0, '
1637 'which will prevent instances from being able to boot. '
1638 'Set -1 for unlimited or set >= 1 to limit the maximum '
1639 'number of disk devices.')
1640 raise exception.InvalidConfiguration(msg)
1642 self.driver.init_host(host=self.host)
1644 # NOTE(gibi): At this point the compute_nodes of the resource tracker
1645 # has not been populated yet so we cannot rely on the resource tracker
1646 # here.
1647 context = nova.context.get_admin_context()
1648 nodes_by_uuid = self._get_nodes(context)
1650 # NOTE(danms): Check for a possible host rename and abort
1651 # startup before we start mucking with instances we think are
1652 # ours.
1653 self._check_for_host_rename(nodes_by_uuid)
1655 instances = objects.InstanceList.get_by_host(
1656 context, self.host,
1657 expected_attrs=['info_cache', 'metadata', 'numa_topology'])
1659 self.init_virt_events()
1661 self._validate_pinning_configuration(instances)
1662 self._validate_vtpm_configuration(instances)
1664 # NOTE(gibi): If the slow start time of the ironic and vcenter virt
1665 # drivers becomes problematic here, then we should consider adding a
1666 # config option or a driver flag to tell us whether to run
1667 # _destroy_evacuated_instances and
1668 # _error_out_instances_whose_build_was_interrupted in the
1669 # background on startup.
1670 try:
1671 # checking that instance was not already evacuated to other host
1672 evacuated_instances = self._destroy_evacuated_instances(
1673 context, nodes_by_uuid)
1675 # Initialise instances on the host that are not evacuating
1676 for instance in instances:
1677 if instance.uuid not in evacuated_instances:
1678 self._init_instance(context, instance)
1680 # NOTE(gibi): collect all the instance uuids that were in some way
1681 # already handled above, either by _init_instance or by
1682 # _destroy_evacuated_instances. This way we can limit the scope of
1683 # the _error_out_instances_whose_build_was_interrupted call to look
1684 # only for instances that have allocations on this node and were not
1685 # handled by the above calls.
1686 already_handled = {instance.uuid for instance in instances}.union(
1687 evacuated_instances)
1688 self._error_out_instances_whose_build_was_interrupted(
1689 context, already_handled, nodes_by_uuid.keys())
1691 finally:
1692 if instances:
1693 # We only send the instance info to the scheduler on startup
1694 # if there is anything to send, otherwise this host might
1695 # not be mapped yet in a cell and the scheduler may have
1696 # issues dealing with the information. Later changes to
1697 # instances on this host will update the scheduler, or the
1698 # _sync_scheduler_instance_info periodic task will.
1699 self._update_scheduler_instance_info(context, instances)
1701 def _error_out_instances_whose_build_was_interrupted(
1702 self, context, already_handled_instances, node_uuids):
1703 """If there are instances in BUILDING state that are not
1704 assigned to this host but have allocations in placement towards
1705 this compute that means the nova-compute service was
1706 restarted while those instances waited for the resource claim
1707 to finish and the _set_instance_host_and_node() to update the
1708 instance.host field. We need to push them to ERROR state here to
1709 prevent keeping them in BUILDING state forever.
1711 :param context: The request context
1712 :param already_handled_instances: The set of instance UUIDs that the
1713 host initialization process already handled in some way.
1714 :param node_uuids: The list of compute node uuids handled by this
1715 service
1716 """
1718 # Strategy:
1719 # 1) Get the allocations from placement for our compute node(s)
1720 # 2) Remove the already handled instances from the consumer list;
1721 # they are either already initialized or need to be skipped.
1722 # 3) Check which remaining consumer is an instance in BUILDING state
1723 # and push it to ERROR state.
1725 LOG.info(
1726 "Looking for unclaimed instances stuck in BUILDING status for "
1727 "nodes managed by this host")
1728 for cn_uuid in node_uuids:
1729 try:
1730 f = self.reportclient.get_allocations_for_resource_provider
1731 allocations = f(context, cn_uuid).allocations
1732 except (exception.ResourceProviderAllocationRetrievalFailed,
1733 keystone_exception.ClientException) as e:
1734 LOG.error(
1735 "Could not retrieve compute node resource provider %s and "
1736 "therefore unable to error out any instances stuck in "
1737 "BUILDING state. Error: %s", cn_uuid, str(e))
1738 continue
1740 not_handled_consumers = (set(allocations) -
1741 already_handled_instances)
1743 if not not_handled_consumers:
1744 continue
1746 filters = {
1747 'vm_state': vm_states.BUILDING,
1748 'uuid': not_handled_consumers
1749 }
1751 instances = objects.InstanceList.get_by_filters(
1752 context, filters, expected_attrs=[])
1754 for instance in instances:
1755 LOG.debug(
1756 "Instance spawn was interrupted before instance_claim, "
1757 "setting instance to ERROR state", instance=instance)
1758 self._set_instance_obj_error_state(
1759 instance, clean_task_state=True)
1761 def cleanup_host(self):
1762 self.driver.register_event_listener(None)
1763 self.instance_events.cancel_all_events()
1764 self.driver.cleanup_host(host=self.host)
1765 self._cleanup_live_migrations_in_pool()
1767 def _cleanup_live_migrations_in_pool(self):
1768 # Shutdown the pool so we don't get new requests.
1769 self._live_migration_executor.shutdown(wait=False)
1770 # For any queued migrations, cancel the migration and update
1771 # its status.
1772 for migration, future in self._waiting_live_migrations.values():
1773 # If we got here before the Future was submitted then we need
1774 # to move on since there isn't anything we can do.
1775 if future is None:
1776 continue
1777 if future.cancel():  # 1777 ↛ 1782: line 1777 didn't jump to line 1782 because the condition on line 1777 was always true
1778 self._set_migration_status(migration, 'cancelled')
1779 LOG.info('Successfully cancelled queued live migration.',
1780 instance_uuid=migration.instance_uuid)
1781 else:
1782 LOG.warning('Unable to cancel live migration.',
1783 instance_uuid=migration.instance_uuid)
1784 self._waiting_live_migrations.clear()
1786 def pre_start_hook(self, service_ref):
1787 """After the service is initialized, but before we fully bring
1788 the service up by listening on RPC queues, make sure to update
1789 our available resources (and indirectly our available nodes).
1790 """
1791 self.service_ref = service_ref
1792 self.rt.set_service_ref(service_ref)
1793 self.update_available_resource(nova.context.get_admin_context(),
1794 startup=True)
1796 def _get_power_state(self, instance):
1797 """Retrieve the power state for the given instance."""
1798 LOG.debug('Checking state', instance=instance)
1799 try:
1800 return self.driver.get_info(instance, use_cache=False).state
1801 except exception.InstanceNotFound:
1802 return power_state.NOSTATE
1804 def _await_block_device_map_created(self, context, vol_id):
1805 # TODO(yamahata): creating volume simultaneously
1806 # reduces creation time?
1807 # TODO(yamahata): eliminate dumb polling
1808 start = time.time()
1809 retries = CONF.block_device_allocate_retries
1810 # (1) if the configured value is 0, one attempt should be made
1811 # (2) if the configured value is > 0, then the total number of
1812 # attempts is (retries + 1)
1813 attempts = 1
1814 if retries >= 1:
1815 attempts = retries + 1
1816 for attempt in range(1, attempts + 1):
1817 volume = self.volume_api.get(context, vol_id)
1818 volume_status = volume['status']
1819 if volume_status not in ['creating', 'downloading']:
1820 if volume_status == 'available':
1821 return attempt
1822 LOG.warning("Volume id: %(vol_id)s finished being "
1823 "created but its status is %(vol_status)s.",
1824 {'vol_id': vol_id,
1825 'vol_status': volume_status})
1826 break
1827 greenthread.sleep(CONF.block_device_allocate_retries_interval)
1828 raise exception.VolumeNotCreated(volume_id=vol_id,
1829 seconds=int(time.time() - start),
1830 attempts=attempt,
1831 volume_status=volume_status)
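# NOTE(editor): a minimal sketch (not part of this module) of the retry
# accounting used above:
#
#     retries = CONF.block_device_allocate_retries
#     attempts = 1 if retries < 1 else retries + 1
#     # retries = 0 -> 1 attempt (a single poll of the volume status)
#     # retries = 3 -> 4 attempts, each separated by
#     #                CONF.block_device_allocate_retries_interval seconds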
1833 def _decode_files(self, injected_files):
1834 """Base64 decode the list of files to inject."""
1835 if not injected_files:
1836 return []
1838 def _decode(f):
1839 path, contents = f
1840 # Py3 raises binascii.Error instead of TypeError as in Py27
1841 try:
1842 decoded = base64.b64decode(contents)
1843 return path, decoded
1844 except (TypeError, binascii.Error):
1845 raise exception.Base64Exception(path=path)
1847 return [_decode(f) for f in injected_files]
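# NOTE(editor): a minimal sketch (not part of this module) of the expected
# input and output of _decode_files(); the path and payload are made up.
#
#     injected_files = [('/etc/motd', base64.b64encode(b'hello'))]
#     self._decode_files(injected_files)
#     # -> [('/etc/motd', b'hello')]
#     # A payload that is not valid base64 raises exception.Base64Exception
#     # with the offending path.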
1849 def _validate_instance_group_policy(self, context, instance,
1850 scheduler_hints=None):
1852 if CONF.workarounds.disable_group_policy_check_upcall:
1853 return
1855 # NOTE(russellb) Instance group policy is enforced by the scheduler.
1856 # However, there is a race condition with the enforcement of
1857 # the policy. Since more than one instance may be scheduled at the
1858 # same time, it's possible that more than one instance with an
1859 # anti-affinity policy may end up here. It's also possible that
1860 # multiple instances with an affinity policy could end up on different
1861 # hosts. This is a validation step to make sure that starting the
1862 # instance here doesn't violate the policy.
1863 if scheduler_hints is not None:
1864 # only go through here if scheduler_hints is provided,
1865 # even if it is empty.
1866 group_hint = scheduler_hints.get('group')
1867 if not group_hint:
1868 return
1869 else:
1870 # The RequestSpec stores scheduler_hints as key=list pairs
1871 # so we need to check the type on the value and pull the
1872 # single entry out. The API request schema validates that
1873 # the 'group' hint is a single value.
1874 if isinstance(group_hint, list):
1875 group_hint = group_hint[0]
1876 try:
1877 group = objects.InstanceGroup.get_by_hint(
1878 context, group_hint
1879 )
1880 except exception.InstanceGroupNotFound:
1881 return
1882 else:
1883 # TODO(ganso): a call to DB can be saved by adding request_spec
1884 # to rpcapi payload of live_migration, pre_live_migration and
1885 # check_can_live_migrate_destination
1886 try:
1887 group = objects.InstanceGroup.get_by_instance_uuid(
1888 context, instance.uuid
1889 )
1890 except exception.InstanceGroupNotFound:
1891 return
1893 @utils.synchronized(group['uuid'])
1894 def _do_validation(context, instance, group):
1895 if group.policy and 'anti-affinity' == group.policy:
1897 # instances on host
1898 instances_uuids = objects.InstanceList.get_uuids_by_host(
1899 context, self.host)
1900 ins_on_host = set(instances_uuids)
1902 # instance param is just for logging, the nodename obtained is
1903 # not actually related to the instance at all
1904 nodename = self._get_nodename(instance)
1906 # instances being migrated to host
1907 migrations = (
1908 objects.MigrationList.get_in_progress_by_host_and_node(
1909 context, self.host, nodename))
1910 migration_vm_uuids = {mig.instance_uuid for mig in migrations}
1912 total_instances = migration_vm_uuids | ins_on_host
1914 # refresh group to get updated members within locked block
1915 group = objects.InstanceGroup.get_by_uuid(context,
1916 group['uuid'])
1917 members = set(group.members)
1918 # Determine the set of instance group members on this host
1919 # which are not the instance in question. This is used to
1920 # determine how many other members from the same anti-affinity
1921 # group can be on this host.
1922 members_on_host = (total_instances & members -
1923 set([instance.uuid]))
1924 rules = group.rules
1925 if rules and 'max_server_per_host' in rules:  # 1925 ↛ 1928: line 1925 didn't jump to line 1928 because the condition on line 1925 was always true
1926 max_server = rules['max_server_per_host']
1927 else:
1928 max_server = 1
1929 if len(members_on_host) >= max_server:
1930 raise exception.GroupAffinityViolation(
1931 instance_uuid=instance.uuid, policy='Anti-affinity')
1933 # NOTE(ganso): The check for affinity below does not work and it
1934 # can easily be violated because the lock happens in different
1935 # compute hosts.
1936 # The only fix seems to be a DB lock to perform the check whenever
1937 # setting the host field to an instance.
1938 elif group.policy and 'affinity' == group.policy:  # 1938 ↛ 1939: line 1938 didn't jump to line 1939 because the condition on line 1938 was never true
1939 group_hosts = group.get_hosts(exclude=[instance.uuid])
1940 if group_hosts and self.host not in group_hosts:
1941 raise exception.GroupAffinityViolation(
1942 instance_uuid=instance.uuid, policy='Affinity')
1944 _do_validation(context, instance, group)
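# NOTE(editor): a minimal sketch (not part of this module) of the
# anti-affinity counting performed above, with made-up UUIDs where 'd' is the
# instance being validated:
#
#     ins_on_host = {'a', 'b'}            # instances already on this host
#     migration_vm_uuids = {'c'}          # instances migrating to this host
#     members = {'a', 'c', 'd'}           # refreshed group membership
#     members_on_host = (ins_on_host | migration_vm_uuids) & (members - {'d'})
#     # len(members_on_host) == 2; with the default max_server_per_host of 1
#     # this raises GroupAffinityViolation.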
1946 def _log_original_error(self, exc_info, instance_uuid):
1947 LOG.error('Error: %s', exc_info[1], instance_uuid=instance_uuid,
1948 exc_info=exc_info)
1950 @periodic_task.periodic_task
1951 def _check_instance_build_time(self, context):
1952 """Ensure that instances are not stuck in build."""
1953 timeout = CONF.instance_build_timeout
1954 if timeout == 0:
1955 return
1957 filters = {'vm_state': vm_states.BUILDING,
1958 'host': self.host}
1960 building_insts = objects.InstanceList.get_by_filters(context,
1961 filters, expected_attrs=[], use_slave=True)
1963 for instance in building_insts:
1964 if timeutils.is_older_than(instance.created_at, timeout):
1965 self._set_instance_obj_error_state(instance)
1966 LOG.warning("Instance build timed out. Set to error "
1967 "state.", instance=instance)
1969 def _check_instance_exists(self, instance):
1970 """Ensure an instance with the same name is not already present."""
1971 if self.driver.instance_exists(instance):
1972 raise exception.InstanceExists(name=instance.name)
1974 def _allocate_network_async(self, context, instance, requested_networks,
1975 security_groups, resource_provider_mapping,
1976 network_arqs):
1977 """Method used to allocate networks in the background.
1979 Broken out for testing.
1980 """
1981 # First check to see if we're specifically not supposed to allocate
1982 # networks; if so, we can exit early.
1983 if requested_networks and requested_networks.no_allocate:
1984 LOG.debug("Not allocating networking since 'none' was specified.",
1985 instance=instance)
1986 return network_model.NetworkInfo([])
1988 LOG.debug("Allocating IP information in the background.",
1989 instance=instance)
1990 retries = CONF.network_allocate_retries
1991 attempts = retries + 1
1992 retry_time = 1
1993 bind_host_id = self.driver.network_binding_host_id(context, instance)
1994 for attempt in range(1, attempts + 1):  # 1994 ↛ exit: line 1994 didn't return from function '_allocate_network_async' because the loop on line 1994 didn't complete
1995 try:
1996 nwinfo = self.network_api.allocate_for_instance(
1997 context, instance,
1998 requested_networks=requested_networks,
1999 security_groups=security_groups,
2000 bind_host_id=bind_host_id,
2001 resource_provider_mapping=resource_provider_mapping,
2002 network_arqs=network_arqs)
2003 LOG.debug('Instance network_info: |%s|', nwinfo,
2004 instance=instance)
2005 instance.system_metadata['network_allocated'] = 'True'
2006 # NOTE(JoshNang) do not save the instance here, as it can cause
2007 # races. The caller shares a reference to instance and waits
2008 # for this async greenthread to finish before calling
2009 # instance.save().
2010 return nwinfo
2011 except Exception as e:
2012 log_info = {'attempt': attempt,
2013 'attempts': attempts}
2014 if attempt == attempts:
2015 LOG.exception('Instance failed network setup '
2016 'after %(attempts)d attempt(s)',
2017 log_info)
2018 raise e
2019 LOG.warning('Instance failed network setup '
2020 '(attempt %(attempt)d of %(attempts)d)',
2021 log_info, instance=instance)
2022 time.sleep(retry_time)
2023 retry_time *= 2
2024 if retry_time > 30:
2025 retry_time = 30
2026 # Not reached.
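# NOTE(editor): a minimal sketch (not part of this module) of the backoff
# schedule used above, assuming CONF.network_allocate_retries = 5:
#
#     retries = 5
#     attempts = retries + 1              # 6 attempts in total
#     retry_time, delays = 1, []
#     for _failed_attempt in range(attempts - 1):
#         delays.append(retry_time)
#         retry_time = min(retry_time * 2, 30)
#     # delays between attempts: [1, 2, 4, 8, 16]; the doubling caps at 30s.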
2028 def _build_networks_for_instance(self, context, instance,
2029 requested_networks, security_groups, resource_provider_mapping,
2030 network_arqs):
2032 # If we're here from a reschedule, the network may already be allocated.
2033 if strutils.bool_from_string(
2034 instance.system_metadata.get('network_allocated', 'False')):
2035 # NOTE(alex_xu): The network_allocated is True means the network
2036 # resource already allocated at previous scheduling, and the
2037 # network setup is cleanup at previous. After rescheduling, the
2038 # network resource need setup on the new host.
2039 self.network_api.setup_instance_network_on_host(
2040 context, instance, instance.host)
2041 return self.network_api.get_instance_nw_info(context, instance)
2043 network_info = self._allocate_network(context, instance,
2044 requested_networks, security_groups,
2045 resource_provider_mapping,
2046 network_arqs)
2048 return network_info
2050 def _allocate_network(self, context, instance, requested_networks,
2051 security_groups, resource_provider_mapping,
2052 network_arqs):
2053 """Start network allocation asynchronously. Return an instance
2054 of NetworkInfoAsyncWrapper that can be used to retrieve the
2055 allocated networks when the operation has finished.
2056 """
2057 # NOTE(comstud): Since we're allocating networks asynchronously,
2058 # this task state has little meaning, as we won't be in this
2059 # state for very long.
2060 instance.vm_state = vm_states.BUILDING
2061 instance.task_state = task_states.NETWORKING
2062 instance.save(expected_task_state=[None])
2064 return network_model.NetworkInfoAsyncWrapper(
2065 self._allocate_network_async, context, instance,
2066 requested_networks, security_groups, resource_provider_mapping,
2067 network_arqs)
2069 def _default_root_device_name(self, instance, image_meta, root_bdm):
2070 """Gets a default root device name from the driver.
2072 :param nova.objects.Instance instance:
2073 The instance for which to get the root device name.
2074 :param nova.objects.ImageMeta image_meta:
2075 The metadata of the image of the instance.
2076 :param nova.objects.BlockDeviceMapping root_bdm:
2077 The description of the root device.
2078 :returns: str -- The default root device name.
2079 :raises: InternalError, TooManyDiskDevices
2080 """
2081 try:
2082 return self.driver.default_root_device_name(instance,
2083 image_meta,
2084 root_bdm)
2085 except NotImplementedError:
2086 return compute_utils.get_next_device_name(instance, [])
2088 def _default_device_names_for_instance(self, instance,
2089 root_device_name,
2090 *block_device_lists):
2091 """Default the missing device names in the BDM from the driver.
2093 :param nova.objects.Instance instance:
2094 The instance for which to get default device names.
2095 :param str root_device_name: The root device name.
2096 :param list block_device_lists: List of block device mappings.
2097 :returns: None
2098 :raises: InternalError, TooManyDiskDevices
2099 """
2100 try:
2101 self.driver.default_device_names_for_instance(instance,
2102 root_device_name,
2103 *block_device_lists)
2104 except NotImplementedError:
2105 compute_utils.default_device_names_for_instance(
2106 instance, root_device_name, *block_device_lists)
2108 def _get_device_name_for_instance(self, instance, bdms, block_device_obj):
2109 """Get the next device name from the driver, based on the BDM.
2111 :param nova.objects.Instance instance:
2112 The instance whose volume is requesting a device name.
2113 :param nova.objects.BlockDeviceMappingList bdms:
2114 The block device mappings for the instance.
2115 :param nova.objects.BlockDeviceMapping block_device_obj:
2116 A block device mapping containing info about the requested block
2117 device.
2118 :returns: The next device name.
2119 :raises: InternalError, TooManyDiskDevices
2120 """
2121 # NOTE(ndipanov): Copy obj to avoid changing the original
2122 block_device_obj = block_device_obj.obj_clone()
2123 try:
2124 return self.driver.get_device_name_for_instance(
2125 instance, bdms, block_device_obj)
2126 except NotImplementedError:
2127 return compute_utils.get_device_name_for_instance(
2128 instance, bdms, block_device_obj.get("device_name"))
2130 def _default_block_device_names(self, instance, image_meta, block_devices):
2131 """Verify that all the devices have the device_name set. If not,
2132 provide a default name.
2134 It also ensures that there is a root_device_name and that it is set
2135 to the first block device in the boot sequence (boot_index=0).
2136 """
2137 root_bdm = block_device.get_root_bdm(block_devices)
2138 if not root_bdm:
2139 return
2141 # Get the root_device_name from the root BDM or the instance
2142 root_device_name = None
2143 update_root_bdm = False
2145 if root_bdm.device_name:
2146 root_device_name = root_bdm.device_name
2147 instance.root_device_name = root_device_name
2148 elif instance.root_device_name:
2149 root_device_name = instance.root_device_name
2150 root_bdm.device_name = root_device_name
2151 update_root_bdm = True
2152 else:
2153 root_device_name = self._default_root_device_name(instance,
2154 image_meta,
2155 root_bdm)
2157 instance.root_device_name = root_device_name
2158 root_bdm.device_name = root_device_name
2159 update_root_bdm = True
2161 if update_root_bdm:
2162 root_bdm.save()
2164 ephemerals = []
2165 swap = []
2166 block_device_mapping = []
2167 image = []
2169 for device in block_devices:
2170 if block_device.new_format_is_ephemeral(device):
2171 ephemerals.append(device)
2173 if block_device.new_format_is_swap(device):
2174 swap.append(device)
2176 if driver_block_device.is_block_device_mapping(device):
2177 block_device_mapping.append(device)
2179 if driver_block_device.is_local_image(device):  # 2179 ↛ 2180: line 2179 didn't jump to line 2180 because the condition on line 2179 was never true
2180 image.append(device)
2182 self._default_device_names_for_instance(instance,
2183 root_device_name,
2184 image,
2185 ephemerals,
2186 swap,
2187 block_device_mapping)
2189 def _add_missing_dev_names(self, bdms, instance):
2190 for bdm in bdms:
2191 if bdm.device_name is not None:
2192 continue
2194 device_name = self._get_device_name_for_instance(instance,
2195 bdms, bdm)
2196 values = {'device_name': device_name}
2197 bdm.update(values)
2198 bdm.save()
2200 def _prep_block_device(self, context, instance, bdms):
2201 """Set up the block device for an instance with error logging."""
2202 try:
2203 self._add_missing_dev_names(bdms, instance)
2204 block_device_info = driver.get_block_device_info(instance, bdms)
2205 mapping = driver.block_device_info_get_mapping(block_device_info)
2206 driver_block_device.attach_block_devices(
2207 mapping, context, instance, self.volume_api, self.driver,
2208 wait_func=self._await_block_device_map_created)
2210 return block_device_info
2212 except exception.OverQuota as e:
2213 LOG.warning('Failed to create block device for instance due'
2214 ' to exceeding volume related resource quota.'
2215 ' Error: %s', e.message, instance=instance)
2216 raise
2218 except Exception as ex:
2219 LOG.exception('Instance failed block device setup',
2220 instance=instance)
2221 # InvalidBDM will eventually result in a BuildAbortException when
2222 # booting from volume, and will be recorded as an instance fault.
2223 # Maintain the original exception message which most likely has
2224 # useful details which the standard InvalidBDM error message lacks.
2225 raise exception.InvalidBDM(str(ex))
2227 def _update_instance_after_spawn(self, instance,
2228 vm_state=vm_states.ACTIVE):
2229 instance.power_state = self._get_power_state(instance)
2230 instance.vm_state = vm_state
2231 instance.task_state = None
2232 # NOTE(sean-k-mooney): configdrive.update_instance checks
2233 # instance.launched_at to determine if it is the first or
2234 # subsequent spawn of an instance. We need to call update_instance
2235 # first before setting instance.launched_at or instance.config_drive
2236 # will never be set to true based on the value of force_config_drive.
2237 # As a result the config drive will be lost on a hard reboot of the
2238 # instance even when force_config_drive=true. see bug #1835822.
2239 configdrive.update_instance(instance)
2240 instance.launched_at = timeutils.utcnow()
2242 def _update_scheduler_instance_info(self, context, instance):
2243 """Sends an InstanceList with created or updated Instance objects to
2244 the Scheduler client.
2246 In the case of init_host, the value passed will already be an
2247 InstanceList. Other calls will send individual Instance objects that
2248 have been created or resized. In this case, we create an InstanceList
2249 object containing that Instance.
2250 """
2251 if not self.send_instance_updates:
2252 return
2253 if isinstance(instance, obj_instance.Instance):
2254 instance = objects.InstanceList(objects=[instance])
2255 context = context.elevated()
2256 self.query_client.update_instance_info(context, self.host,
2257 instance)
2259 def _delete_scheduler_instance_info(self, context, instance_uuid):
2260 """Sends the uuid of the deleted Instance to the Scheduler client."""
2261 if not self.send_instance_updates:
2262 return
2263 context = context.elevated()
2264 self.query_client.delete_instance_info(context, self.host,
2265 instance_uuid)
2267 @periodic_task.periodic_task(spacing=CONF.scheduler_instance_sync_interval)
2268 def _sync_scheduler_instance_info(self, context):
2269 if not self.send_instance_updates:
2270 return
2271 context = context.elevated()
2272 instances = objects.InstanceList.get_by_host(context, self.host,
2273 expected_attrs=[],
2274 use_slave=True)
2275 uuids = [instance.uuid for instance in instances]
2276 self.query_client.sync_instance_info(context, self.host, uuids)
2278 def _notify_about_instance_usage(self, context, instance, event_suffix,
2279 network_info=None, extra_usage_info=None,
2280 fault=None, best_effort=False):
2281 compute_utils.notify_about_instance_usage(
2282 self.notifier, context, instance, event_suffix,
2283 network_info=network_info,
2284 extra_usage_info=extra_usage_info, fault=fault,
2285 best_effort=best_effort)
2287 def _deallocate_network(self, context, instance,
2288 requested_networks=None):
2289 # If we were told not to allocate networks let's save ourselves
2290 # the trouble of calling the network API.
2291 if requested_networks and requested_networks.no_allocate:
2292 LOG.debug("Skipping network deallocation for instance since "
2293 "networking was not requested.", instance=instance)
2294 return
2296 LOG.debug('Deallocating network for instance', instance=instance)
2297 with timeutils.StopWatch() as timer:
2298 self.network_api.deallocate_for_instance(
2299 context, instance, requested_networks=requested_networks)
2300 # nova-network does an rpc call so we're OK tracking time spent here
2301 LOG.info('Took %0.2f seconds to deallocate network for instance.',
2302 timer.elapsed(), instance=instance)
2304 def _get_instance_block_device_info(self, context, instance,
2305 refresh_conn_info=False,
2306 bdms=None):
2307 """Transform block devices to the driver block_device format."""
2309 if bdms is None:
2310 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
2311 context, instance.uuid)
2312 block_device_info = driver.get_block_device_info(instance, bdms)
2314 if not refresh_conn_info:
2315 # if the block_device_mapping has no value in connection_info
2316 # (returned as None), don't include in the mapping
2317 block_device_info['block_device_mapping'] = [
2318 bdm for bdm in driver.block_device_info_get_mapping(
2319 block_device_info)
2320 if bdm.get('connection_info')]
2321 else:
2322 driver_block_device.refresh_conn_infos(
2323 driver.block_device_info_get_mapping(block_device_info),
2324 context, instance, self.volume_api, self.driver)
2326 return block_device_info
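# NOTE(editor): a minimal sketch (not part of this module) of the
# refresh_conn_info=False filtering above, using a made-up mapping:
#
#     mapping = [
#         {'mount_device': '/dev/vda', 'connection_info': {'driver': 'rbd'}},
#         {'mount_device': '/dev/vdb', 'connection_info': None},
#     ]
#     filtered = [bdm for bdm in mapping if bdm.get('connection_info')]
#     # -> only the /dev/vda entry survives; entries without connection_info
#     #    are dropped from block_device_info rather than refreshed.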
2328 def _build_failed(self, node):
2329 if CONF.compute.consecutive_build_service_disable_threshold:
2330 # NOTE(danms): Update our counter, but wait for the next
2331 # update_available_resource() periodic to flush it to the DB
2332 self.rt.build_failed(node)
2334 def _build_succeeded(self, node):
2335 self.rt.build_succeeded(node)
2337 @wrap_exception()
2338 @reverts_task_state
2339 @wrap_instance_fault
2340 def build_and_run_instance(self, context, instance, image, request_spec,
2341 filter_properties, accel_uuids, admin_password=None,
2342 injected_files=None, requested_networks=None,
2343 security_groups=None, block_device_mapping=None,
2344 node=None, limits=None, host_list=None):
2346 @utils.synchronized(instance.uuid)
2347 def _locked_do_build_and_run_instance(*args, **kwargs):
2348 # NOTE(danms): We grab the semaphore with the instance uuid
2349 # locked because we could wait in line to build this instance
2350 # for a while and we want to make sure that nothing else tries
2351 # to do anything with this instance while we wait.
2352 with self._build_semaphore:
2353 try:
2354 result = self._do_build_and_run_instance(*args, **kwargs)
2355 except Exception:
2356 # NOTE(mriedem): This should really only happen if
2357 # _decode_files in _do_build_and_run_instance fails, and
2358 # that's before a guest is spawned so it's OK to remove
2359 # allocations for the instance for this node from Placement
2360 # below as there is no guest consuming resources anyway.
2361 # The _decode_files case could be handled more specifically
2362 # but that's left for another day.
2363 result = build_results.FAILED
2364 raise
2365 finally:
2366 if result == build_results.FAILED:
2367 # Remove the allocation records from Placement for the
2368 # instance if the build failed. The instance.host is
2369 # likely set to None in _do_build_and_run_instance
2370 # which means if the user deletes the instance, it
2371 # will be deleted in the API, not the compute service.
2372 # Setting the instance.host to None in
2373 # _do_build_and_run_instance means that the
2374 # ResourceTracker will no longer consider this instance
2375 # to be claiming resources against it, so we want to
2376 # reflect that same thing in Placement. No need to
2377 # call this for a reschedule, as the allocations will
2378 # have already been removed in
2379 # self._do_build_and_run_instance().
2380 self.reportclient.delete_allocation_for_instance(
2381 context, instance.uuid, force=True)
2383 if result in (build_results.FAILED_BY_POLICY,
2384 build_results.RESCHEDULED_BY_POLICY):
2385 return
2386 if result in (build_results.FAILED,
2387 build_results.RESCHEDULED):
2388 self._build_failed(node)
2389 else:
2390 self._build_succeeded(node)
2392 # NOTE(danms): We spawn here to return the RPC worker thread back to
2393 # the pool. Since what follows could take a really long time, we don't
2394 # want to tie up RPC workers.
2395 utils.spawn_n(_locked_do_build_and_run_instance,
2396 context, instance, image, request_spec,
2397 filter_properties, admin_password, injected_files,
2398 requested_networks, security_groups,
2399 block_device_mapping, node, limits, host_list,
2400 accel_uuids)
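# NOTE(editor): a minimal sketch (not part of this module) of the concurrency
# pattern used above: a per-instance lock nested inside the host-wide build
# semaphore, all run off the RPC worker thread.
#
#     @utils.synchronized(instance.uuid)      # serialize work on this instance
#     def _locked(*args, **kwargs):
#         with self._build_semaphore:         # bound concurrent builds per host
#             return self._do_build_and_run_instance(*args, **kwargs)
#
#     utils.spawn_n(_locked, ...)             # return the RPC worker immediately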
2402 def _check_device_tagging(self, requested_networks, block_device_mapping):
2403 tagging_requested = False
2404 if requested_networks:
2405 for net in requested_networks:
2406 if 'tag' in net and net.tag is not None:
2407 tagging_requested = True
2408 break
2409 if block_device_mapping and not tagging_requested:
2410 for bdm in block_device_mapping:
2411 if 'tag' in bdm and bdm.tag is not None:
2412 tagging_requested = True
2413 break
2414 if (tagging_requested and
2415 not self.driver.capabilities.get('supports_device_tagging',
2416 False)):
2417 raise exception.BuildAbortException('Attempt to boot guest with '
2418 'tagged devices on host that '
2419 'does not support tagging.')
2421 def _check_trusted_certs(self, instance):
2422 if (instance.trusted_certs and
2423 not self.driver.capabilities.get('supports_trusted_certs',
2424 False)):
2425 raise exception.BuildAbortException(
2426 'Trusted image certificates provided on host that does not '
2427 'support certificate validation.')
2429 @wrap_exception()
2430 @reverts_task_state
2431 @wrap_instance_event(prefix='compute')
2432 @wrap_instance_fault
2433 def _do_build_and_run_instance(self, context, instance, image,
2434 request_spec, filter_properties, admin_password, injected_files,
2435 requested_networks, security_groups, block_device_mapping,
2436 node=None, limits=None, host_list=None, accel_uuids=None):
2438 try:
2439 LOG.debug('Starting instance...', instance=instance)
2440 instance.vm_state = vm_states.BUILDING
2441 instance.task_state = None
2442 instance.save(expected_task_state=
2443 (task_states.SCHEDULING, None))
2444 except exception.InstanceNotFound:
2445 msg = 'Instance disappeared before build.'
2446 LOG.debug(msg, instance=instance)
2447 return build_results.FAILED
2448 except exception.UnexpectedTaskStateError as e:
2449 LOG.debug(e.format_message(), instance=instance)
2450 return build_results.FAILED
2452 # b64 decode the files to inject:
2453 decoded_files = self._decode_files(injected_files)
2455 if limits is None:
2456 limits = {}
2458 if node is None:
2459 node = self._get_nodename(instance, refresh=True)
2461 try:
2462 with timeutils.StopWatch() as timer:
2463 self._build_and_run_instance(context, instance, image,
2464 decoded_files, admin_password, requested_networks,
2465 security_groups, block_device_mapping, node, limits,
2466 filter_properties, request_spec, accel_uuids)
2467 LOG.info('Took %0.2f seconds to build instance.',
2468 timer.elapsed(), instance=instance)
2469 return build_results.ACTIVE
2470 except exception.RescheduledException as e:
2471 retry = filter_properties.get('retry')
2472 if not retry:
2473 # no retry information, do not reschedule.
2474 LOG.debug("Retry info not present, will not reschedule",
2475 instance=instance)
2476 self._cleanup_allocated_networks(context, instance,
2477 requested_networks)
2478 compute_utils.add_instance_fault_from_exc(context,
2479 instance, e, sys.exc_info(),
2480 fault_message=e.kwargs['reason'])
2481 self._nil_out_instance_obj_host_and_node(instance)
2482 self._set_instance_obj_error_state(instance,
2483 clean_task_state=True)
2484 if isinstance(e, exception.RescheduledByPolicyException):
2485 return build_results.FAILED_BY_POLICY
2486 return build_results.FAILED
2487 LOG.debug(e.format_message(), instance=instance)
2488 # This will be used for logging the exception
2489 retry['exc'] = traceback.format_exception(*sys.exc_info())
2490 # This will be used for setting the instance fault message
2491 retry['exc_reason'] = e.kwargs['reason']
2493 self._cleanup_allocated_networks(context, instance,
2494 requested_networks)
2496 self._nil_out_instance_obj_host_and_node(instance)
2497 instance.task_state = task_states.SCHEDULING
2498 instance.save()
2499 # The instance will have already claimed resources from this host
2500 # before this build was attempted. Now that it has failed, we need
2501 # to unclaim those resources before casting to the conductor, so
2502 # that if there are alternate hosts available for a retry, it can
2503 # claim resources on that new host for the instance.
2504 self.reportclient.delete_allocation_for_instance(
2505 context, instance.uuid, force=True)
2507 self.compute_task_api.build_instances(context, [instance],
2508 image, filter_properties, admin_password,
2509 injected_files, requested_networks, security_groups,
2510 block_device_mapping, request_spec=request_spec,
2511 host_lists=[host_list])
2513 if isinstance(e, exception.RescheduledByPolicyException):
2514 return build_results.RESCHEDULED_BY_POLICY
2516 return build_results.RESCHEDULED
2517 except (exception.InstanceNotFound,
2518 exception.UnexpectedDeletingTaskStateError):
2519 msg = 'Instance disappeared during build.'
2520 LOG.debug(msg, instance=instance)
2521 self._cleanup_allocated_networks(context, instance,
2522 requested_networks)
2523 return build_results.FAILED
2524 except Exception as e:
2525 if isinstance(e, exception.BuildAbortException):
2526 LOG.error(e.format_message(), instance=instance)
2527 else:
2528 # Should not reach here.
2529 LOG.exception('Unexpected build failure, not rescheduling '
2530 'build.', instance=instance)
2531 self._cleanup_allocated_networks(context, instance,
2532 requested_networks)
2533 self._cleanup_volumes(context, instance,
2534 block_device_mapping, raise_exc=False)
2535 compute_utils.add_instance_fault_from_exc(context, instance,
2536 e, sys.exc_info())
2537 self._nil_out_instance_obj_host_and_node(instance)
2538 self._set_instance_obj_error_state(instance, clean_task_state=True)
2539 return build_results.FAILED
2541 @staticmethod
2542 def _get_scheduler_hints(filter_properties, request_spec=None):
2543 """Helper method to get scheduler hints.
2545 This method prefers to get the hints out of the request spec, but that
2546 might not be provided. Conductor will pass request_spec down to the
2547 first compute chosen for a build but older computes will not pass
2548 the request_spec to conductor's build_instances method for a
2549 reschedule, so if we're on a host via a retry, request_spec may not
2550 be provided, so we need to fall back to the filter_properties
2551 to get scheduler hints.
2552 """
2553 hints = {}
2554 if request_spec is not None and 'scheduler_hints' in request_spec:
2555 hints = request_spec.scheduler_hints
2556 if not hints:
2557 hints = filter_properties.get('scheduler_hints') or {}
2558 return hints
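# NOTE(editor): a minimal sketch (not part of this module) of the hint
# precedence implemented above, with made-up values:
#
#     request_spec.scheduler_hints = {'group': ['9a3d-example-uuid']}
#     filter_properties = {'scheduler_hints': {'group': 'stale-value'}}
#     self._get_scheduler_hints(filter_properties, request_spec)
#     # -> {'group': ['9a3d-example-uuid']}; the RequestSpec wins. Only when
#     #    the RequestSpec is missing or has empty hints do the
#     #    filter_properties hints apply.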
2560 @staticmethod
2561 def _get_request_group_mapping(request_spec):
2562 """Return request group resource - provider mapping. This is currently
2563 used for Neutron ports that have resource request due to the port
2564 having QoS minimum bandwidth policy rule attached.
2566 :param request_spec: A RequestSpec object or None
2567 :returns: A dict keyed by RequestGroup requester_id, currently Neutron
2568 port_id, to resource provider UUID that provides resource for that
2569 RequestGroup. Or None if the request_spec was None.
2570 """
2571 # TODO(sbauza): Remove this conditional once we only support
2572 # RPC API 6.0
2573 if request_spec:
2574 return request_spec.get_request_group_mapping()
2575 else:
2576 return None
2578 def _build_and_run_instance(self, context, instance, image, injected_files,
2579 admin_password, requested_networks, security_groups,
2580 block_device_mapping, node, limits, filter_properties,
2581 request_spec=None, accel_uuids=None):
2583 image_name = image.get('name')
2584 self._notify_about_instance_usage(context, instance, 'create.start',
2585 extra_usage_info={'image_name': image_name})
2586 compute_utils.notify_about_instance_create(
2587 context, instance, self.host,
2588 phase=fields.NotificationPhase.START,
2589 bdms=block_device_mapping)
2591 # NOTE(mikal): cache the keystone roles associated with the instance
2592 # at boot time for later reference
2593 instance.system_metadata.update(
2594 {'boot_roles': ','.join(context.roles)})
2596 self._check_device_tagging(requested_networks, block_device_mapping)
2597 self._check_trusted_certs(instance)
2599 provider_mapping = self._get_request_group_mapping(request_spec)
2601 if provider_mapping:
2602 try:
2603 compute_utils.update_pci_request_with_placement_allocations(
2604 context,
2605 self.reportclient,
2606 instance.pci_requests.requests,
2607 provider_mapping,
2608 )
2609 except (exception.AmbiguousResourceProviderForPCIRequest,
2610 exception.UnexpectedResourceProviderNameForPCIRequest
2611 ) as e:
2612 raise exception.BuildAbortException(
2613 reason=str(e), instance_uuid=instance.uuid)
2615 # TODO(Luyao) cut over to get_allocs_for_consumer
2616 allocs = self.reportclient.get_allocations_for_consumer(
2617 context, instance.uuid)
2619 try:
2620 scheduler_hints = self._get_scheduler_hints(filter_properties,
2621 request_spec)
2622 with self.rt.instance_claim(context, instance, node, allocs,
2623 limits):
2624 # NOTE(russellb) It's important that this validation be done
2625 # *after* the resource tracker instance claim, as that is where
2626 # the host is set on the instance.
2627 self._validate_instance_group_policy(context, instance,
2628 scheduler_hints)
2629 image_meta = objects.ImageMeta.from_dict(image)
2631 with self._build_resources(context, instance,
2632 requested_networks, security_groups, image_meta,
2633 block_device_mapping, provider_mapping,
2634 accel_uuids) as resources:
2635 instance.vm_state = vm_states.BUILDING
2636 instance.task_state = task_states.SPAWNING
2637 # NOTE(JoshNang) This also saves the changes to the
2638 # instance from _allocate_network_async, as they aren't
2639 # saved in that function to prevent races.
2640 instance.save(expected_task_state=
2641 task_states.BLOCK_DEVICE_MAPPING)
2642 block_device_info = resources['block_device_info']
2643 network_info = resources['network_info']
2644 accel_info = resources['accel_info']
2645 LOG.debug('Start spawning the instance on the hypervisor.',
2646 instance=instance)
2647 with timeutils.StopWatch() as timer:
2648 self.driver.spawn(context, instance, image_meta,
2649 injected_files, admin_password,
2650 allocs, network_info=network_info,
2651 block_device_info=block_device_info,
2652 accel_info=accel_info)
2653 LOG.info('Took %0.2f seconds to spawn the instance on '
2654 'the hypervisor.', timer.elapsed(),
2655 instance=instance)
2656 except (exception.InstanceNotFound,
2657 exception.UnexpectedDeletingTaskStateError) as e:
2658 with excutils.save_and_reraise_exception():
2659 self._notify_about_instance_usage(context, instance,
2660 'create.error', fault=e)
2661 compute_utils.notify_about_instance_create(
2662 context, instance, self.host,
2663 phase=fields.NotificationPhase.ERROR, exception=e,
2664 bdms=block_device_mapping)
2665 except exception.ComputeResourcesUnavailable as e:
2666 LOG.debug(e.format_message(), instance=instance)
2667 self._notify_about_instance_usage(context, instance,
2668 'create.error', fault=e)
2669 compute_utils.notify_about_instance_create(
2670 context, instance, self.host,
2671 phase=fields.NotificationPhase.ERROR, exception=e,
2672 bdms=block_device_mapping)
2673 raise exception.RescheduledException(
2674 instance_uuid=instance.uuid, reason=e.format_message())
2675 except exception.BuildAbortException as e:
2676 with excutils.save_and_reraise_exception():
2677 LOG.debug(e.format_message(), instance=instance)
2678 self._notify_about_instance_usage(context, instance,
2679 'create.error', fault=e)
2680 compute_utils.notify_about_instance_create(
2681 context, instance, self.host,
2682 phase=fields.NotificationPhase.ERROR, exception=e,
2683 bdms=block_device_mapping)
2684 except exception.NoMoreFixedIps as e:
2685 LOG.warning('No more fixed IP to be allocated',
2686 instance=instance)
2687 self._notify_about_instance_usage(context, instance,
2688 'create.error', fault=e)
2689 compute_utils.notify_about_instance_create(
2690 context, instance, self.host,
2691 phase=fields.NotificationPhase.ERROR, exception=e,
2692 bdms=block_device_mapping)
2693 msg = _('Failed to allocate the network(s) with error %s, '
2694 'not rescheduling.') % e.format_message()
2695 raise exception.BuildAbortException(instance_uuid=instance.uuid,
2696 reason=msg)
2697 except (exception.ExternalNetworkAttachForbidden,
2698 exception.VirtualInterfaceCreateException,
2699 exception.VirtualInterfaceMacAddressException,
2700 exception.FixedIpInvalidOnHost,
2701 exception.UnableToAutoAllocateNetwork,
2702 exception.NetworksWithQoSPolicyNotSupported) as e:
2703 LOG.exception('Failed to allocate network(s)',
2704 instance=instance)
2705 self._notify_about_instance_usage(context, instance,
2706 'create.error', fault=e)
2707 compute_utils.notify_about_instance_create(
2708 context, instance, self.host,
2709 phase=fields.NotificationPhase.ERROR, exception=e,
2710 bdms=block_device_mapping)
2711 msg = _('Failed to allocate the network(s), not rescheduling.')
2712 raise exception.BuildAbortException(instance_uuid=instance.uuid,
2713 reason=msg)
2714 except (exception.FlavorDiskTooSmall,
2715 exception.FlavorMemoryTooSmall,
2716 exception.ImageNotActive,
2717 exception.ImageUnacceptable,
2718 exception.InvalidDiskInfo,
2719 exception.InvalidDiskFormat,
2720 cursive_exception.SignatureVerificationError,
2721 exception.CertificateValidationFailed,
2722 exception.VolumeEncryptionNotSupported,
2723 exception.InvalidInput,
2724 # TODO(mriedem): We should be validating RequestedVRamTooHigh
2725 # in the API during server create and rebuild.
2726 exception.RequestedVRamTooHigh) as e:
2727 self._notify_about_instance_usage(context, instance,
2728 'create.error', fault=e)
2729 compute_utils.notify_about_instance_create(
2730 context, instance, self.host,
2731 phase=fields.NotificationPhase.ERROR, exception=e,
2732 bdms=block_device_mapping)
2733 raise exception.BuildAbortException(instance_uuid=instance.uuid,
2734 reason=e.format_message())
2735 except exception.GroupAffinityViolation as e:
2736 LOG.exception('Failed to build and run instance',
2737 instance=instance)
2738 self._notify_about_instance_usage(context, instance,
2739 'create.error', fault=e)
2740 compute_utils.notify_about_instance_create(
2741 context, instance, self.host,
2742 phase=fields.NotificationPhase.ERROR, exception=e,
2743 bdms=block_device_mapping)
2744 raise exception.RescheduledByPolicyException(
2745 instance_uuid=instance.uuid, reason=str(e))
2746 except Exception as e:
2747 LOG.exception('Failed to build and run instance',
2748 instance=instance)
2749 self._notify_about_instance_usage(context, instance,
2750 'create.error', fault=e)
2751 compute_utils.notify_about_instance_create(
2752 context, instance, self.host,
2753 phase=fields.NotificationPhase.ERROR, exception=e,
2754 bdms=block_device_mapping)
2755 raise exception.RescheduledException(
2756 instance_uuid=instance.uuid, reason=str(e))
2758 # NOTE(alaski): This is only useful during reschedules, remove it now.
2759 instance.system_metadata.pop('network_allocated', None)
2761 # If CONF.default_access_ip_network_name is set, grab the
2762 # corresponding network and set the access ip values accordingly.
2763 network_name = CONF.default_access_ip_network_name
2764 if (network_name and not instance.access_ip_v4 and
2765 not instance.access_ip_v6):
2766 # Note that when there are multiple ips to choose from, an
2767 # arbitrary one will be chosen.
2768 for vif in network_info:  # 2768 ↛ 2777: line 2768 didn't jump to line 2777 because the loop on line 2768 didn't complete
2769 if vif['network']['label'] == network_name:  # 2769 ↛ 2768: line 2769 didn't jump to line 2768 because the condition on line 2769 was always true
2770 for ip in vif.fixed_ips():
2771 if not instance.access_ip_v4 and ip['version'] == 4:
2772 instance.access_ip_v4 = ip['address']
2773 if not instance.access_ip_v6 and ip['version'] == 6:
2774 instance.access_ip_v6 = ip['address']
2775 break
2777 self._update_instance_after_spawn(instance)
2779 try:
2780 instance.save(expected_task_state=task_states.SPAWNING)
2781 except (exception.InstanceNotFound,
2782 exception.UnexpectedDeletingTaskStateError) as e:
2783 with excutils.save_and_reraise_exception():
2784 self._notify_about_instance_usage(context, instance,
2785 'create.error', fault=e)
2786 compute_utils.notify_about_instance_create(
2787 context, instance, self.host,
2788 phase=fields.NotificationPhase.ERROR, exception=e,
2789 bdms=block_device_mapping)
2791 self._update_scheduler_instance_info(context, instance)
2792 self._notify_about_instance_usage(context, instance, 'create.end',
2793 extra_usage_info={'message': _('Success')},
2794 network_info=network_info)
2795 compute_utils.notify_about_instance_create(context, instance,
2796 self.host, phase=fields.NotificationPhase.END,
2797 bdms=block_device_mapping)
2799 def _build_resources_cleanup(self, instance, network_info):
2800 # Make sure the async call finishes
2801 if network_info is not None:  # 2801 ↛ 2805: line 2801 didn't jump to line 2805 because the condition on line 2801 was always true
2802 network_info.wait(do_raise=False)
2803 self.driver.clean_networks_preparation(instance,
2804 network_info)
2805 self.driver.failed_spawn_cleanup(instance)
2807 @contextlib.contextmanager
2808 def _build_resources(self, context, instance, requested_networks,
2809 security_groups, image_meta, block_device_mapping,
2810 resource_provider_mapping, accel_uuids):
2811 resources = {}
2812 network_info = None
2813 spec_arqs = {}
2814 network_arqs = {}
2816 try:
2817 if accel_uuids:
2818 arqs = self._get_bound_arq_resources(
2819 context, instance, accel_uuids)
2820 spec_arqs, network_arqs = self._split_network_arqs(
2821 arqs, requested_networks)
2822 LOG.debug("ARQs for spec:%s, ARQs for network:%s",
2823 spec_arqs, network_arqs)
2824 except (Exception, eventlet.timeout.Timeout) as exc:
2825 LOG.exception(exc)
2826 # ARQs may have been created for the instance or its ports.
2827 # The port binding is not done yet, so unbinding the port
2828 # won't clean up the port ARQs.
2829 compute_utils.delete_arqs_if_needed(
2830 context, instance, accel_uuids)
2831 msg = _('Failure getting accelerator requests.')
2832 raise exception.BuildAbortException(
2833 reason=msg, instance_uuid=instance.uuid)
2835 try:
2836 LOG.debug('Start building networks asynchronously for instance.',
2837 instance=instance)
2838 network_info = self._build_networks_for_instance(context, instance,
2839 requested_networks, security_groups,
2840 resource_provider_mapping, network_arqs)
2841 resources['network_info'] = network_info
2842 except (exception.InstanceNotFound,
2843 exception.UnexpectedDeletingTaskStateError):
2844 raise
2845 except exception.UnexpectedTaskStateError as e:
2846 raise exception.BuildAbortException(instance_uuid=instance.uuid,
2847 reason=e.format_message())
2848 except Exception:
2849 # Because this allocation is async, any failures are likely to occur
2850 # when the driver accesses network_info during spawn().
2851 LOG.exception('Failed to allocate network(s)',
2852 instance=instance)
2853 msg = _('Failed to allocate the network(s), not rescheduling.')
2854 raise exception.BuildAbortException(instance_uuid=instance.uuid,
2855 reason=msg)
2857 try:
2858 # Perform any preparation work needed by the virt driver.
2859 self.driver.prepare_for_spawn(instance)
2861 # Depending on the virt driver, some network configuration is
2862 # necessary before preparing block devices.
2863 self.driver.prepare_networks_before_block_device_mapping(
2864 instance, network_info)
2866 # Verify that all the BDMs have a device_name set and assign a
2867 # default to the ones missing it with the help of the driver.
2868 self._default_block_device_names(instance, image_meta,
2869 block_device_mapping)
2871 LOG.debug('Start building block device mappings for instance.',
2872 instance=instance)
2873 instance.vm_state = vm_states.BUILDING
2874 instance.task_state = task_states.BLOCK_DEVICE_MAPPING
2875 instance.save()
2877 block_device_info = self._prep_block_device(context, instance,
2878 block_device_mapping)
2879 resources['block_device_info'] = block_device_info
2880 except (exception.InstanceNotFound,
2881 exception.UnexpectedDeletingTaskStateError,
2882 exception.ComputeResourcesUnavailable):
2883 with excutils.save_and_reraise_exception():
2884 self._build_resources_cleanup(instance, network_info)
2885 except (exception.UnexpectedTaskStateError,
2886 exception.InstanceUnacceptable,
2887 exception.OverQuota, exception.InvalidBDM) as e:
2888 self._build_resources_cleanup(instance, network_info)
2889 raise exception.BuildAbortException(instance_uuid=instance.uuid,
2890 reason=e.format_message())
2891 except Exception:
2892 LOG.exception('Failure prepping block device',
2893 instance=instance)
2894 self._build_resources_cleanup(instance, network_info)
2895 msg = _('Failure prepping block device.')
2896 raise exception.BuildAbortException(instance_uuid=instance.uuid,
2897 reason=msg)
2899 resources['accel_info'] = list(spec_arqs.values())
2900 try:
2901 yield resources
2902 except Exception as exc:
2903 with excutils.save_and_reraise_exception() as ctxt:
2904 if not isinstance(exc, (
2905 exception.InstanceNotFound,
2906 exception.UnexpectedDeletingTaskStateError)):
2907 LOG.exception('Instance failed to spawn',
2908 instance=instance)
2909 # Make sure the async call finishes
2910 if network_info is not None:  [2910 ↛ 2916: line 2910 didn't jump to line 2916 because the condition on line 2910 was always true]
2911 network_info.wait(do_raise=False)
2912 # If network_info is empty we're likely here because of a
2913 # network allocation failure. Since nothing can be reused on
2914 # rescheduling, it's better to deallocate the network to eliminate
2915 # the chance of orphaned ports in neutron.
2916 deallocate_networks = False if network_info else True
2917 try:
2918 self._shutdown_instance(context, instance,
2919 block_device_mapping, requested_networks,
2920 try_deallocate_networks=deallocate_networks)
2921 except Exception as exc2:
2922 ctxt.reraise = False
2923 LOG.warning('Could not clean up failed build,'
2924 ' not rescheduling. Error: %s',
2925 str(exc2))
2926 raise exception.BuildAbortException(
2927 instance_uuid=instance.uuid,
2928 reason=str(exc))
2929 finally:
2930 # Call Cyborg to delete accelerator requests
2931 if accel_uuids:  [2931 ↛ 2935: line 2931 didn't jump to line 2935 because the condition on line 2931 was never true]
2932 # ARQs may have been created for the instance or its ports.
2933 # Port binding may not have succeeded, and
2934 # unbinding the port won't clean up the port ARQs.
2935 compute_utils.delete_arqs_if_needed(
2936 context, instance, accel_uuids)
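# NOTE: an editorial summary (not part of the original source) of the dict
# yielded by the _build_resources context manager above; the spawn path
# consumes these three keys:
#
#     resources = {
#         'network_info': network_info,             # async network allocation
#         'block_device_info': block_device_info,   # prepped block devices
#         'accel_info': list(spec_arqs.values()),   # bound Cyborg ARQs
#     }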
2938 def _get_bound_arq_resources(self, context, instance, arq_uuids):
2939 """Get bound accelerator requests.
2941 The ARQ binding was kicked off in the conductor as an async
2942 operation. Here we wait for the notification from Cyborg.
2944 If the notification arrived before this point, which can happen
2945 in many/most cases (see [1]), it will be lost. To handle that,
2946 we use exit_wait_early.
2947 [1] https://review.opendev.org/#/c/631244/46/nova/compute/
2948 manager.py@2627
2950 :param instance: instance object
2951 :param arq_uuids: List of accelerator request (ARQ) UUIDs.
2952 :returns: List of ARQs for which bindings have completed,
2953 successfully or otherwise
2954 """
2956 cyclient = cyborg.get_client(context)
2957 if arq_uuids is None:
2958 arqs = cyclient.get_arqs_for_instance(instance.uuid)
2959 arq_uuids = [arq['uuid'] for arq in arqs]
2960 events = [('accelerator-request-bound', arq_uuid)
2961 for arq_uuid in arq_uuids]
2963 timeout = CONF.arq_binding_timeout
2964 with self.virtapi.wait_for_instance_event(
2965 instance, events, deadline=timeout):
2966 resolved_arqs = cyclient.get_arqs_for_instance(
2967 instance.uuid, only_resolved=True)
2968 # Events for these resolved ARQs may have already arrived.
2969 # Such 'early' events need to be ignored.
2970 early_events = [('accelerator-request-bound', arq['uuid'])
2971 for arq in resolved_arqs]
2972 if early_events:
2973 self.virtapi.exit_wait_early(early_events)
2975 # Since a timeout in wait_for_instance_event will raise, we get
2976 # here only if all binding events have been received.
2977 resolved_uuids = [arq['uuid'] for arq in resolved_arqs]
2978 if sorted(resolved_uuids) != sorted(arq_uuids):
2979 # Query Cyborg to get all.
2980 arqs = cyclient.get_arqs_for_instance(instance.uuid)
2981 else:
2982 arqs = resolved_arqs
2983 return arqs
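# NOTE: an illustrative sketch (editorial, not part of the original source)
# of the event tuples waited on above, assuming two ARQ UUIDs; bindings that
# Cyborg has already resolved are acknowledged via exit_wait_early so their
# possibly already-delivered notifications are not waited for again:
#
#     events = [('accelerator-request-bound', 'arq-uuid-1'),
#               ('accelerator-request-bound', 'arq-uuid-2')]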
2985 def _split_network_arqs(self, arqs, requested_networks):
2986 """split arq request by extra spec from ARQ requested by port.
2988 Return ARQ groups tuple:(spec_arqs, port_arqs)
2989 Each item in the tuple is a dict like:
2990 {
2991 arq1_uuid: arq1
2992 }
2993 """
2994 port_arqs = {}
2995 spec_arqs = {}
2996 port_arqs_uuids = [req_net.arq_uuid for req_net in requested_networks]
2997 for arq in arqs:
2998 if arq['uuid'] in port_arqs_uuids:
2999 port_arqs.update({arq['uuid']: arq})
3000 else:
3001 spec_arqs.update({arq['uuid']: arq})
3003 return spec_arqs, port_arqs
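# NOTE: a minimal illustration (editorial, not part of the original source)
# of the split above, assuming one ARQ requested through a port and one
# through flavor extra specs; the UUIDs are hypothetical placeholders:
#
#     requested_networks -> [NetworkRequest(arq_uuid='arq-port-1'), ...]
#     arqs -> [{'uuid': 'arq-port-1', ...}, {'uuid': 'arq-spec-1', ...}]
#     _split_network_arqs(arqs, requested_networks)
#     -> ({'arq-spec-1': {...}}, {'arq-port-1': {...}})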
3005 def _cleanup_allocated_networks(self, context, instance,
3006 requested_networks):
3007 """Cleanup networks allocated for instance.
3009 :param context: nova request context
3010 :param instance: nova.objects.instance.Instance object
3011 :param requested_networks: nova.objects.NetworkRequestList
3012 """
3013 LOG.debug('Unplugging VIFs for instance', instance=instance)
3015 network_info = instance.get_network_info()
3017 # NOTE(stephenfin) to avoid nova destroying the instance without
3018 # unplugging the interface, refresh network_info if it is empty.
3019 if not network_info:  [3019 ↛ 3032: line 3019 didn't jump to line 3032 because the condition on line 3019 was always true]
3020 try:
3021 network_info = self.network_api.get_instance_nw_info(
3022 context, instance,
3023 )
3024 except Exception as exc:
3025 LOG.warning(
3026 'Failed to update network info cache when cleaning up '
3027 'allocated networks. Stale VIFs may be left on this host. '
3028 'Error: %s', str(exc)
3029 )
3030 return
3032 try:
3033 self.driver.unplug_vifs(instance, network_info)
3034 except NotImplementedError:
3035 # This is an optional method so ignore things if it doesn't exist
3036 LOG.debug(
3037 'Virt driver does not provide unplug_vifs method, so it '
3038 'is not possible to determine if VIFs should be unplugged.'
3039 )
3040 except Exception as exc:
3041 # It's possible that the instance never got as far as plugging
3042 # VIFs, in which case we would see an exception which can be
3043 # mostly ignored
3044 LOG.warning(
3045 'Cleaning up VIFs failed for instance. Error: %s',
3046 str(exc), instance=instance,
3047 )
3048 else:
3049 LOG.debug('Unplugged VIFs for instance', instance=instance)
3051 try:
3052 self._deallocate_network(context, instance, requested_networks)
3053 except Exception:
3054 LOG.exception('Failed to deallocate networks', instance=instance)
3055 return
3057 instance.system_metadata['network_allocated'] = 'False'
3058 try:
3059 instance.save()
3060 except exception.InstanceNotFound:
3061 # NOTE(alaski): It's possible that we're cleaning up the networks
3062 # because the instance was deleted. If that's the case then this
3063 # exception will be raised by instance.save()
3064 pass
3066 def _try_deallocate_network(self, context, instance,
3067 requested_networks=None):
3069 # During auto-scale cleanup, we could be deleting a large number
3070 # of servers at the same time and overloading parts of the system,
3071 # so we retry a few times in case of connection failures to the
3072 # networking service.
3073 @loopingcall.RetryDecorator(
3074 max_retry_count=3, inc_sleep_time=2, max_sleep_time=12,
3075 exceptions=(keystone_exception.connection.ConnectFailure,))
3076 def _deallocate_network_with_retries():
3077 try:
3078 self._deallocate_network(
3079 context, instance, requested_networks)
3080 except keystone_exception.connection.ConnectFailure as e:
3081 # Provide a warning that something is amiss.
3082 with excutils.save_and_reraise_exception():
3083 LOG.warning('Failed to deallocate network for instance; '
3084 'retrying. Error: %s', str(e),
3085 instance=instance)
3087 try:
3088 # tear down allocated network structure
3089 _deallocate_network_with_retries()
3090 except Exception as ex:
3091 with excutils.save_and_reraise_exception():
3092 LOG.error('Failed to deallocate network for instance. '
3093 'Error: %s', ex, instance=instance)
3094 self._set_instance_obj_error_state(instance)
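# NOTE: an editorial illustration (not part of the original source) of the
# retry policy above: with max_retry_count=3 the decorated helper may be
# attempted up to four times in total (the initial call plus three retries),
# sleeping progressively longer between attempts, capped by
# max_sleep_time=12, whenever a keystone ConnectFailure is raised.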
3096 def _get_power_off_values(self, instance, clean_shutdown):
3097 """Get the timing configuration for powering down this instance."""
3098 if clean_shutdown:
3099 timeout = compute_utils.get_value_from_system_metadata(instance,
3100 key='image_os_shutdown_timeout', type=int,
3101 default=CONF.shutdown_timeout)
3102 retry_interval = CONF.compute.shutdown_retry_interval
3103 else:
3104 timeout = 0
3105 retry_interval = 0
3107 return timeout, retry_interval
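# NOTE: illustrative return values (editorial, not part of the original
# source) for the helper above, assuming no per-image
# 'image_os_shutdown_timeout' override is set in system metadata:
#
#     self._get_power_off_values(instance, clean_shutdown=True)
#     -> (CONF.shutdown_timeout, CONF.compute.shutdown_retry_interval)
#     self._get_power_off_values(instance, clean_shutdown=False)
#     -> (0, 0)   # immediate hard power off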
3109 def _power_off_instance(self, context, instance, clean_shutdown=True):
3110 """Power off an instance on this host."""
3111 share_info = self._get_share_info(context, instance)
3112 timeout, retry_interval = self._get_power_off_values(
3113 instance, clean_shutdown)
3114 self.driver.power_off(instance, timeout, retry_interval)
3115 share_info.deactivate_all()
3116 self._umount_all_shares(context, instance, share_info)
3118 def _shutdown_instance(self, context, instance,
3119 bdms, requested_networks=None, notify=True,
3120 try_deallocate_networks=True):
3121 """Shutdown an instance on this host.
3123 :param context: security context
3124 :param instance: a nova.objects.Instance object
3125 :param bdms: the block devices for the instance to be torn
3126 down
3127 :param requested_networks: the networks on which the instance
3128 has ports
3129 :param notify: true if a final usage notification should be
3130 emitted
3131 :param try_deallocate_networks: false if we should avoid
3132 trying to tear down networking
3133 """
3134 context = context.elevated()
3135 LOG.info('Terminating instance', instance=instance)
3137 if notify:
3138 self._notify_about_instance_usage(context, instance,
3139 "shutdown.start")
3140 compute_utils.notify_about_instance_action(context, instance,
3141 self.host, action=fields.NotificationAction.SHUTDOWN,
3142 phase=fields.NotificationPhase.START, bdms=bdms)
3144 network_info = instance.get_network_info()
3146 share_info = self._get_share_info(
3147 context, instance, check_status=False
3148 )
3150 # NOTE(arnaudmorin) to avoid nova destroying the instance without
3151 # unplugging the interface, refresh network_info if it is empty.
3152 if not network_info:
3153 network_info = self.network_api.get_instance_nw_info(
3154 context, instance)
3156 # NOTE(vish) get bdms before destroying the instance
3157 vol_bdms = [bdm for bdm in bdms if bdm.is_volume]
3158 block_device_info = self._get_instance_block_device_info(
3159 context, instance, bdms=bdms)
3161 # NOTE(melwitt): attempt driver destroy before releasing ip, may
3162 # want to keep ip allocated for certain failures
3163 try:
3164 LOG.debug('Start destroying the instance on the hypervisor.',
3165 instance=instance)
3166 with timeutils.StopWatch() as timer:
3167 self.driver.destroy(context, instance, network_info,
3168 block_device_info)
3169 LOG.info('Took %0.2f seconds to destroy the instance on the '
3170 'hypervisor.', timer.elapsed(), instance=instance)
3171 except exception.InstancePowerOffFailure:
3172 # if the instance can't power off, don't release the ip
3173 with excutils.save_and_reraise_exception():
3174 pass
3175 except Exception:
3176 with excutils.save_and_reraise_exception():
3177 # deallocate ip and fail without proceeding to
3178 # volume api calls, preserving current behavior
3179 if try_deallocate_networks:  [3179 ↛ 3183: line 3179 didn't jump to line 3183]
3180 self._try_deallocate_network(context, instance,
3181 requested_networks)
3183 if try_deallocate_networks:
3184 self._try_deallocate_network(context, instance, requested_networks)
3186 timer.restart()
3188 for share in share_info:
3189 # If we fail to unmount or deny the share we may have a
3190 # dangling share_mapping in the DB (share_mapping entry with an
3191 # instance that does not exist anymore).
3192 try:
3193 self._umount_share(context, instance, share)
3194 share.deactivate()
3195 self.deny_share(context, instance, share)
3196 except (
3197 exception.ShareUmountError,
3198 exception.ShareNotFound,
3199 exception.ShareAccessNotFound,
3200 exception.ShareAccessRemovalError,
3201 ):
3202 LOG.warning("An error occurred while unmounting or "
3203 "denying the share '%s'. This error is ignored to "
3204 "proceed with instance removal. "
3205 "Consequently, there may be a dangling "
3206 "share_mapping.", share.share_id)
3208 connector = None
3209 for bdm in vol_bdms:
3210 try:
3211 if bdm.attachment_id:
3212 self.volume_api.attachment_delete(context,
3213 bdm.attachment_id)
3214 else:
3215 # NOTE(vish): actual driver detach done in driver.destroy,
3216 # so just tell cinder that we are done with it.
3217 if connector is None:
3218 connector = self.driver.get_volume_connector(instance)
3219 self.volume_api.terminate_connection(context,
3220 bdm.volume_id,
3221 connector)
3222 self.volume_api.detach(context, bdm.volume_id,
3223 instance.uuid)
3225 except exception.VolumeAttachmentNotFound as exc:
3226 LOG.debug('Ignoring VolumeAttachmentNotFound: %s', exc,
3227 instance=instance)
3228 except exception.DiskNotFound as exc:
3229 LOG.debug('Ignoring DiskNotFound: %s', exc,
3230 instance=instance)
3231 except exception.VolumeNotFound as exc:
3232 LOG.debug('Ignoring VolumeNotFound: %s', exc,
3233 instance=instance)
3234 except (cinder_exception.EndpointNotFound,
3235 keystone_exception.EndpointNotFound) as exc:
3236 LOG.warning('Ignoring EndpointNotFound for '
3237 'volume %(volume_id)s: %(exc)s',
3238 {'exc': exc, 'volume_id': bdm.volume_id},
3239 instance=instance)
3240 except cinder_exception.ClientException as exc:
3241 LOG.warning('Ignoring unknown cinder exception for '
3242 'volume %(volume_id)s: %(exc)s',
3243 {'exc': exc, 'volume_id': bdm.volume_id},
3244 instance=instance)
3245 except Exception as exc:
3246 LOG.warning('Ignoring unknown exception for '
3247 'volume %(volume_id)s: %(exc)s',
3248 {'exc': exc, 'volume_id': bdm.volume_id},
3249 instance=instance)
3250 if vol_bdms:
3251 LOG.info('Took %(time).2f seconds to detach %(num)s volumes '
3252 'for instance.',
3253 {'time': timer.elapsed(), 'num': len(vol_bdms)},
3254 instance=instance)
3256 if notify:
3257 self._notify_about_instance_usage(context, instance,
3258 "shutdown.end")
3259 compute_utils.notify_about_instance_action(context, instance,
3260 self.host, action=fields.NotificationAction.SHUTDOWN,
3261 phase=fields.NotificationPhase.END, bdms=bdms)
3263 def _cleanup_volumes(self, context, instance, bdms, raise_exc=True,
3264 detach=True):
3265 original_exception = None
3266 for bdm in bdms:
3267 if detach and bdm.volume_id:
3268 try:
3269 LOG.debug("Detaching volume: %s", bdm.volume_id,
3270 instance_uuid=instance.uuid)
3271 destroy = bdm.delete_on_termination
3272 self._detach_volume(context, bdm, instance,
3273 destroy_bdm=destroy)
3274 except Exception as exc:
3275 original_exception = exc
3276 LOG.warning('Failed to detach volume: %(volume_id)s '
3277 'due to %(exc)s',
3278 {'volume_id': bdm.volume_id, 'exc': exc})
3280 if bdm.volume_id and bdm.delete_on_termination:
3281 try:
3282 LOG.debug("Deleting volume: %s", bdm.volume_id,
3283 instance_uuid=instance.uuid)
3284 self.volume_api.delete(context, bdm.volume_id)
3285 except Exception as exc:
3286 original_exception = exc
3287 LOG.warning('Failed to delete volume: %(volume_id)s '
3288 'due to %(exc)s',
3289 {'volume_id': bdm.volume_id, 'exc': exc})
3290 if original_exception is not None and raise_exc:
3291 raise original_exception
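# NOTE: an editorial note (not part of the original source): with
# raise_exc=True the last exception seen while detaching or deleting is
# re-raised only after every BDM has been processed, so a single failing
# volume does not stop cleanup of the remaining ones.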
3293 def _delete_instance(self, context, instance, bdms):
3294 """Delete an instance on this host.
3296 :param context: nova request context
3297 :param instance: nova.objects.instance.Instance object
3298 :param bdms: nova.objects.block_device.BlockDeviceMappingList object
3299 """
3300 events = self.instance_events.clear_events_for_instance(instance)
3301 if events:  [3301 ↛ 3302: line 3301 didn't jump to line 3302 because the condition on line 3301 was never true]
3302 LOG.debug('Events pending at deletion: %(events)s',
3303 {'events': ','.join(events.keys())},
3304 instance=instance)
3305 self._notify_about_instance_usage(context, instance,
3306 "delete.start")
3307 compute_utils.notify_about_instance_action(context, instance,
3308 self.host, action=fields.NotificationAction.DELETE,
3309 phase=fields.NotificationPhase.START, bdms=bdms)
3311 self._shutdown_instance(context, instance, bdms)
3313 # NOTE(vish): We have already deleted the instance, so we have
3314 # to ignore problems cleaning up the volumes. It
3315 # would be nice to let the user know somehow that
3316 # the volume deletion failed, but it is not
3317 # acceptable to have an instance that can not be
3318 # deleted. Perhaps this could be reworked in the
3319 # future to set an instance fault the first time
3320 # and to only ignore the failure if the instance
3321 # is already in ERROR.
3323 # NOTE(ameeda): The volumes have already been detached during
3324 # the above _shutdown_instance() call and this is
3325 # why detach is not requested from
3326 # _cleanup_volumes() in this case
3328 self._cleanup_volumes(context, instance, bdms,
3329 raise_exc=False, detach=False)
3330 # Delete Cyborg ARQs if the instance has a device profile.
3331 compute_utils.delete_arqs_if_needed(context, instance)
3332 # if a delete task succeeded, always update vm state and task
3333 # state without expecting task state to be DELETING
3334 instance.vm_state = vm_states.DELETED
3335 instance.task_state = None
3336 instance.power_state = power_state.NOSTATE
3337 instance.terminated_at = timeutils.utcnow()
3338 instance.save()
3340 self._complete_deletion(context, instance)
3341 # only destroy the instance in the db if the _complete_deletion
3342 # doesn't raise and therefore allocation is successfully
3343 # deleted in placement
3344 instance.destroy()
3346 self._notify_about_instance_usage(context, instance, "delete.end")
3347 compute_utils.notify_about_instance_action(context, instance,
3348 self.host, action=fields.NotificationAction.DELETE,
3349 phase=fields.NotificationPhase.END, bdms=bdms)
3351 @wrap_exception()
3352 @reverts_task_state
3353 @wrap_instance_event(prefix='compute')
3354 @wrap_instance_fault
3355 def terminate_instance(self, context, instance, bdms):
3356 """Terminate an instance on this host."""
3357 @utils.synchronized(instance.uuid)
3358 def do_terminate_instance(instance, bdms):
3359 # NOTE(mriedem): If we are deleting the instance while it was
3360 # booting from volume, we could be racing with a database update of
3361 # the BDM volume_id. Since the compute API passes the BDMs over RPC
3362 # to compute here, the BDMs may be stale at this point. So check
3363 # for any volume BDMs that don't have volume_id set and if we
3364 # detect that, we need to refresh the BDM list before proceeding.
3365 # TODO(mriedem): Move this into _delete_instance and make the bdms
3366 # parameter optional.
3367 for bdm in list(bdms):
3368 if bdm.is_volume and not bdm.volume_id:
3369 LOG.debug('There are potentially stale BDMs during '
3370 'delete, refreshing the BlockDeviceMappingList.',
3371 instance=instance)
3372 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
3373 context, instance.uuid)
3374 break
3375 try:
3376 self._delete_instance(context, instance, bdms)
3377 except exception.InstanceNotFound:
3378 LOG.info("Instance disappeared during terminate",
3379 instance=instance)
3380 except Exception:
3381 # As we're trying to delete always go to Error if something
3382 # goes wrong that _delete_instance can't handle.
3383 with excutils.save_and_reraise_exception():
3384 LOG.exception('Setting instance vm_state to ERROR',
3385 instance=instance)
3386 self._set_instance_obj_error_state(instance)
3388 do_terminate_instance(instance, bdms)
3390 # NOTE(johannes): This is probably better named power_off_instance
3391 # so it matches the driver method, but because of other issues, we
3392 # can't use that name in grizzly.
3393 @wrap_exception()
3394 @reverts_task_state
3395 @wrap_instance_event(prefix='compute')
3396 @wrap_instance_fault
3397 def stop_instance(self, context, instance, clean_shutdown):
3398 """Stopping an instance on this host."""
3400 @utils.synchronized(instance.uuid)
3401 def do_stop_instance():
3402 current_power_state = self._get_power_state(instance)
3404 LOG.debug('Stopping instance; current vm_state: %(vm_state)s, '
3405 'current task_state: %(task_state)s, current DB '
3406 'power_state: %(db_power_state)s, current VM '
3407 'power_state: %(current_power_state)s',
3408 {'vm_state': instance.vm_state,
3409 'task_state': instance.task_state,
3410 'db_power_state': instance.power_state,
3411 'current_power_state': current_power_state},
3412 instance_uuid=instance.uuid)
3414 # NOTE(mriedem): If the instance is already powered off, we are
3415 # possibly tearing down and racing with other operations, so we can
3416 # expect the task_state to be None if something else updates the
3417 # instance and we're not locking it.
3418 expected_task_state = [task_states.POWERING_OFF]
3419 # The list of power states is from _sync_instance_power_state.
3420 if current_power_state in (power_state.NOSTATE,
3421 power_state.SHUTDOWN,
3422 power_state.CRASHED):
3423 LOG.info('Instance is already powered off in the '
3424 'hypervisor when stop is called.',
3425 instance=instance)
3426 expected_task_state.append(None)
3428 self._notify_about_instance_usage(context, instance,
3429 "power_off.start")
3431 compute_utils.notify_about_instance_action(context, instance,
3432 self.host, action=fields.NotificationAction.POWER_OFF,
3433 phase=fields.NotificationPhase.START)
3435 self._power_off_instance(context, instance, clean_shutdown)
3436 instance.power_state = self._get_power_state(instance)
3437 instance.vm_state = vm_states.STOPPED
3438 instance.task_state = None
3439 instance.save(expected_task_state=expected_task_state)
3440 self._notify_about_instance_usage(context, instance,
3441 "power_off.end")
3443 compute_utils.notify_about_instance_action(context, instance,
3444 self.host, action=fields.NotificationAction.POWER_OFF,
3445 phase=fields.NotificationPhase.END)
3447 do_stop_instance()
3449 def _power_on(self, context, instance):
3450 network_info = self.network_api.get_instance_nw_info(context, instance)
3451 block_device_info = self._get_instance_block_device_info(context,
3452 instance)
3453 accel_info = self._get_accel_info(context, instance)
3455 share_info = self._get_share_info(context, instance)
3457 self._mount_all_shares(context, instance, share_info)
3458 self.driver.power_on(context, instance,
3459 network_info,
3460 block_device_info, accel_info, share_info)
3461 share_info.activate_all()
3463 def _delete_snapshot_of_shelved_instance(self, context, instance,
3464 snapshot_id):
3465 """Delete snapshot of shelved instance."""
3466 try:
3467 self.image_api.delete(context, snapshot_id)
3468 except (exception.ImageNotFound,
3469 exception.ImageNotAuthorized) as exc:
3470 LOG.warning("Failed to delete snapshot "
3471 "from shelved instance (%s).",
3472 exc.format_message(), instance=instance)
3473 except Exception:
3474 LOG.exception("Something wrong happened when trying to "
3475 "delete snapshot from shelved instance.",
3476 instance=instance)
3478 # NOTE(johannes): This is probably better named power_on_instance
3479 # so it matches the driver method, but because of other issues, we
3480 # can't use that name in grizzly.
3481 @wrap_exception()
3482 @reverts_task_state
3483 @wrap_instance_event(prefix='compute')
3484 @wrap_instance_fault
3485 def start_instance(self, context, instance):
3486 """Starting an instance on this host."""
3487 self._notify_about_instance_usage(context, instance, "power_on.start")
3488 compute_utils.notify_about_instance_action(context, instance,
3489 self.host, action=fields.NotificationAction.POWER_ON,
3490 phase=fields.NotificationPhase.START)
3491 self._power_on(context, instance)
3492 instance.power_state = self._get_power_state(instance)
3493 instance.vm_state = vm_states.ACTIVE
3494 instance.task_state = None
3496 # Delete an image(VM snapshot) for a shelved instance
3497 snapshot_id = instance.system_metadata.get('shelved_image_id')
3498 if snapshot_id:
3499 self._delete_snapshot_of_shelved_instance(context, instance,
3500 snapshot_id)
3502 # Delete system_metadata for a shelved instance
3503 compute_utils.remove_shelved_keys_from_system_metadata(instance)
3505 instance.save(expected_task_state=task_states.POWERING_ON)
3506 self._notify_about_instance_usage(context, instance, "power_on.end")
3507 compute_utils.notify_about_instance_action(context, instance,
3508 self.host, action=fields.NotificationAction.POWER_ON,
3509 phase=fields.NotificationPhase.END)
3511 @messaging.expected_exceptions(NotImplementedError,
3512 exception.TriggerCrashDumpNotSupported,
3513 exception.InstanceNotRunning)
3514 @wrap_exception()
3515 @wrap_instance_event(prefix='compute')
3516 @wrap_instance_fault
3517 def trigger_crash_dump(self, context, instance):
3518 """Trigger crash dump in an instance."""
3520 self._notify_about_instance_usage(context, instance,
3521 "trigger_crash_dump.start")
3522 compute_utils.notify_about_instance_action(context, instance,
3523 self.host, action=fields.NotificationAction.TRIGGER_CRASH_DUMP,
3524 phase=fields.NotificationPhase.START)
3526 # This method does not change task_state and power_state because the
3527 # effect of a trigger depends on the user's configuration.
3528 self.driver.trigger_crash_dump(instance)
3530 self._notify_about_instance_usage(context, instance,
3531 "trigger_crash_dump.end")
3532 compute_utils.notify_about_instance_action(context, instance,
3533 self.host, action=fields.NotificationAction.TRIGGER_CRASH_DUMP,
3534 phase=fields.NotificationPhase.END)
3536 @wrap_exception()
3537 @reverts_task_state
3538 @wrap_instance_event(prefix='compute')
3539 @wrap_instance_fault
3540 def soft_delete_instance(self, context, instance):
3541 """Soft delete an instance on this host."""
3542 with compute_utils.notify_about_instance_delete(
3543 self.notifier, context, instance, 'soft_delete',
3544 source=fields.NotificationSource.COMPUTE):
3545 try:
3546 self.driver.soft_delete(instance)
3547 except NotImplementedError:
3548 # Fall back to just powering off the instance if the
3549 # hypervisor doesn't implement the soft_delete method
3550 self._power_off_instance(
3551 context, instance, clean_shutdown=False
3552 )
3553 instance.power_state = self._get_power_state(instance)
3554 instance.vm_state = vm_states.SOFT_DELETED
3555 instance.task_state = None
3556 instance.save(expected_task_state=[task_states.SOFT_DELETING])
3558 @wrap_exception()
3559 @reverts_task_state
3560 @wrap_instance_event(prefix='compute')
3561 @wrap_instance_fault
3562 def restore_instance(self, context, instance):
3563 """Restore a soft-deleted instance on this host."""
3564 self._notify_about_instance_usage(context, instance, "restore.start")
3565 compute_utils.notify_about_instance_action(context, instance,
3566 self.host, action=fields.NotificationAction.RESTORE,
3567 phase=fields.NotificationPhase.START)
3568 try:
3569 self.driver.restore(instance)
3570 except NotImplementedError:
3571 # Fall back to just powering on the instance if the hypervisor
3572 # doesn't implement the restore method
3573 self._power_on(context, instance)
3574 instance.power_state = self._get_power_state(instance)
3575 instance.vm_state = vm_states.ACTIVE
3576 instance.task_state = None
3577 instance.save(expected_task_state=task_states.RESTORING)
3578 self._notify_about_instance_usage(context, instance, "restore.end")
3579 compute_utils.notify_about_instance_action(context, instance,
3580 self.host, action=fields.NotificationAction.RESTORE,
3581 phase=fields.NotificationPhase.END)
3583 @staticmethod
3584 def _set_migration_status(migration, status):
3585 """Set the status, and guard against a None being passed in.
3587 This is useful as some of the compute RPC calls will not pass
3588 a migration object in older versions. The check can be removed when
3589 we move past 4.x major version of the RPC API.
3590 """
3591 if migration:
3592 migration.status = status
3593 migration.save()
3595 @staticmethod
3596 def _reimage_failed_callback(event_name, instance):
3597 msg = ('Cinder reported failure during reimaging '
3598 'with %(event)s for instance %(uuid)s')
3599 msg_args = {'event': event_name, 'uuid': instance.uuid}
3600 LOG.error(msg, msg_args)
3601 raise exception.ReimageException(msg % msg_args)
3603 def _detach_root_volume(self, context, instance, root_bdm):
3604 volume_id = root_bdm.volume_id
3605 mp = root_bdm.device_name
3606 old_connection_info = jsonutils.loads(root_bdm.connection_info)
3607 try:
3608 self.driver.detach_volume(context, old_connection_info,
3609 instance, root_bdm.device_name)
3610 except exception.DiskNotFound as err:
3611 LOG.warning('Ignoring DiskNotFound exception while '
3612 'detaching volume %(volume_id)s from '
3613 '%(mp)s : %(err)s',
3614 {'volume_id': volume_id, 'mp': mp,
3615 'err': err}, instance=instance)
3616 except exception.DeviceDetachFailed:
3617 with excutils.save_and_reraise_exception():
3618 LOG.warning('Guest refused to detach volume %(vol)s',
3619 {'vol': volume_id}, instance=instance)
3620 self.volume_api.roll_detaching(context, volume_id)
3621 except Exception:
3622 with excutils.save_and_reraise_exception():
3623 LOG.exception('Failed to detach volume '
3624 '%(volume_id)s from %(mp)s',
3625 {'volume_id': volume_id, 'mp': mp},
3626 instance=instance)
3627 self.volume_api.roll_detaching(context, volume_id)
3629 def _rebuild_volume_backed_instance(self, context, instance, bdms,
3630 image_id):
3631 # Get root bdm and attachment ID associated to it
3632 root_bdm = compute_utils.get_root_bdm(context, instance, bdms)
3633 old_attachment_id = root_bdm.attachment_id
3635 # Create a new attachment and delete the previous attachment
3636 # We create a new attachment first to keep the volume in
3637 # reserved state after old attachment is deleted and avoid any
3638 # races in between the attachment create and delete.
3639 attachment_id = None
3640 try:
3641 attachment_id = self.volume_api.attachment_create(
3642 context, root_bdm.volume_id, instance.uuid)['id']
3643 self._detach_root_volume(context, instance, root_bdm)
3644 root_bdm.attachment_id = attachment_id
3645 root_bdm.save()
3646 self.volume_api.attachment_delete(context,
3647 old_attachment_id)
3648 except exception.InstanceNotFound:
3649 # This means we failed to save the new attachment because
3650 # the instance is deleted, so (try to) delete it and abort.
3651 try:
3652 self.volume_api.attachment_delete(context,
3653 attachment_id)
3654 except cinder_exception.ClientException:
3655 LOG.error('Failed to delete new attachment %s',
3656 attachment_id)
3657 msg = _('Failed to rebuild volume backed instance.')
3658 raise exception.BuildAbortException(
3659 instance_uuid=instance.uuid, reason=msg)
3660 except cinder_exception.ClientException:
3661 if attachment_id:
3662 LOG.error('Failed to delete old attachment %s',
3663 old_attachment_id)
3664 else:
3665 LOG.error('Failed to create new attachment')
3666 msg = _('Failed to rebuild volume backed instance.')
3667 raise exception.BuildAbortException(
3668 instance_uuid=instance.uuid, reason=msg)
3669 events = [('volume-reimaged', root_bdm.volume_id)]
3671 # Get the image requested for rebuild
3672 try:
3673 image = self.image_api.get(context, image_id)
3674 except exception.ImageNotFound:
3675 msg = _('Image %s not found.') % image_id
3676 LOG.error(msg)
3677 raise exception.BuildAbortException(
3678 instance_uuid=instance.uuid, reason=msg)
3679 image_size = int(math.ceil(float(image.get('size')) / units.Gi))
3680 deadline = CONF.reimage_timeout_per_gb * image_size
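# NOTE: a worked example (editorial, not part of the original source) of
# the deadline above: a 2.5 GiB image rounds up via math.ceil to
# image_size = 3, so if CONF.reimage_timeout_per_gb were, say, 20 seconds
# the wait deadline would be 3 * 20 = 60 seconds.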
3681 error_cb = self._reimage_failed_callback
3683 # Call cinder to perform reimage operation and wait until an
3684 # external event is triggered.
3685 try:
3686 with self.virtapi.wait_for_instance_event(instance, events,
3687 deadline=deadline,
3688 error_callback=error_cb):
3689 self.volume_api.reimage_volume(
3690 context, root_bdm.volume_id, image_id,
3691 reimage_reserved=True)
3693 except Exception as ex:
3694 LOG.error('Failed to rebuild volume backed instance: %s',
3695 str(ex), instance=instance)
3696 msg = _('Failed to rebuild volume backed instance.')
3697 raise exception.BuildAbortException(
3698 instance_uuid=instance.uuid, reason=msg)
3700 def _rebuild_default_impl(
3701 self, context, instance, image_meta, injected_files,
3702 admin_password, allocations, bdms, detach_block_devices,
3703 attach_block_devices, network_info=None, evacuate=False,
3704 block_device_info=None, preserve_ephemeral=False,
3705 accel_uuids=None, reimage_boot_volume=False):
3706 if preserve_ephemeral:  [3706 ↛ 3709: line 3706 didn't jump to line 3709 because the condition on line 3706 was never true]
3707 # The default code path does not support preserving ephemeral
3708 # partitions.
3709 raise exception.PreserveEphemeralNotSupported()
3711 accel_info = []
3712 detach_root_bdm = not reimage_boot_volume
3713 if evacuate:
3714 if instance.flavor.extra_specs.get('accel:device_profile'):  [3714 ↛ 3715: line 3714 didn't jump to line 3715 because the condition on line 3714 was never true]
3715 try:
3716 accel_info = self._get_bound_arq_resources(
3717 context, instance, accel_uuids or [])
3718 except (Exception, eventlet.timeout.Timeout) as exc:
3719 LOG.exception(exc)
3720 self._build_resources_cleanup(instance, network_info)
3721 msg = _('Failure getting accelerator resources.')
3722 raise exception.BuildAbortException(
3723 instance_uuid=instance.uuid, reason=msg)
3724 detach_block_devices(context, bdms,
3725 detach_root_bdm=detach_root_bdm)
3726 else:
3727 self._power_off_instance(context, instance, clean_shutdown=True)
3728 detach_block_devices(context, bdms,
3729 detach_root_bdm=detach_root_bdm)
3730 if reimage_boot_volume:
3731 # Previously, the calls reaching here were for image-
3732 # backed instance rebuilds and didn't have a root BDM,
3733 # so now we need to handle the root BDM case as well.
3734 # For the root BDM, we do the attach/detach operations
3735 # manually, as we want to maintain a 'reserved' state
3736 # throughout the reimage process on the cinder side, so
3737 # we exclude the root BDM from certain operations
3738 # here, i.e. deleting its mapping before the destroy call.
3739 block_device_info_copy = copy.deepcopy(block_device_info)
3740 root_bdm = compute_utils.get_root_bdm(context, instance, bdms)
3741 mapping = block_device_info_copy["block_device_mapping"]
3742 # drop root bdm from the mapping
3743 mapping = [
3744 bdm for bdm in mapping
3745 if bdm["volume_id"] != root_bdm.volume_id
3746 ]
3747 self.driver.destroy(context, instance,
3748 network_info=network_info,
3749 block_device_info=block_device_info_copy)
3750 else:
3751 self.driver.destroy(context, instance,
3752 network_info=network_info,
3753 block_device_info=block_device_info)
3754 try:
3755 accel_info = self._get_accel_info(context, instance)
3756 except Exception as exc:
3757 LOG.exception(exc)
3758 self._build_resources_cleanup(instance, network_info)
3759 msg = _('Failure getting accelerator resources.')
3760 raise exception.BuildAbortException(
3761 instance_uuid=instance.uuid, reason=msg)
3762 if reimage_boot_volume:
3763 is_volume_backed = compute_utils.is_volume_backed_instance(
3764 context, instance, bdms)
3765 if is_volume_backed:
3766 self._rebuild_volume_backed_instance(
3767 context, instance, bdms, image_meta.id)
3769 instance.task_state = task_states.REBUILD_BLOCK_DEVICE_MAPPING
3770 instance.save(expected_task_state=[task_states.REBUILDING])
3772 new_block_device_info = attach_block_devices(context, instance, bdms)
3774 instance.task_state = task_states.REBUILD_SPAWNING
3775 instance.save(
3776 expected_task_state=[task_states.REBUILD_BLOCK_DEVICE_MAPPING])
3778 with instance.mutated_migration_context():
3779 self.driver.spawn(context, instance, image_meta, injected_files,
3780 admin_password, allocations,
3781 network_info=network_info,
3782 block_device_info=new_block_device_info,
3783 accel_info=accel_info)
3785 def _notify_instance_rebuild_error(self, context, instance, error, bdms):
3786 self._notify_about_instance_usage(context, instance,
3787 'rebuild.error', fault=error)
3788 compute_utils.notify_about_instance_rebuild(
3789 context, instance, self.host,
3790 phase=fields.NotificationPhase.ERROR, exception=error, bdms=bdms)
3792 @messaging.expected_exceptions(exception.PreserveEphemeralNotSupported,
3793 exception.BuildAbortException)
3794 @wrap_exception()
3795 @reverts_task_state
3796 @wrap_instance_event(prefix='compute')
3797 @wrap_instance_fault
3798 def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
3799 injected_files, new_pass, orig_sys_metadata,
3800 bdms, recreate, on_shared_storage,
3801 preserve_ephemeral, migration,
3802 scheduled_node, limits, request_spec, accel_uuids,
3803 reimage_boot_volume=None, target_state=None):
3804 """Destroy and re-make this instance.
3806 A 'rebuild' effectively purges all existing data from the system and
3807 remakes the VM with given 'metadata' and 'personalities'.
3809 :param context: `nova.RequestContext` object
3810 :param instance: Instance object
3811 :param orig_image_ref: Original image_ref before rebuild
3812 :param image_ref: New image_ref for rebuild
3813 :param injected_files: Files to inject
3814 :param new_pass: password to set on rebuilt instance
3815 :param orig_sys_metadata: instance system metadata from pre-rebuild
3816 :param bdms: block-device-mappings to use for rebuild
3817 :param recreate: True if the instance is being evacuated (e.g. the
3818 hypervisor it was on failed) - cleanup of old state will be
3819 skipped.
3820 :param on_shared_storage: True if instance files on shared storage.
3821 If not provided then information from the
3822 driver will be used to decide if the instance
3823 files are available or not on the target host
3824 :param preserve_ephemeral: True if the default ephemeral storage
3825 partition must be preserved on rebuild
3826 :param migration: a Migration object if one was created for this
3827 rebuild operation (if it's a part of evacuate)
3828 :param scheduled_node: A node of the host chosen by the scheduler. If a
3829 host was specified by the user, this will be
3830 None
3831 :param limits: Overcommit limits set by the scheduler. If a host was
3832 specified by the user, this will be None
3833 :param request_spec: a RequestSpec object used to schedule the instance
3834 :param accel_uuids: a list of cyborg ARQ uuids
3835 :param reimage_boot_volume: Boolean to specify whether the user has
3836 explicitly requested to rebuild a boot
3837 volume or None if RPC version is <=6.0
3838 :param target_state: Set a target state for the evacuated instance or
3839 None if RPC version is <=6.1.
3841 """
3842 # recreate=True means the instance is being evacuated from a failed
3843 # host to a new destination host (this host). The 'recreate' variable
3844 # name is confusing, so rename it to evacuate here at the top, which
3845 # is simpler than renaming a parameter in an RPC versioned method.
3846 evacuate = recreate
3847 context = context.elevated()
3849 if evacuate:
3850 LOG.info("Evacuating instance", instance=instance)
3851 else:
3852 LOG.info("Rebuilding instance", instance=instance)
3854 if evacuate:
3855 # This is an evacuation to a new host, so we need to perform a
3856 # resource claim.
3857 rebuild_claim = self.rt.rebuild_claim
3858 else:
3859 # This is a rebuild to the same host, so we don't need to make
3860 # a claim since the instance is already on this host.
3861 rebuild_claim = claims.NopClaim
3863 if image_ref:
3864 image_meta = objects.ImageMeta.from_image_ref(
3865 context, self.image_api, image_ref)
3866 elif evacuate:
3867 # For evacuate the API does not send down the image_ref since the
3868 # image does not change so just get it from what was stashed in
3869 # the instance system_metadata when the instance was created (or
3870 # last rebuilt). This also works for volume-backed instances.
3871 image_meta = instance.image_meta
3872 else:
3873 image_meta = objects.ImageMeta()
3875 # NOTE(mriedem): On an evacuate, we need to update
3877 # the instance's host and node properties to reflect its
3877 # destination node for the evacuate.
3878 if not scheduled_node:
3879 if evacuate:
3880 try:
3881 compute_node = self._get_compute_info(context, self.host)
3882 scheduled_node = compute_node.hypervisor_hostname
3883 except exception.ComputeHostNotFound as e:
3884 # This means we were asked to rebuild one of our own
3885 # instances, or another instance as a target of an
3886 # evacuation, but we are unable to find a matching compute
3887 # node.
3888 LOG.exception('Failed to get compute_info for %s',
3889 self.host)
3890 self._set_migration_status(migration, 'failed')
3891 self._notify_instance_rebuild_error(context, instance, e,
3892 bdms)
3893 raise exception.InstanceFaultRollback(
3894 inner_exception=exception.BuildAbortException(
3895 instance_uuid=instance.uuid,
3896 reason=e.format_message()))
3898 else:
3899 scheduled_node = instance.node
3901 allocs = self.reportclient.get_allocations_for_consumer(
3902 context, instance.uuid)
3904 # If the resource claim or group policy validation fails before we
3905 # do anything to the guest or its networking/volumes we want to keep
3906 # the current status rather than put the instance into ERROR status.
3907 instance_state = instance.vm_state
3908 with self._error_out_instance_on_exception(
3909 context, instance, instance_state=instance_state):
3910 try:
3911 self._do_rebuild_instance_with_claim(
3912 context, instance, orig_image_ref,
3913 image_meta, injected_files, new_pass, orig_sys_metadata,
3914 bdms, evacuate, on_shared_storage, preserve_ephemeral,
3915 migration, request_spec, allocs, rebuild_claim,
3916 scheduled_node, limits, accel_uuids, reimage_boot_volume,
3917 target_state)
3918 except (exception.ComputeResourcesUnavailable,
3919 exception.RescheduledException) as e:
3920 if isinstance(e, exception.ComputeResourcesUnavailable):
3921 LOG.debug("Could not rebuild instance on this host, not "
3922 "enough resources available.", instance=instance)
3923 else:
3924 # RescheduledException is raised by the late server group
3925 # policy check during evacuation if a parallel scheduling
3926 # violated the policy.
3927 # We catch the RescheduledException here but we don't have
3928 # the plumbing to do an actual reschedule so we abort the
3929 # operation.
3930 LOG.debug("Could not rebuild instance on this host, "
3931 "late server group check failed.",
3932 instance=instance)
3933 # NOTE(ndipanov): We just abort the build for now and leave a
3934 # migration record for potential cleanup later
3935 self._set_migration_status(migration, 'failed')
3936 # Since the claim failed, we need to remove the allocation
3937 # created against the destination node. Note that we can only
3938 # get here when evacuating to a destination node. Rebuilding
3939 # on the same host (not evacuate) uses the NopClaim which will
3940 # not raise ComputeResourcesUnavailable.
3941 self.rt.delete_allocation_for_evacuated_instance(
3942 context, instance, scheduled_node, node_type='destination')
3943 self._notify_instance_rebuild_error(context, instance, e, bdms)
3944 # Wrap this in InstanceFaultRollback so that the
3945 # _error_out_instance_on_exception context manager keeps the
3946 # vm_state unchanged.
3947 raise exception.InstanceFaultRollback(
3948 inner_exception=exception.BuildAbortException(
3949 instance_uuid=instance.uuid,
3950 reason=e.format_message()))
3951 except (exception.InstanceNotFound,
3952 exception.UnexpectedDeletingTaskStateError) as e:
3953 LOG.debug('Instance was deleted while rebuilding',
3954 instance=instance)
3955 self._set_migration_status(migration, 'failed')
3956 self._notify_instance_rebuild_error(context, instance, e, bdms)
3957 except Exception as e:
3958 self._set_migration_status(migration, 'failed')
3959 if evacuate or scheduled_node is not None:
3960 self.rt.delete_allocation_for_evacuated_instance(
3961 context, instance, scheduled_node,
3962 node_type='destination')
3963 self._notify_instance_rebuild_error(context, instance, e, bdms)
3964 raise
3965 else:
3966 # NOTE(gibi): Let the resource tracker set the instance
3967 # host and drop the migration context as we need to hold the
3968 # COMPUTE_RESOURCE_SEMAPHORE to avoid the race with
3969 # _update_available_resources. See bug 1896463.
3970 self.rt.finish_evacuation(instance, scheduled_node, migration)
3972 def _do_rebuild_instance_with_claim(
3973 self, context, instance, orig_image_ref, image_meta,
3974 injected_files, new_pass, orig_sys_metadata, bdms, evacuate,
3975 on_shared_storage, preserve_ephemeral, migration, request_spec,
3976 allocations, rebuild_claim, scheduled_node, limits, accel_uuids,
3977 reimage_boot_volume, target_state):
3978 """Helper to avoid deep nesting in the top-level method."""
3980 provider_mapping = None
3981 if evacuate:
3982 provider_mapping = self._get_request_group_mapping(request_spec)
3984 if provider_mapping:
3985 compute_utils.update_pci_request_with_placement_allocations(
3986 context,
3987 self.reportclient,
3988 instance.pci_requests.requests,
3989 provider_mapping,
3990 )
3992 claim_context = rebuild_claim(
3993 context, instance, scheduled_node, allocations,
3994 limits=limits, image_meta=image_meta, migration=migration)
3996 with claim_context:
3997 self._do_rebuild_instance(
3998 context, instance, orig_image_ref, image_meta, injected_files,
3999 new_pass, orig_sys_metadata, bdms, evacuate, on_shared_storage,
4000 preserve_ephemeral, migration, request_spec, allocations,
4001 provider_mapping, accel_uuids, reimage_boot_volume,
4002 target_state)
4004 @staticmethod
4005 def _get_image_name(image_meta):
4006 if image_meta.obj_attr_is_set("name"):
4007 return image_meta.name
4008 else:
4009 return ''
4011 def _do_rebuild_instance(
4012 self, context, instance, orig_image_ref, image_meta,
4013 injected_files, new_pass, orig_sys_metadata, bdms, evacuate,
4014 on_shared_storage, preserve_ephemeral, migration, request_spec,
4015 allocations, request_group_resource_providers_mapping,
4016 accel_uuids, reimage_boot_volume, target_state):
4017 orig_vm_state = instance.vm_state
4019 if evacuate:
4020 if target_state and orig_vm_state != vm_states.ERROR:  [4020 ↛ 4023: line 4020 didn't jump to line 4023 because the condition on line 4020 was never true]
4021 # This will ensure that at destination the instance will have
4022 # the desired state.
4023 if target_state not in vm_states.ALLOW_TARGET_STATES:
4024 raise exception.InstanceEvacuateNotSupportedTargetState(
4025 target_state=target_state)
4026 orig_vm_state = target_state
4028 if request_spec:
4029 # NOTE(gibi): Do a late check of server group policy as
4030 # parallel scheduling could violate such policy. This will
4031 # cause the evacuate to fail as rebuild does not implement
4032 # reschedule.
4033 hints = self._get_scheduler_hints({}, request_spec)
4034 self._validate_instance_group_policy(context, instance, hints)
4036 if not self.driver.capabilities.get("supports_evacuate", False):
4037 raise exception.InstanceEvacuateNotSupported
4039 self._check_instance_exists(instance)
4041 if on_shared_storage is None:
4042 LOG.debug('on_shared_storage is not provided, using driver '
4043 'information to decide if the instance needs to '
4044 'be evacuated')
4045 on_shared_storage = self.driver.instance_on_disk(instance)
4047 elif (on_shared_storage !=
4048 self.driver.instance_on_disk(instance)):
4049 # To cover the case when the admin expects that the instance
4050 # files are on shared storage but are not accessible, and vice versa
4051 raise exception.InvalidSharedStorage(
4052 _("Invalid state of instance files on shared"
4053 " storage"))
4055 if on_shared_storage:
4056 LOG.info('disk on shared storage, evacuating using'
4057 ' existing disk')
4058 elif instance.image_ref:
4059 orig_image_ref = instance.image_ref
4060 LOG.info("disk not on shared storage, evacuating from "
4061 "image: '%s'", str(orig_image_ref))
4062 else:
4063 LOG.info('disk on volume, evacuating using existing '
4064 'volume')
4066 # We check trusted certs capabilities for both evacuate (rebuild on
4067 # another host) and rebuild (rebuild on the same host) because for
4068 # evacuate we need to make sure an instance with trusted certs can
4069 # have the image verified with those certs during rebuild, and for
4070 # rebuild we could be rebuilding a server that started out with no
4071 # trusted certs on this host, and then was rebuilt with trusted certs
4072 # for a new image, in which case we need to validate that new image
4073 # with the trusted certs during the rebuild.
4074 self._check_trusted_certs(instance)
4076 # This instance.exists message should contain the original
4077 # image_ref, not the new one. Since the DB has been updated
4078 # to point to the new one... we have to override it.
4079 orig_image_ref_url = self.image_api.generate_image_url(orig_image_ref,
4080 context)
4081 extra_usage_info = {'image_ref_url': orig_image_ref_url}
4082 compute_utils.notify_usage_exists(
4083 self.notifier, context, instance, self.host,
4084 current_period=True, system_metadata=orig_sys_metadata,
4085 extra_usage_info=extra_usage_info)
4087 # This message should contain the new image_ref
4088 extra_usage_info = {'image_name': self._get_image_name(image_meta)}
4089 self._notify_about_instance_usage(context, instance,
4090 "rebuild.start", extra_usage_info=extra_usage_info)
4091 # NOTE: image_name is not included in the versioned notification
4092 # because we already provide the image_uuid in the notification
4093 # payload and the image details can be looked up via the uuid.
4094 compute_utils.notify_about_instance_rebuild(
4095 context, instance, self.host,
4096 phase=fields.NotificationPhase.START,
4097 bdms=bdms)
4099 instance.power_state = self._get_power_state(instance)
4100 instance.task_state = task_states.REBUILDING
4101 instance.save(expected_task_state=[task_states.REBUILDING])
4103 if evacuate:
4104 self.network_api.setup_networks_on_host(
4105 context, instance, self.host)
4106 # For nova-network this is needed to move floating IPs
4107 # For neutron this updates the host in the port binding
4108 # TODO(cfriesen): this network_api call and the one above
4109 # are so similar, we should really try to unify them.
4110 self.network_api.setup_instance_network_on_host(
4111 context, instance, self.host, migration,
4112 provider_mappings=request_group_resource_providers_mapping)
4113 # TODO(mriedem): Consider decorating setup_instance_network_on_host
4114 # with @api.refresh_cache and then we wouldn't need this explicit
4115 # call to get_instance_nw_info.
4116 network_info = self.network_api.get_instance_nw_info(context,
4117 instance)
4118 else:
4119 network_info = instance.get_network_info()
4121 if bdms is None:
4122 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
4123 context, instance.uuid)
4125 block_device_info = \
4126 self._get_instance_block_device_info(
4127 context, instance, bdms=bdms)
4129 def detach_block_devices(context, bdms, detach_root_bdm=True):
4130 for bdm in bdms:
4131 # Previously, the calls made to this method by the rebuild
4132 # instance operation were for image-backed instances, which
4133 # assumed we only had attached volumes and no root BDM.
4134 # Now we need to handle the case of the root BDM, which we
4135 # manage manually, so we skip the attachment create/delete
4136 # calls from here.
4137 # The detach_root_bdm parameter is only passed while
4138 # rebuilding a volume-backed instance, so we don't have
4139 # to worry about other callers as they won't satisfy this
4140 # condition.
4141 # For the evacuate case, detach_root_bdm is always True
4142 # since we don't have the reimage_boot_volume parameter in
4143 # that case, so this branch will not be executed.
4144 if not detach_root_bdm and bdm.is_root:  [4144 ↛ 4145: line 4144 didn't jump to line 4145 because the condition on line 4144 was never true]
4145 continue
4146 if bdm.is_volume:  [4146 ↛ 4130: line 4146 didn't jump to line 4130 because the condition on line 4146 was always true]
4147 # NOTE (ildikov): Having the attachment_id set in the BDM
4148 # means that it's the new Cinder attach/detach flow
4149 # (available from v3.44). In that case we explicitly
4150 # attach and detach the volumes through attachment level
4151 # operations. In this scenario _detach_volume will delete
4152 # the existing attachment which would make the volume
4153 # status change to 'available' if we don't pre-create
4154 # another empty attachment before deleting the old one.
4155 attachment_id = None
4156 if bdm.attachment_id:  [4156 ↛ 4157: line 4156 didn't jump to line 4157 because the condition on line 4156 was never true]
4157 attachment_id = self.volume_api.attachment_create(
4158 context, bdm['volume_id'], instance.uuid)['id']
4159 self._detach_volume(context, bdm, instance,
4160 destroy_bdm=False)
4161 if attachment_id:  [4161 ↛ 4162: line 4161 didn't jump to line 4162 because the condition on line 4161 was never true]
4162 bdm.attachment_id = attachment_id
4163 bdm.save()
4165 files = self._decode_files(injected_files)
4167 kwargs = dict(
4168 context=context,
4169 instance=instance,
4170 image_meta=image_meta,
4171 injected_files=files,
4172 admin_password=new_pass,
4173 allocations=allocations,
4174 bdms=bdms,
4175 detach_block_devices=detach_block_devices,
4176 attach_block_devices=self._prep_block_device,
4177 block_device_info=block_device_info,
4178 network_info=network_info,
4179 preserve_ephemeral=preserve_ephemeral,
4180 evacuate=evacuate,
4181 accel_uuids=accel_uuids,
4182 reimage_boot_volume=reimage_boot_volume)
4183 try:
4184 with instance.mutated_migration_context():
4185 self.driver.rebuild(**kwargs)
4186 except NotImplementedError:
4187 # NOTE(rpodolyaka): driver doesn't provide specialized version
4188 # of rebuild, fall back to the default implementation
4189 self._rebuild_default_impl(**kwargs)
4190 self._update_instance_after_spawn(instance)
4191 instance.save(expected_task_state=[task_states.REBUILD_SPAWNING])
4193 if orig_vm_state == vm_states.STOPPED:
4194 LOG.info("bringing vm to original state: '%s'",
4195 orig_vm_state, instance=instance)
4196 instance.vm_state = vm_states.ACTIVE
4197 instance.task_state = task_states.POWERING_OFF
4198 instance.progress = 0
4199 instance.save()
4200 self.stop_instance(context, instance, False)
4201 # TODO(melwitt): We should clean up instance console tokens here in the
4202 # case of evacuate. The instance is on a new host and will need to
4203 # establish a new console connection.
4204 self._update_scheduler_instance_info(context, instance)
4205 self._notify_about_instance_usage(
4206 context, instance, "rebuild.end",
4207 network_info=network_info,
4208 extra_usage_info=extra_usage_info)
4209 compute_utils.notify_about_instance_rebuild(
4210 context, instance, self.host,
4211 phase=fields.NotificationPhase.END,
4212 bdms=bdms)
4214 def _handle_bad_volumes_detached(self, context, instance, bad_devices,
4215 block_device_info):
4216 """Handle cases where the virt-layer had to detach non-working volumes
4217 in order to complete an operation.
4218 """
4219 for bdm in block_device_info['block_device_mapping']:
4220 if bdm.get('mount_device') in bad_devices:
4221 try:
4222 volume_id = bdm['connection_info']['data']['volume_id']
4223 except KeyError:
4224 continue
4226 # NOTE(sirp): ideally we'd just call
4227 # `compute_api.detach_volume` here but since that hits the
4228 # DB directly, that's off limits from within the
4229 # compute-manager.
4230 #
4231 # API-detach
4232 LOG.info("Detaching from volume api: %s", volume_id)
4233 self.volume_api.begin_detaching(context, volume_id)
4235 # Manager-detach
4236 self.detach_volume(context, volume_id, instance)
4238 def _get_accel_info(self, context, instance):
4239 dp_name = instance.flavor.extra_specs.get('accel:device_profile')
4240 if dp_name:
4241 cyclient = cyborg.get_client(context)
4242 accel_info = cyclient.get_arqs_for_instance(instance.uuid)
4243 else:
4244 accel_info = []
4245 return accel_info
4247 def _delete_dangling_bdms(self, context, instance, bdms):
4248 """Deletes dangling or stale attachments for volume from
4249 Nova and Cinder DB so both service DBs can be in sync.
4251 Retrieves volume attachments from the Nova block_device_mapping
4252 table and verifies them with the Cinder volume_attachment table.
4253 If attachment is not present in any one of the DBs, delete
4254 attachments from the other DB.
4256 :param context: The nova request context.
4257 :param instance: instance object.
4258 :param bdms: BlockDeviceMappingList list object.
4259 """
4261 try:
4262 cinder_attachments = self.volume_api.attachment_get_all(
4263 context, instance.uuid)
4264 except (keystone_exception.EndpointNotFound,
4265 cinder_exception.ClientException):
4266 # if cinder is not deployed we never need to check for
4267 # attachments as there cannot be dangling bdms.
4268 # if we cannot connect to cinder we cannot check for dangling
4269 # bdms so we skip the check. Intermittent connection issues
4270 # to cinder should not cause instance reboot to fail.
4271 return
4273 # attachments present in nova DB, ones nova knows about
4274 nova_attachments = []
4275 bdms_to_delete = []
4276 for bdm in bdms.objects:
4277 if bdm.volume_id and bdm.attachment_id:
4278 try:
4279 self.volume_api.attachment_get(context, bdm.attachment_id)
4280 except exception.VolumeAttachmentNotFound:
4281 LOG.info(
4282 f"Removing stale volume attachment "
4283 f"'{bdm.attachment_id}' from instance for "
4284 f"volume '{bdm.volume_id}'.", instance=instance)
4285 bdm.destroy()
4286 bdms_to_delete.append(bdm)
4287 else:
4288 nova_attachments.append(bdm.attachment_id)
4290 cinder_attachments = [each['id'] for each in cinder_attachments]
4292 if len(set(cinder_attachments) - set(nova_attachments)):
4293 LOG.info(
4294 "Removing stale volume attachments of instance from "
4295 "Cinder", instance=instance)
4296 for each_attach in set(cinder_attachments) - set(nova_attachments):
4297 # delete only cinder known attachments, from cinder DB.
4298 LOG.debug(
4299 f"Removing attachment '{each_attach}'", instance=instance)
4300 self.volume_api.attachment_delete(context, each_attach)
4302 # refresh bdms object
4303 for bdm in bdms_to_delete:
4304 bdms.objects.remove(bdm)
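# --- Illustrative aside (not part of the module) ---
# The reconciliation above boils down to a two-way comparison of the
# attachment ids each service knows about: Nova records whose attachment
# vanished from Cinder are destroyed, and Cinder attachments Nova no longer
# tracks are deleted. A minimal, self-contained sketch of that decision
# (plain sets, hypothetical helper name, not the module's API):
def _stale_attachments(nova_attachment_ids, cinder_attachment_ids):
    """Return (stale_in_nova, stale_in_cinder) attachment id sets."""
    nova_ids = set(nova_attachment_ids)
    cinder_ids = set(cinder_attachment_ids)
    return nova_ids - cinder_ids, cinder_ids - nova_ids

# 'a2' was removed behind Nova's back; 'a9' is unknown to Nova.
assert _stale_attachments({'a1', 'a2'}, {'a1', 'a9'}) == ({'a2'}, {'a9'})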
4306 def _get_share_info(self, context, instance, check_status=True):
4307 share_info = objects.ShareMappingList(context)
4309 for share_mapping in objects.ShareMappingList.get_by_instance_uuid(
4310 context, instance.uuid
4311 ):
4312 share_info.objects.append(share_mapping)
4314 if check_status:
4315 fsm = fields.ShareMappingStatus
4316 if (
4317 share_mapping.status == fsm.ATTACHING or
4318 share_mapping.status == fsm.DETACHING
4319 ):
4320 # If the share status is attaching it means we are racing
4321 # with the compute node. The mount is not completed yet or
4322 # something really bad happened. So we set the instance in
4323 # error state.
4324 LOG.error(
4325 "Share id '%s' attached to server id '%s' is "
4326 "still in '%s' state. Setting the instance "
4327 "in error.",
4328 share_mapping.share_id,
4329 instance.id,
4330 share_mapping.status,
4331 )
4332 self._set_instance_obj_error_state(
4333 instance, clean_task_state=True
4334 )
4335 raise exception.ShareErrorUnexpectedStatus(
4336 share_id=share_mapping.share_id,
4337 instance_uuid=instance.uuid,
4338 )
4340 if share_mapping.status == fsm.ERROR:
4341 LOG.warning(
4342 "Share id '%s' attached to server id '%s' is in "
4343 "error state.",
4344 share_mapping.share_id,
4345 instance.id
4346 )
4348 return share_info
4350 @wrap_exception()
4351 @reverts_task_state
4352 @wrap_instance_event(prefix='compute')
4353 @wrap_instance_fault
4354 def reboot_instance(self, context, instance, block_device_info,
4355 reboot_type):
4356 @utils.synchronized(instance.uuid)
4357 def do_reboot_instance(context, instance, block_device_info,
4358 reboot_type):
4359 self._reboot_instance(context, instance, block_device_info,
4360 reboot_type)
4361 do_reboot_instance(context, instance, block_device_info, reboot_type)
4363 def _reboot_instance(self, context, instance, block_device_info,
4364 reboot_type):
4365 """Reboot an instance on this host."""
4366 # acknowledge the request made it to the manager
4367 if reboot_type == "SOFT":
4368 instance.task_state = task_states.REBOOT_PENDING
4369 expected_states = task_states.soft_reboot_states
4370 else:
4371 instance.task_state = task_states.REBOOT_PENDING_HARD
4372 expected_states = task_states.hard_reboot_states
4374 context = context.elevated()
4375 LOG.info("Rebooting instance", instance=instance)
4377 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
4378 context, instance.uuid)
4380 self._delete_dangling_bdms(context, instance, bdms)
4382 block_device_info = self._get_instance_block_device_info(
4383 context, instance, bdms=bdms)
4385 network_info = self.network_api.get_instance_nw_info(context, instance)
4387 accel_info = self._get_accel_info(context, instance)
4389 share_info = self._get_share_info(context, instance)
4391 self._notify_about_instance_usage(context, instance, "reboot.start")
4392 compute_utils.notify_about_instance_action(
4393 context, instance, self.host,
4394 action=fields.NotificationAction.REBOOT,
4395 phase=fields.NotificationPhase.START,
4396 bdms=bdms
4397 )
4399 instance.power_state = self._get_power_state(instance)
4400 instance.save(expected_task_state=expected_states)
4402 if instance.power_state != power_state.RUNNING:
4403 state = instance.power_state
4404 running = power_state.RUNNING
4405 LOG.warning('trying to reboot a non-running instance:'
4406 ' (state: %(state)s expected: %(running)s)',
4407 {'state': state, 'running': running},
4408 instance=instance)
4410 def bad_volumes_callback(bad_devices):
4411 self._handle_bad_volumes_detached(
4412 context, instance, bad_devices, block_device_info)
4414 try:
4415 # Don't change it out of rescue mode
4416 if instance.vm_state == vm_states.RESCUED:
4417 new_vm_state = vm_states.RESCUED
4418 else:
4419 new_vm_state = vm_states.ACTIVE
4420 new_power_state = None
4421 if reboot_type == "SOFT":
4422 instance.task_state = task_states.REBOOT_STARTED
4423 expected_state = task_states.REBOOT_PENDING
4424 else:
4425 instance.task_state = task_states.REBOOT_STARTED_HARD
4426 expected_state = task_states.REBOOT_PENDING_HARD
4427 instance.save(expected_task_state=expected_state)
4429 # Attempt to mount the shares again.
4430 # Note: The API ref states that soft reboot can only be
4431 # done if the instance is in ACTIVE state. If the instance
4432 # is in ACTIVE state it cannot have a share_mapping in ERROR
4433 # so it is safe to ignore the re-mounting of the share for
4434 # soft reboot.
4435 if reboot_type == "HARD":
4436 self._mount_all_shares(context, instance, share_info)
4438 self.driver.reboot(context, instance,
4439 network_info,
4440 reboot_type,
4441 block_device_info=block_device_info,
4442 accel_info=accel_info,
4443 share_info=share_info,
4444 bad_volumes_callback=bad_volumes_callback)
4445 share_info.activate_all()
4447 except Exception as error:
4448 with excutils.save_and_reraise_exception() as ctxt:
4449 exc_info = sys.exc_info()
4450 # if the reboot failed but the VM is running don't
4451 # put it into an error state
4452 new_power_state = self._get_power_state(instance)
4453 if new_power_state == power_state.RUNNING:
4454 LOG.warning('Reboot failed but instance is running',
4455 instance=instance)
4456 compute_utils.add_instance_fault_from_exc(context,
4457 instance, error, exc_info)
4458 self._notify_about_instance_usage(context, instance,
4459 'reboot.error', fault=error)
4460 compute_utils.notify_about_instance_action(
4461 context, instance, self.host,
4462 action=fields.NotificationAction.REBOOT,
4463 phase=fields.NotificationPhase.ERROR,
4464 exception=error, bdms=bdms
4465 )
4466 ctxt.reraise = False
4467 else:
4468 LOG.error('Cannot reboot instance: %s', error,
4469 instance=instance)
4470 self._set_instance_obj_error_state(instance)
4472 if not new_power_state:
4473 new_power_state = self._get_power_state(instance)
4474 try:
4475 instance.power_state = new_power_state
4476 instance.vm_state = new_vm_state
4477 instance.task_state = None
4478 instance.save()
4479 except exception.InstanceNotFound:
4480 LOG.warning("Instance disappeared during reboot",
4481 instance=instance)
4483 self._notify_about_instance_usage(context, instance, "reboot.end")
4484 compute_utils.notify_about_instance_action(
4485 context, instance, self.host,
4486 action=fields.NotificationAction.REBOOT,
4487 phase=fields.NotificationPhase.END,
4488 bdms=bdms
4489 )
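# --- Illustrative aside (not part of the module) ---
# The reboot flow above walks the instance through a pair of task states
# chosen by the reboot type: *_PENDING[_HARD] when the request is
# acknowledged, then *_STARTED[_HARD] just before the driver call. A
# minimal sketch of that mapping; the string values are assumptions, the
# real constants live in nova.compute.task_states.
_REBOOT_TASK_STATES = {
    'SOFT': ('reboot_pending', 'reboot_started'),
    'HARD': ('reboot_pending_hard', 'reboot_started_hard'),
}

def _reboot_states(reboot_type):
    return _REBOOT_TASK_STATES['SOFT' if reboot_type == 'SOFT' else 'HARD']

assert _reboot_states('SOFT') == ('reboot_pending', 'reboot_started')
assert _reboot_states('HARD') == ('reboot_pending_hard',
                                  'reboot_started_hard')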
4491 @delete_image_on_error
4492 def _do_snapshot_instance(self, context, image_id, instance):
4493 self._snapshot_instance(context, image_id, instance,
4494 task_states.IMAGE_BACKUP)
4496 @wrap_exception()
4497 @reverts_task_state
4498 @wrap_instance_event(prefix='compute')
4499 @wrap_instance_fault
4500 def backup_instance(self, context, image_id, instance, backup_type,
4501 rotation):
4502 """Backup an instance on this host.
4504 :param backup_type: daily | weekly
4505 :param rotation: int representing how many backups to keep around
4506 """
4507 self._do_snapshot_instance(context, image_id, instance)
4508 self._rotate_backups(context, instance, backup_type, rotation)
4510 @wrap_exception()
4511 @reverts_task_state
4512 @wrap_instance_event(prefix='compute')
4513 @wrap_instance_fault
4514 @delete_image_on_error
4515 def snapshot_instance(self, context, image_id, instance):
4516 """Snapshot an instance on this host.
4518 :param context: security context
4519 :param image_id: glance.db.sqlalchemy.models.Image.Id
4520 :param instance: a nova.objects.instance.Instance object
4521 """
4522 # NOTE(dave-mcnally): the task state will already be set by the API,
4523 # but if the compute manager crashed or was restarted before the
4524 # request got here, the task state may have been cleared, so we set
4525 # it again and things continue normally.
4526 try:
4527 instance.task_state = task_states.IMAGE_SNAPSHOT
4528 instance.save(
4529 expected_task_state=task_states.IMAGE_SNAPSHOT_PENDING)
4530 except exception.InstanceNotFound:
4531 # possibly the instance no longer exists; no point in continuing
4532 LOG.debug("Instance not found, could not set state %s "
4533 "for instance.",
4534 task_states.IMAGE_SNAPSHOT, instance=instance)
4535 return
4537 except exception.UnexpectedDeletingTaskStateError:
4538 LOG.debug("Instance being deleted, snapshot cannot continue",
4539 instance=instance)
4540 return
4542 with self._snapshot_semaphore:
4543 self._snapshot_instance(context, image_id, instance,
4544 task_states.IMAGE_SNAPSHOT)
4546 def _snapshot_instance(self, context, image_id, instance,
4547 expected_task_state):
4548 context = context.elevated()
4550 instance.power_state = self._get_power_state(instance)
4551 try:
4552 instance.save()
4554 LOG.info('instance snapshotting', instance=instance)
4556 if instance.power_state != power_state.RUNNING:
4557 state = instance.power_state
4558 running = power_state.RUNNING
4559 LOG.warning('trying to snapshot a non-running instance: '
4560 '(state: %(state)s expected: %(running)s)',
4561 {'state': state, 'running': running},
4562 instance=instance)
4564 self._notify_about_instance_usage(
4565 context, instance, "snapshot.start")
4566 compute_utils.notify_about_instance_snapshot(context, instance,
4567 self.host, phase=fields.NotificationPhase.START,
4568 snapshot_image_id=image_id)
4570 def update_task_state(task_state,
4571 expected_state=expected_task_state):
4572 instance.task_state = task_state
4573 instance.save(expected_task_state=expected_state)
4575 with timeutils.StopWatch() as timer:
4576 self.driver.snapshot(context, instance, image_id,
4577 update_task_state)
4578 LOG.info('Took %0.2f seconds to snapshot the instance on '
4579 'the hypervisor.', timer.elapsed(), instance=instance)
4581 instance.task_state = None
4582 instance.save(expected_task_state=task_states.IMAGE_UPLOADING)
4584 self._notify_about_instance_usage(context, instance,
4585 "snapshot.end")
4586 compute_utils.notify_about_instance_snapshot(context, instance,
4587 self.host, phase=fields.NotificationPhase.END,
4588 snapshot_image_id=image_id)
4589 except (exception.InstanceNotFound,
4590 exception.InstanceNotRunning,
4591 exception.UnexpectedDeletingTaskStateError):
4592 # the instance got deleted during the snapshot
4593 # Quickly bail out of here
4594 msg = 'Instance disappeared during snapshot'
4595 LOG.debug(msg, instance=instance)
4596 try:
4597 image = self.image_api.get(context, image_id)
4598 if image['status'] != 'active':
4599 self.image_api.delete(context, image_id)
4600 except exception.ImageNotFound:
4601 LOG.debug('Image not found during clean up %s', image_id)
4602 except Exception:
4603 LOG.warning("Error while trying to clean up image %s",
4604 image_id, instance=instance)
4605 except exception.ImageNotFound:
4606 instance.task_state = None
4607 instance.save()
4608 LOG.warning("Image not found during snapshot", instance=instance)
4610 @messaging.expected_exceptions(NotImplementedError)
4611 @wrap_exception()
4612 def volume_snapshot_create(self, context, instance, volume_id,
4613 create_info):
4614 try:
4615 self.driver.volume_snapshot_create(context, instance, volume_id,
4616 create_info)
4617 except exception.InstanceNotRunning:
4618 # Libvirt driver can raise this exception
4619 LOG.debug('Instance disappeared during volume snapshot create',
4620 instance=instance)
4622 @messaging.expected_exceptions(NotImplementedError)
4623 @wrap_exception()
4624 def volume_snapshot_delete(self, context, instance, volume_id,
4625 snapshot_id, delete_info):
4626 try:
4627 self.driver.volume_snapshot_delete(context, instance, volume_id,
4628 snapshot_id, delete_info)
4629 except exception.InstanceNotRunning:
4630 # Libvirt driver can raise this exception
4631 LOG.debug('Instance disappeared during volume snapshot delete',
4632 instance=instance)
4634 @messaging.expected_exceptions(NotImplementedError)
4635 @wrap_exception()
4636 @wrap_instance_event(prefix='compute')
4637 @wrap_instance_fault
4638 def allow_share(self, context, instance, share_mapping):
4640 @utils.synchronized(share_mapping.share_id)
4641 def _allow_share(context, instance, share_mapping):
4642 def _apply_policy():
4643 # self.manila_api.lock(share_mapping.share_id)
4644 # Explicitly locking the share is not needed as
4645 # create_access_rule() from the sdk will do it if the
4646 # lock_visibility and lock_deletion flags are passed
4647 self.manila_api.allow(
4648 context,
4649 share_mapping.share_id,
4650 share_mapping.access_type,
4651 share_mapping.access_to,
4652 "rw",
4653 )
4655 def _wait_policy_to_be_applied():
4656 # Ensure the share policy has actually been applied; this avoids
4657 # a race condition when mounting the share too early.
4658 max_retries = CONF.manila.share_apply_policy_timeout
4659 attempt_count = 0
4660 while attempt_count < max_retries:
4661 if self.manila_api.has_access(
4662 context,
4663 share_mapping.share_id,
4664 share_mapping.access_type,
4665 share_mapping.access_to,
4666 ):
4667 LOG.debug(
4668 "Allow policy set on share %s ",
4669 share_mapping.share_id,
4670 )
4671 break
4672 else:
4673 LOG.debug(
4674 "Waiting policy to be set on share %s ",
4675 share_mapping.share_id,
4676 )
4677 time.sleep(1)
4678 attempt_count += 1
4680 if attempt_count >= max_retries:
4681 raise exception.ShareAccessGrantError(
4682 share_id=share_mapping.share_id,
4683 reason="Failed to set allow policy on share, "
4684 "too many retries",
4685 )
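# --- Illustrative aside (not part of the module) ---
# _wait_policy_to_be_applied above is a bounded poll: re-check a predicate
# once per interval until it holds or the retry budget is exhausted, then
# fail. A generic, self-contained sketch of that pattern (hypothetical
# names, no Manila/SDK calls):
import time

def _wait_for(predicate, max_retries, interval=1.0):
    """Return True once predicate() holds, False when retries run out."""
    for _ in range(max_retries):
        if predicate():
            return True
        time.sleep(interval)
    return False

# Example: poll a flag that a backend would normally flip asynchronously.
_state = {'applied': True}
assert _wait_for(lambda: _state['applied'], max_retries=3, interval=0)
assert not _wait_for(lambda: False, max_retries=2, interval=0)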
4687 try:
4688 compute_utils.notify_about_share_attach_detach(
4689 context,
4690 instance,
4691 instance.host,
4692 action=fields.NotificationAction.SHARE_ATTACH,
4693 phase=fields.NotificationPhase.START,
4694 share_id=share_mapping.share_id
4695 )
4697 share_mapping.set_access_according_to_protocol()
4699 if not self.manila_api.has_access(
4700 context,
4701 share_mapping.share_id,
4702 share_mapping.access_type,
4703 share_mapping.access_to,
4704 ):
4705 _apply_policy()
4706 _wait_policy_to_be_applied()
4708 # Set the share from attaching to inactive
4709 self._set_share_mapping_status(
4710 share_mapping, fields.ShareMappingStatus.INACTIVE
4711 )
4713 compute_utils.notify_about_share_attach_detach(
4714 context,
4715 instance,
4716 instance.host,
4717 action=fields.NotificationAction.SHARE_ATTACH,
4718 phase=fields.NotificationPhase.END,
4719 share_id=share_mapping.share_id
4720 )
4722 except (
4723 exception.ShareNotFound,
4724 exception.ShareProtocolNotSupported,
4725 exception.ShareAccessGrantError,
4726 ) as e:
4727 self._set_share_mapping_status(
4728 share_mapping, fields.ShareMappingStatus.ERROR
4729 )
4730 compute_utils.notify_about_share_attach_detach(
4731 context,
4732 instance,
4733 instance.host,
4734 action=fields.NotificationAction.SHARE_ATTACH,
4735 phase=fields.NotificationPhase.ERROR,
4736 share_id=share_mapping.share_id,
4737 exception=e
4738 )
4739 LOG.error(e.format_message())
4740 raise
4741 except (
4742 sdk_exc.BadRequestException,
4743 ) as e:
4744 self._set_share_mapping_status(
4745 share_mapping, fields.ShareMappingStatus.ERROR
4746 )
4747 compute_utils.notify_about_share_attach_detach(
4748 context,
4749 instance,
4750 instance.host,
4751 action=fields.NotificationAction.SHARE_ATTACH,
4752 phase=fields.NotificationPhase.ERROR,
4753 share_id=share_mapping.share_id,
4754 exception=e
4755 )
4756 LOG.error(
4757 "%s: %s error from url: %s, %s",
4758 e.message,
4759 e.source,
4760 e.url,
4761 e.details,
4762 )
4763 raise
4764 except keystone_exception.http.Unauthorized as e:
4765 self._set_share_mapping_status(
4766 share_mapping, fields.ShareMappingStatus.ERROR
4767 )
4768 compute_utils.notify_about_share_attach_detach(
4769 context,
4770 instance,
4771 instance.host,
4772 action=fields.NotificationAction.SHARE_ATTACH,
4773 phase=fields.NotificationPhase.ERROR,
4774 share_id=share_mapping.share_id,
4775 exception=e
4776 )
4777 LOG.error(e)
4778 raise
4780 _allow_share(context, instance, share_mapping)
4782 @messaging.expected_exceptions(NotImplementedError)
4783 @wrap_exception()
4784 @wrap_instance_event(prefix='compute')
4785 @wrap_instance_fault
4786 def deny_share(self, context, instance, share_mapping):
4788 @utils.synchronized(share_mapping.share_id)
4789 def _deny_share(context, instance, share_mapping):
4791 def check_share_usage(context, instance_uuid):
4792 share_mappings_used_by_share = (
4793 objects.share_mapping.ShareMappingList.get_by_share_id(
4794 context, share_mapping.share_id
4795 )
4796 )
4798 # Logic explanation:
4799 #
4800 # Warning: here we have the list of share_mappings using our
4801 # share (usually share_mappings is the list of share_mappings
4802 # used by an instance).
4803 # A share IS NOT used (i.e. is detachable) if:
4804 # - Its status is INACTIVE or ERROR on our instance.
4805 # - Its status is DETACHING on all other instances.
4806 # +-- reverse the logic as the function checks whether a share
4807 # | IS used.
4808 # v
4809 return not all(
4810 (
4811 (
4812 sm.instance_uuid == instance_uuid and
4813 (
4814 sm.status
4815 in (
4816 fields.ShareMappingStatus.INACTIVE,
4817 fields.ShareMappingStatus.ERROR,
4818 )
4819 )
4820 ) or
4821 sm.status == fields.ShareMappingStatus.DETACHING
4822 )
4823 for sm in share_mappings_used_by_share
4824 )
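# --- Illustrative aside (not part of the module) ---
# The expression above reads more easily as a standalone predicate: the
# share is still in use unless every mapping is either ours and in
# INACTIVE/ERROR, or in DETACHING. A self-contained sketch using plain
# (instance_uuid, status) tuples instead of ShareMapping objects;
# illustrative only.
def _share_still_used(mappings, our_uuid):
    detachable = ('inactive', 'error')
    return not all(
        (uuid == our_uuid and status in detachable) or status == 'detaching'
        for uuid, status in mappings
    )

# Ours is inactive and the other instance is detaching: no longer used.
assert not _share_still_used(
    [('i-1', 'inactive'), ('i-2', 'detaching')], our_uuid='i-1')
# Another instance still has the share active: still used.
assert _share_still_used(
    [('i-1', 'inactive'), ('i-2', 'active')], our_uuid='i-1')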
4826 try:
4827 compute_utils.notify_about_share_attach_detach(
4828 context,
4829 instance,
4830 instance.host,
4831 action=fields.NotificationAction.SHARE_DETACH,
4832 phase=fields.NotificationPhase.START,
4833 share_id=share_mapping.share_id,
4834 )
4836 still_used = check_share_usage(context, instance.uuid)
4838 share_mapping.set_access_according_to_protocol()
4840 if not still_used:
4841 # self.manila_api.unlock(share_mapping.share_id)
4842 # Explicit unlocking the share is not needed as
4843 # delete_access_rule() from the sdk will do it if the
4844 # "unrestrict" parameter is passed
4845 self.manila_api.deny(
4846 context,
4847 share_mapping.share_id,
4848 share_mapping.access_type,
4849 share_mapping.access_to,
4850 )
4852 share_mapping.delete()
4854 compute_utils.notify_about_share_attach_detach(
4855 context,
4856 instance,
4857 instance.host,
4858 action=fields.NotificationAction.SHARE_DETACH,
4859 phase=fields.NotificationPhase.END,
4860 share_id=share_mapping.share_id,
4861 )
4863 except (
4864 exception.ShareAccessRemovalError,
4865 exception.ShareProtocolNotSupported,
4866 ) as e:
4867 self._set_share_mapping_status(
4868 share_mapping, fields.ShareMappingStatus.ERROR
4869 )
4870 compute_utils.notify_about_share_attach_detach(
4871 context,
4872 instance,
4873 instance.host,
4874 action=fields.NotificationAction.SHARE_DETACH,
4875 phase=fields.NotificationPhase.ERROR,
4876 share_id=share_mapping.share_id,
4877 exception=e
4878 )
4879 LOG.error(e.format_message())
4880 raise
4881 except keystone_exception.http.Unauthorized as e:
4882 self._set_share_mapping_status(
4883 share_mapping, fields.ShareMappingStatus.ERROR
4884 )
4885 compute_utils.notify_about_share_attach_detach(
4886 context,
4887 instance,
4888 instance.host,
4889 action=fields.NotificationAction.SHARE_DETACH,
4890 phase=fields.NotificationPhase.ERROR,
4891 share_id=share_mapping.share_id,
4892 exception=e
4893 )
4894 LOG.error(e)
4895 raise
4896 except (exception.ShareNotFound, exception.ShareAccessNotFound):
4897 # Ignore the error if for any reason there is nothing to
4898 # remove from manila, so we can still detach the share.
4899 share_mapping.delete()
4900 compute_utils.notify_about_share_attach_detach(
4901 context,
4902 instance,
4903 instance.host,
4904 action=fields.NotificationAction.SHARE_DETACH,
4905 phase=fields.NotificationPhase.END,
4906 share_id=share_mapping.share_id,
4907 )
4909 _deny_share(context, instance, share_mapping)
4911 @wrap_exception()
4912 def _mount_all_shares(self, context, instance, share_info):
4913 for share_mapping in share_info:
4914 self._mount_share(context, instance, share_mapping)
4916 @wrap_exception()
4917 def _umount_all_shares(self, context, instance, share_info):
4918 for share_mapping in share_info:
4919 self._umount_share(context, instance, share_mapping)
4921 @wrap_exception()
4922 def _mount_share(self, context, instance, share_mapping):
4924 @utils.synchronized(share_mapping.share_id)
4925 def _mount_share(context, instance, share_mapping):
4926 try:
4927 share_mapping.set_access_according_to_protocol()
4929 if share_mapping.share_proto == (
4930 fields.ShareMappingProto.CEPHFS):
4931 share_mapping.enhance_with_ceph_credentials(context)
4933 LOG.debug("Mounting share %s", share_mapping.share_id)
4934 self.driver.mount_share(context, instance, share_mapping)
4936 except (
4937 exception.ShareNotFound,
4938 exception.ShareProtocolNotSupported,
4939 exception.ShareMountError,
4940 ) as e:
4941 self._set_share_mapping_and_instance_in_error(
4942 instance, share_mapping
4943 )
4944 LOG.error(e.format_message())
4945 raise
4946 except (sdk_exc.BadRequestException) as e:
4947 self._set_share_mapping_and_instance_in_error(
4948 instance, share_mapping
4949 )
4950 LOG.error("%s: %s error from url: %s, %s", e.message, e.source,
4951 e.url, e.details)
4952 raise
4954 _mount_share(context, instance, share_mapping)
4956 @wrap_exception()
4957 def _umount_share(self, context, instance, share_mapping):
4959 @utils.synchronized(share_mapping.share_id)
4960 def _umount_share(context, instance, share_mapping):
4961 try:
4962 share_mapping.set_access_according_to_protocol()
4964 if share_mapping.share_proto == (
4965 fields.ShareMappingProto.CEPHFS):
4966 share_mapping.enhance_with_ceph_credentials(context)
4968 self.driver.umount_share(context, instance, share_mapping)
4970 except (
4971 exception.ShareNotFound,
4972 exception.ShareUmountError,
4973 exception.ShareProtocolNotSupported,
4974 ) as e:
4975 self._set_share_mapping_and_instance_in_error(
4976 instance, share_mapping
4977 )
4978 LOG.error(e.format_message())
4979 raise
4981 _umount_share(context, instance, share_mapping)
4983 def _set_share_mapping_status(self, share_mapping, status):
4984 share_mapping.status = status
4985 share_mapping.save()
4987 def _set_share_mapping_and_instance_in_error(
4988 self, instance, share_mapping
4989 ):
4990 share_mapping.status = fields.ShareMappingStatus.ERROR
4991 share_mapping.save()
4992 self._set_instance_obj_error_state(
4993 instance, clean_task_state=True
4994 )
4996 @wrap_instance_fault
4997 def _rotate_backups(self, context, instance, backup_type, rotation):
4998 """Delete excess backups associated to an instance.
5000 Instances are allowed a fixed number of backups (the rotation number);
5001 this method deletes the oldest backups that exceed the rotation
5002 threshold.
5004 :param context: security context
5005 :param instance: Instance dict
5006 :param backup_type: a user-defined type, like "daily" or "weekly" etc.
5007 :param rotation: int representing how many backups to keep around;
5008 None if rotation shouldn't be used (as in the case of snapshots)
5009 """
5010 filters = {'property-image_type': 'backup',
5011 'property-backup_type': backup_type,
5012 'property-instance_uuid': instance.uuid}
5014 images = self.image_api.get_all(context, filters=filters,
5015 sort_key='created_at', sort_dir='desc')
5016 num_images = len(images)
5017 LOG.debug("Found %(num_images)d images (rotation: %(rotation)d)",
5018 {'num_images': num_images, 'rotation': rotation},
5019 instance=instance)
5021 if num_images > rotation:
5022 # NOTE(sirp): this deletes all backups that exceed the rotation
5023 # limit
5024 excess = len(images) - rotation
5025 LOG.debug("Rotating out %d backups", excess,
5026 instance=instance)
5027 for i in range(excess):
5028 image = images.pop()
5029 image_id = image['id']
5030 LOG.debug("Deleting image %s", image_id,
5031 instance=instance)
5032 try:
5033 self.image_api.delete(context, image_id)
5034 except exception.ImageNotFound:
5035 LOG.info("Failed to find image %(image_id)s to "
5036 "delete", {'image_id': image_id},
5037 instance=instance)
5038 except (exception.ImageDeleteConflict, Exception) as exc:
5039 LOG.info("Failed to delete image %(image_id)s during "
5040 "deleting excess backups. "
5041 "Continuing for next image.. %(exc)s",
5042 {'image_id': image_id, 'exc': exc},
5043 instance=instance)
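# --- Illustrative aside (not part of the module) ---
# The rotation above keeps the newest `rotation` backups and removes the
# rest, oldest first; the image query returns results sorted newest-first.
# A self-contained sketch of the selection step using plain dicts; the
# helper name is hypothetical, not the image API.
def _backups_to_delete(images_newest_first, rotation):
    """Return ids of backups that fall outside the rotation window."""
    excess = len(images_newest_first) - rotation
    if excess <= 0:
        return []
    # The oldest entries sit at the tail of a newest-first list.
    return [image['id'] for image in images_newest_first[-excess:]]

_images = [{'id': 'img-3'}, {'id': 'img-2'}, {'id': 'img-1'}]  # newest first
assert _backups_to_delete(_images, rotation=2) == ['img-1']
assert _backups_to_delete(_images, rotation=5) == []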
5045 @wrap_exception()
5046 @reverts_task_state
5047 @wrap_instance_event(prefix='compute')
5048 @wrap_instance_fault
5049 def set_admin_password(self, context, instance, new_pass):
5050 """Set the root/admin password for an instance on this host.
5052 This is generally only called by API password resets after an
5053 image has been built.
5055 :param context: Nova auth context.
5056 :param instance: Nova instance object.
5057 :param new_pass: The admin password for the instance.
5058 """
5060 context = context.elevated()
5061 current_power_state = self._get_power_state(instance)
5062 expected_state = power_state.RUNNING
5064 if current_power_state != expected_state:
5065 instance.task_state = None
5066 instance.save(expected_task_state=task_states.UPDATING_PASSWORD)
5067 _msg = _('instance %s is not running') % instance.uuid
5068 raise exception.InstancePasswordSetFailed(
5069 instance=instance.uuid, reason=_msg)
5071 try:
5072 self.driver.set_admin_password(instance, new_pass)
5073 LOG.info("Admin password set", instance=instance)
5074 instance.task_state = None
5075 instance.save(
5076 expected_task_state=task_states.UPDATING_PASSWORD)
5077 except exception.InstanceAgentNotEnabled:
5078 with excutils.save_and_reraise_exception():
5079 LOG.debug('Guest agent is not enabled for the instance.',
5080 instance=instance)
5081 instance.task_state = None
5082 instance.save(
5083 expected_task_state=task_states.UPDATING_PASSWORD)
5084 except exception.SetAdminPasswdNotSupported:
5085 with excutils.save_and_reraise_exception():
5086 LOG.info('set_admin_password is not supported '
5087 'by this driver or guest instance.',
5088 instance=instance)
5089 instance.task_state = None
5090 instance.save(
5091 expected_task_state=task_states.UPDATING_PASSWORD)
5092 except NotImplementedError:
5093 LOG.warning('set_admin_password is not implemented '
5094 'by this driver or guest instance.',
5095 instance=instance)
5096 instance.task_state = None
5097 instance.save(
5098 expected_task_state=task_states.UPDATING_PASSWORD)
5099 raise NotImplementedError(_('set_admin_password is not '
5100 'implemented by this driver or guest '
5101 'instance.'))
5102 except exception.UnexpectedTaskStateError:
5103 # interrupted by another (most likely delete) task
5104 # do not retry
5105 raise
5106 except Exception:
5107 # Catch all here because this could be anything.
5108 LOG.exception('set_admin_password failed', instance=instance)
5109 # We create a new exception here so that we won't
5110 # potentially reveal password information to the
5111 # API caller. The real exception is logged above
5112 _msg = _('error setting admin password')
5113 raise exception.InstancePasswordSetFailed(
5114 instance=instance.uuid, reason=_msg)
5116 def _get_rescue_image(self, context, instance, rescue_image_ref=None):
5117 """Determine what image should be used to boot the rescue VM."""
5118 # 1. If rescue_image_ref is passed in, use that for rescue.
5119 # 2. Else, use the base image associated with instance's current image.
5120 # The idea here is to provide the customer with a rescue
5121 # environment which they are familiar with.
5122 # So, if they built their instance off of a Debian image,
5123 # their rescue VM will also be Debian.
5124 # 3. As a last resort, use instance's current image.
5125 if not rescue_image_ref:
5126 system_meta = utils.instance_sys_meta(instance)
5127 rescue_image_ref = system_meta.get('image_base_image_ref')
5129 if not rescue_image_ref:
5130 LOG.warning('Unable to find a different image to use for '
5131 'rescue VM, using instance\'s current image',
5132 instance=instance)
5133 rescue_image_ref = instance.image_ref
5135 return objects.ImageMeta.from_image_ref(
5136 context, self.image_api, rescue_image_ref)
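# --- Illustrative aside (not part of the module) ---
# The selection above is a three-step fallback: use the explicit rescue
# image if given, else the instance's stored base image, else the image
# the instance currently runs. A self-contained sketch of that choice
# (hypothetical names, plain strings instead of image refs):
def _pick_rescue_image(rescue_ref, base_ref, current_ref):
    return rescue_ref or base_ref or current_ref

assert _pick_rescue_image('resc-img', 'base-img', 'cur-img') == 'resc-img'
assert _pick_rescue_image(None, 'base-img', 'cur-img') == 'base-img'
assert _pick_rescue_image(None, None, 'cur-img') == 'cur-img'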
5138 @wrap_exception()
5139 @reverts_task_state
5140 @wrap_instance_event(prefix='compute')
5141 @wrap_instance_fault
5142 def rescue_instance(self, context, instance, rescue_password,
5143 rescue_image_ref, clean_shutdown):
5144 context = context.elevated()
5145 LOG.info('Rescuing', instance=instance)
5147 admin_password = (rescue_password if rescue_password else
5148 utils.generate_password())
5150 network_info = self.network_api.get_instance_nw_info(context, instance)
5152 rescue_image_meta = self._get_rescue_image(context, instance,
5153 rescue_image_ref)
5155 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
5156 context, instance.uuid)
5157 block_device_info = self._get_instance_block_device_info(
5158 context, instance, bdms=bdms)
5160 share_info = self._get_share_info(context, instance)
5162 extra_usage_info = {'rescue_image_name':
5163 self._get_image_name(rescue_image_meta)}
5164 self._notify_about_instance_usage(context, instance,
5165 "rescue.start", extra_usage_info=extra_usage_info,
5166 network_info=network_info)
5167 compute_utils.notify_about_instance_rescue_action(
5168 context, instance, self.host, rescue_image_ref,
5169 phase=fields.NotificationPhase.START)
5171 try:
5172 self._power_off_instance(context, instance, clean_shutdown)
5174 self._mount_all_shares(context, instance, share_info)
5176 self.driver.rescue(context, instance, network_info,
5177 rescue_image_meta, admin_password,
5178 block_device_info, share_info)
5179 except Exception as e:
5180 LOG.exception("Error trying to Rescue Instance",
5181 instance=instance)
5182 self._set_instance_obj_error_state(instance)
5183 raise exception.InstanceNotRescuable(
5184 instance_id=instance.uuid,
5185 reason=_("Driver Error: %s") % e)
5187 compute_utils.notify_usage_exists(self.notifier, context, instance,
5188 self.host, current_period=True)
5190 instance.vm_state = vm_states.RESCUED
5191 instance.task_state = None
5192 instance.power_state = self._get_power_state(instance)
5193 instance.launched_at = timeutils.utcnow()
5194 instance.save(expected_task_state=task_states.RESCUING)
5196 self._notify_about_instance_usage(context, instance,
5197 "rescue.end", extra_usage_info=extra_usage_info,
5198 network_info=network_info)
5199 compute_utils.notify_about_instance_rescue_action(
5200 context, instance, self.host, rescue_image_ref,
5201 phase=fields.NotificationPhase.END)
5203 @wrap_exception()
5204 @reverts_task_state
5205 @wrap_instance_event(prefix='compute')
5206 @wrap_instance_fault
5207 def unrescue_instance(self, context, instance):
5208 orig_context = context
5209 context = context.elevated()
5210 LOG.info('Unrescuing', instance=instance)
5212 network_info = self.network_api.get_instance_nw_info(context, instance)
5213 self._notify_about_instance_usage(context, instance,
5214 "unrescue.start", network_info=network_info)
5215 compute_utils.notify_about_instance_action(context, instance,
5216 self.host, action=fields.NotificationAction.UNRESCUE,
5217 phase=fields.NotificationPhase.START)
5219 with self._error_out_instance_on_exception(context, instance):
5220 self.driver.unrescue(orig_context, instance)
5222 instance.vm_state = vm_states.ACTIVE
5223 instance.task_state = None
5224 instance.power_state = self._get_power_state(instance)
5225 instance.save(expected_task_state=task_states.UNRESCUING)
5227 self._notify_about_instance_usage(context,
5228 instance,
5229 "unrescue.end",
5230 network_info=network_info)
5231 compute_utils.notify_about_instance_action(context, instance,
5232 self.host, action=fields.NotificationAction.UNRESCUE,
5233 phase=fields.NotificationPhase.END)
5235 @wrap_exception()
5236 @wrap_instance_event(prefix='compute')
5237 @errors_out_migration
5238 @wrap_instance_fault
5239 def confirm_resize(self, context, instance, migration):
5240 """Confirms a migration/resize and deletes the 'old' instance.
5242 This is called from the API and runs on the source host.
5244 Nothing needs to happen on the destination host at this point since
5245 the instance is already running there. This routine just cleans up the
5246 source host.
5247 """
5248 @utils.synchronized(instance.uuid)
5249 def do_confirm_resize(context, instance, migration):
5250 LOG.debug("Going to confirm migration %s", migration.id,
5251 instance=instance)
5253 if migration.status == 'confirmed':
5254 LOG.info("Migration %s is already confirmed",
5255 migration.id, instance=instance)
5256 return
5258 if migration.status not in ('finished', 'confirming'):
5259 LOG.warning("Unexpected confirmation status '%(status)s' "
5260 "of migration %(id)s, exit confirmation process",
5261 {"status": migration.status, "id": migration.id},
5262 instance=instance)
5263 return
5265 # NOTE(wangpan): Get the instance from db, if it has been
5266 # deleted, we do nothing and return here
5267 expected_attrs = ['metadata', 'system_metadata', 'flavor']
5268 try:
5269 instance = objects.Instance.get_by_uuid(
5270 context, instance.uuid,
5271 expected_attrs=expected_attrs)
5272 except exception.InstanceNotFound:
5273 LOG.info("Instance is not found during confirmation",
5274 instance=instance)
5275 return
5277 with self._error_out_instance_on_exception(context, instance):
5278 try:
5279 self._confirm_resize(
5280 context, instance, migration=migration)
5281 except Exception:
5282 # Something failed when cleaning up the source host so
5283 # log a traceback and leave a hint about hard rebooting
5284 # the server to correct its state in the DB.
5285 with excutils.save_and_reraise_exception(logger=LOG):
5286 LOG.exception(
5287 'Confirm resize failed on source host %s. '
5288 'Resource allocations in the placement service '
5289 'will be removed regardless because the instance '
5290 'is now on the destination host %s. You can try '
5291 'hard rebooting the instance to correct its '
5292 'state.', self.host, migration.dest_compute,
5293 instance=instance)
5294 finally:
5295 # Whether an error occurred or not, at this point the
5296 # instance is on the dest host. Avoid leaking allocations
5297 # in placement by deleting them here...
5298 self._delete_allocation_after_move(
5299 context, instance, migration)
5300 # ...inform the scheduler about the move...
5301 self._delete_scheduler_instance_info(
5302 context, instance.uuid)
5303 # ...and unset the cached flavor information (this is done
5304 # last since the resource tracker relies on it for its
5305 # periodic tasks)
5306 self._delete_stashed_flavor_info(instance)
5308 do_confirm_resize(context, instance, migration)
5310 def _get_updated_nw_info_with_pci_mapping(self, nw_info, pci_mapping):
5311 # NOTE(adrianc): This method returns a copy of nw_info if modifications
5312 # are made; otherwise it returns the original nw_info.
5313 updated_nw_info = nw_info
5314 if nw_info and pci_mapping:
5315 updated_nw_info = copy.deepcopy(nw_info)
5316 for vif in updated_nw_info:
5317 if vif['vnic_type'] in network_model.VNIC_TYPES_SRIOV:
5318 try:
5319 vif_pci_addr = vif['profile']['pci_slot']
5320 new_addr = pci_mapping[vif_pci_addr].address
5321 vif['profile']['pci_slot'] = new_addr
5322 LOG.debug("Updating VIF's PCI address for VIF %(id)s. "
5323 "Original value %(orig_val)s, "
5324 "new value %(new_val)s",
5325 {'id': vif['id'],
5326 'orig_val': vif_pci_addr,
5327 'new_val': new_addr})
5328 except (KeyError, AttributeError):
5329 with excutils.save_and_reraise_exception():
5330 # NOTE(adrianc): This should never happen. If we
5331 # get here it means there is some inconsistency
5332 # with either 'nw_info' or 'pci_mapping'.
5333 LOG.error("Unexpected error when updating network "
5334 "information with PCI mapping.")
5335 return updated_nw_info
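# --- Illustrative aside (not part of the module) ---
# The method above rewrites each SR-IOV VIF's pci_slot from the old device
# address to the address claimed after the move, leaving the input intact.
# A self-contained sketch with plain dicts; real VIFs are network_model.VIF
# objects and the mapping values are PciDevice objects, so this is
# illustrative only.
import copy

def _remap_pci_slots(vifs, pci_mapping, sriov_types=('direct',)):
    """Return a copy of vifs with profile['pci_slot'] rewritten."""
    updated = copy.deepcopy(vifs)
    for vif in updated:
        if vif.get('vnic_type') in sriov_types:
            old_addr = vif['profile']['pci_slot']
            vif['profile']['pci_slot'] = pci_mapping[old_addr]
    return updated

_vifs = [{'id': 'vif-1', 'vnic_type': 'direct',
          'profile': {'pci_slot': '0000:04:10.1'}}]
_remapped = _remap_pci_slots(_vifs, {'0000:04:10.1': '0000:81:00.2'})
assert _remapped[0]['profile']['pci_slot'] == '0000:81:00.2'
assert _vifs[0]['profile']['pci_slot'] == '0000:04:10.1'  # input untouched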
5337 def _confirm_resize(self, context, instance, migration=None):
5338 """Destroys the source instance."""
5339 self._notify_about_instance_usage(context, instance,
5340 "resize.confirm.start")
5341 compute_utils.notify_about_instance_action(context, instance,
5342 self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
5343 phase=fields.NotificationPhase.START)
5345 # NOTE(tr3buchet): tear down networks on source host
5346 self.network_api.setup_networks_on_host(context, instance,
5347 migration.source_compute, teardown=True)
5349 # TODO(stephenfin): These next three calls should be bundled
5350 network_info = self.network_api.get_instance_nw_info(context,
5351 instance)
5353 # NOTE(adrianc): Populate old PCI device in VIF profile
5354 # to allow virt driver to properly unplug it from Hypervisor.
5355 pci_mapping = (instance.migration_context.
5356 get_pci_mapping_for_migration(True))
5357 network_info = self._get_updated_nw_info_with_pci_mapping(
5358 network_info, pci_mapping)
5360 self.driver.confirm_migration(context, migration, instance,
5361 network_info)
5363 # Free up the old_flavor usage from the resource tracker for this host.
5364 self.rt.drop_move_claim_at_source(context, instance, migration)
5366 # NOTE(mriedem): The old_vm_state could be STOPPED but the user
5367 # might have manually powered up the instance to confirm the
5368 # resize/migrate, so we need to check the current power state
5369 # on the instance and set the vm_state appropriately. We default
5370 # to ACTIVE because if the power state is not SHUTDOWN, we
5371 # assume _sync_instance_power_state will clean it up.
5372 p_state = instance.power_state
5373 vm_state = None
5374 if p_state == power_state.SHUTDOWN:
5375 vm_state = vm_states.STOPPED
5376 LOG.debug("Resized/migrated instance is powered off. "
5377 "Setting vm_state to '%s'.", vm_state,
5378 instance=instance)
5379 else:
5380 vm_state = vm_states.ACTIVE
5382 instance.vm_state = vm_state
5383 instance.task_state = None
5384 instance.save(expected_task_state=[None, task_states.DELETING,
5385 task_states.SOFT_DELETING])
5387 self._notify_about_instance_usage(
5388 context, instance, "resize.confirm.end",
5389 network_info=network_info)
5390 compute_utils.notify_about_instance_action(context, instance,
5391 self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
5392 phase=fields.NotificationPhase.END)
5394 def _delete_allocation_after_move(self, context, instance, migration):
5395 """Deletes resource allocations held by the migration record against
5396 the source compute node resource provider after a confirmed cold /
5397 successful live migration.
5398 """
5399 try:
5400 # NOTE(danms): We're finishing on the source node, so try
5401 # to delete the allocation based on the migration uuid
5402 self.reportclient.delete_allocation_for_instance(
5403 context, migration.uuid, consumer_type='migration',
5404 force=False)
5405 except exception.AllocationDeleteFailed:
5406 LOG.error('Deleting allocation in placement for migration '
5407 '%(migration_uuid)s failed. The instance '
5408 '%(instance_uuid)s will be put to ERROR state '
5409 'but the allocation held by the migration is '
5410 'leaked.',
5411 {'instance_uuid': instance.uuid,
5412 'migration_uuid': migration.uuid})
5413 raise
5415 def _delete_stashed_flavor_info(self, instance):
5416 """Remove information about the flavor change after a resize."""
5417 instance.old_flavor = None
5418 instance.new_flavor = None
5419 instance.system_metadata.pop('old_vm_state', None)
5420 instance.save()
5422 @wrap_exception()
5423 @wrap_instance_event(prefix='compute')
5424 @errors_out_migration
5425 @wrap_instance_fault
5426 def confirm_snapshot_based_resize_at_source(
5427 self, ctxt, instance, migration):
5428 """Confirms a snapshot-based resize on the source host.
5430 Cleans the guest from the source hypervisor including disks and drops
5431 the MoveClaim which will free up "old_flavor" usage from the
5432 ResourceTracker.
5434 Deletes the allocations held by the migration consumer against the
5435 source compute node resource provider.
5437 :param ctxt: nova auth request context targeted at the source cell
5438 :param instance: Instance object being resized which should have the
5439 "old_flavor" attribute set
5440 :param migration: Migration object for the resize operation
5441 """
5443 @utils.synchronized(instance.uuid)
5444 def do_confirm():
5445 LOG.info('Confirming resize on source host.', instance=instance)
5446 with self._error_out_instance_on_exception(ctxt, instance):
5447 # TODO(mriedem): Could probably make this try/except/finally
5448 # a context manager to share with confirm_resize().
5449 try:
5450 self._confirm_snapshot_based_resize_at_source(
5451 ctxt, instance, migration)
5452 except Exception:
5453 # Something failed when cleaning up the source host so
5454 # log a traceback and leave a hint about hard rebooting
5455 # the server to correct its state in the DB.
5456 with excutils.save_and_reraise_exception(logger=LOG):
5457 LOG.exception(
5458 'Confirm resize failed on source host %s. '
5459 'Resource allocations in the placement service '
5460 'will be removed regardless because the instance '
5461 'is now on the destination host %s. You can try '
5462 'hard rebooting the instance to correct its '
5463 'state.', self.host, migration.dest_compute,
5464 instance=instance)
5465 finally:
5466 # Whether an error occurred or not, at this point the
5467 # instance is on the dest host so to avoid leaking
5468 # allocations in placement, delete them here.
5469 # TODO(mriedem): Should we catch and just log
5470 # AllocationDeleteFailed? What is the user's recourse if
5471 # we got this far but this fails? At this point the
5472 # instance is on the target host and the allocations
5473 # could just be manually cleaned up by the operator.
5474 self._delete_allocation_after_move(ctxt, instance,
5475 migration)
5476 do_confirm()
5478 def _confirm_snapshot_based_resize_at_source(
5479 self, ctxt, instance, migration):
5480 """Private version of confirm_snapshot_based_resize_at_source
5482 This allows the main method to be decorated with error handlers.
5484 :param ctxt: nova auth request context targeted at the source cell
5485 :param instance: Instance object being resized which should have the
5486 "old_flavor" attribute set
5487 :param migration: Migration object for the resize operation
5488 """
5489 # Cleanup the guest from the hypervisor including local disks.
5490 network_info = self.network_api.get_instance_nw_info(ctxt, instance)
5491 LOG.debug('Cleaning up guest from source hypervisor including disks.',
5492 instance=instance)
5494 # FIXME(mriedem): Per bug 1809095, _confirm_resize calls
5495 # _get_updated_nw_info_with_pci_mapping here prior to unplugging
5496 # VIFs on the source, but in our case we have already unplugged
5497 # VIFs during prep_snapshot_based_resize_at_source, so what do we
5498 # need to do about those kinds of ports? Do we need to wait to unplug
5499 # VIFs until confirm like normal resize?
5501 # Note that prep_snapshot_based_resize_at_source already destroyed the
5502 # guest which disconnected volumes and unplugged VIFs but did not
5503 # destroy disks in case something failed during the resize and the
5504 # instance needed to be rebooted or rebuilt on the source host. Now
5505 # that we are confirming the resize we want to cleanup the disks left
5506 # on the source host. We call cleanup() instead of destroy() to avoid
5507 # any InstanceNotFound confusion from the driver since the guest was
5508 # already destroyed on this host. block_device_info=None and
5509 # destroy_vifs=False means cleanup() will not try to disconnect volumes
5510 # or unplug VIFs.
5511 self.driver.cleanup(
5512 ctxt, instance, network_info, block_device_info=None,
5513 destroy_disks=True, destroy_vifs=False)
5515 # Delete port bindings for the source host.
5516 self._confirm_snapshot_based_resize_delete_port_bindings(
5517 ctxt, instance)
5519 # Delete volume attachments for the source host.
5520 self._delete_volume_attachments(ctxt, instance.get_bdms())
5522 # Free up the old_flavor usage from the resource tracker for this host.
5523 self.rt.drop_move_claim_at_source(ctxt, instance, migration)
5525 def _confirm_snapshot_based_resize_delete_port_bindings(
5526 self, ctxt, instance):
5527 """Delete port bindings for the source host when confirming
5528 snapshot-based resize on the source host."
5530 :param ctxt: nova auth RequestContext
5531 :param instance: Instance object that was resized/cold migrated
5532 """
5533 LOG.debug('Deleting port bindings for source host.',
5534 instance=instance)
5535 try:
5536 self.network_api.cleanup_instance_network_on_host(
5537 ctxt, instance, self.host)
5538 except exception.PortBindingDeletionFailed as e:
5539 # Do not let this stop us from cleaning up since the guest
5540 # is already gone.
5541 LOG.error('Failed to delete port bindings from source host. '
5542 'Error: %s', str(e), instance=instance)
5544 def _delete_volume_attachments(self, ctxt, bdms):
5545 """Deletes volume attachment records for the given bdms.
5547 This method will log but not re-raise any exceptions if the volume
5548 attachment delete fails.
5550 :param ctxt: nova auth request context used to make
5551 DELETE /attachments/{attachment_id} requests to cinder.
5552 :param bdms: objects.BlockDeviceMappingList representing volume
5553 attachments to delete based on BlockDeviceMapping.attachment_id.
5554 """
5555 for bdm in bdms:
5556 if bdm.attachment_id:
5557 try:
5558 self.volume_api.attachment_delete(ctxt, bdm.attachment_id)
5559 except Exception as e:
5560 LOG.error('Failed to delete volume attachment with ID %s. '
5561 'Error: %s', bdm.attachment_id, str(e),
5562 instance_uuid=bdm.instance_uuid)
5564 def _update_bdm_for_swap_to_finish_resize(
5565 self, context, instance, confirm=True):
5566 """This updates bdm.swap with new swap info"""
5568 bdms = instance.get_bdms()
5569 if not (instance.old_flavor and instance.new_flavor):
5570 return bdms
5572 if instance.old_flavor.swap == instance.new_flavor.swap:
5573 return bdms
5575 old_swap = instance.old_flavor.swap
5576 new_swap = instance.new_flavor.swap
5577 if not confirm:
5578 # revert flavor on _finish_revert_resize
5579 old_swap = instance.new_flavor.swap
5580 new_swap = instance.old_flavor.swap
5582 # add swap
5583 if old_swap == 0 and new_swap:
5584 # (auniyal): old_swap = 0 means we did not have a swap bdm
5585 # for this instance, and as there is a new_swap, this is a
5586 # swap addition.
5587 new_swap_bdm = block_device.create_blank_bdm(new_swap, 'swap')
5588 bdm_obj = objects.BlockDeviceMapping(
5589 context, instance_uuid=instance.uuid, **new_swap_bdm)
5590 bdm_obj.update_or_create()
5591 return instance.get_bdms()
5593 # update swap
5594 for bdm in bdms:
5595 if bdm.guest_format == 'swap' and bdm.device_type == 'disk':
5596 if new_swap > 0:
5597 LOG.info('Updating swap BDM.', instance=instance)
5598 bdm.volume_size = new_swap
5599 bdm.save()
5600 break
5601 elif new_swap == 0:
5602 LOG.info('Deleting swap BDM.', instance=instance)
5603 bdm.destroy()
5604 bdms.objects.remove(bdm)
5605 break
5606 return bdms
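# --- Illustrative aside (not part of the module) ---
# The method above reduces to a three-way decision on the old and new
# flavor swap sizes: add a swap BDM, resize the existing one, or drop it
# (and do nothing when the sizes match). A self-contained sketch of just
# that decision; the helper name is hypothetical.
def _swap_bdm_action(old_swap_mb, new_swap_mb):
    if old_swap_mb == new_swap_mb:
        return 'noop'
    if old_swap_mb == 0 and new_swap_mb > 0:
        return 'add'
    if new_swap_mb == 0:
        return 'delete'
    return 'resize'

assert _swap_bdm_action(0, 1024) == 'add'
assert _swap_bdm_action(1024, 2048) == 'resize'
assert _swap_bdm_action(1024, 0) == 'delete'
assert _swap_bdm_action(512, 512) == 'noop'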
5608 @wrap_exception()
5609 @reverts_task_state
5610 @wrap_instance_event(prefix='compute')
5611 @errors_out_migration
5612 @wrap_instance_fault
5613 def revert_snapshot_based_resize_at_dest(self, ctxt, instance, migration):
5614 """Reverts a snapshot-based resize at the destination host.
5616 Cleans the guest from the destination compute service host hypervisor
5617 and related resources (ports, volumes) and frees resource usage from
5618 the compute service on that host.
5620 :param ctxt: nova auth request context targeted at the target cell
5621 :param instance: Instance object whose vm_state is "resized" and
5622 task_state is "resize_reverting".
5623 :param migration: Migration object whose status is "reverting".
5624 """
5625 # A resize revert is essentially a resize back to the old size, so we
5626 # need to send a usage event here.
5627 compute_utils.notify_usage_exists(
5628 self.notifier, ctxt, instance, self.host, current_period=True)
5630 @utils.synchronized(instance.uuid)
5631 def do_revert():
5632 LOG.info('Reverting resize on destination host.',
5633 instance=instance)
5634 with self._error_out_instance_on_exception(ctxt, instance):
5635 self._revert_snapshot_based_resize_at_dest(
5636 ctxt, instance, migration)
5637 do_revert()
5639 # Broadcast to all schedulers that the instance is no longer on
5640 # this host and clear any waiting callback events. This is best effort
5641 # so if anything fails just log it.
5642 try:
5643 self._delete_scheduler_instance_info(ctxt, instance.uuid)
5644 self.instance_events.clear_events_for_instance(instance)
5645 except Exception as e:
5646 LOG.warning('revert_snapshot_based_resize_at_dest failed during '
5647 'post-processing. Error: %s', e, instance=instance)
5649 def _revert_snapshot_based_resize_at_dest(
5650 self, ctxt, instance, migration):
5651 """Private version of revert_snapshot_based_resize_at_dest.
5653 This allows the main method to be decorated with error handlers.
5655 :param ctxt: nova auth request context targeted at the target cell
5656 :param instance: Instance object whose vm_state is "resized" and
5657 task_state is "resize_reverting".
5658 :param migration: Migration object whose status is "reverting".
5659 """
5660 # Cleanup the guest from the hypervisor including local disks.
5661 network_info = self.network_api.get_instance_nw_info(ctxt, instance)
5662 bdms = instance.get_bdms()
5663 block_device_info = self._get_instance_block_device_info(
5664 ctxt, instance, bdms=bdms)
5665 LOG.debug('Destroying guest from destination hypervisor including '
5666 'disks.', instance=instance)
5667 self.driver.destroy(
5668 ctxt, instance, network_info, block_device_info=block_device_info)
5670 # Activate source host port bindings. We need to do this before
5671 # deleting the (active) dest host port bindings in
5672 # setup_networks_on_host otherwise the ports will be unbound and
5673 # finish on the source will fail.
5674 # migrate_instance_start uses migration.dest_compute for the port
5675 # binding host and since we want to activate the source host port
5676 # bindings, we need to temporarily mutate the migration object.
5677 with utils.temporary_mutation(
5678 migration, dest_compute=migration.source_compute):
5679 LOG.debug('Activating port bindings for source host %s.',
5680 migration.source_compute, instance=instance)
5681 # TODO(mriedem): https://review.opendev.org/#/c/594139/ would allow
5682 # us to remove this and make setup_networks_on_host do it.
5683 # TODO(mriedem): Should we try/except/log any errors but continue?
5684 self.network_api.migrate_instance_start(
5685 ctxt, instance, migration)
5687 # Delete port bindings for the target host.
5688 LOG.debug('Deleting port bindings for target host %s.',
5689 self.host, instance=instance)
5690 try:
5691 # Note that deleting the destination host port bindings does
5692 # not automatically activate the source host port bindings.
5693 self.network_api.cleanup_instance_network_on_host(
5694 ctxt, instance, self.host)
5695 except exception.PortBindingDeletionFailed as e:
5696 # Do not let this stop us from cleaning up since the guest
5697 # is already gone.
5698 LOG.error('Failed to delete port bindings from target host. '
5699 'Error: %s', str(e), instance=instance)
5701 # Delete any volume attachments remaining for this target host.
5702 LOG.debug('Deleting volume attachments for target host.',
5703 instance=instance)
5704 self._delete_volume_attachments(ctxt, bdms)
5706 # Free up the new_flavor usage from the resource tracker for this host.
5707 self.rt.drop_move_claim_at_dest(ctxt, instance, migration)
5709 def _revert_instance_flavor_host_node(self, instance, migration):
5710 """Revert host, node and flavor fields after a resize-revert."""
5711 self._set_instance_info(instance, instance.old_flavor)
5712 instance.host = migration.source_compute
5713 # NOTE(danms): This could fail if we somehow ended up on the wrong
5714 # host without the desired node. That's a big problem, so let the
5715 # ComputeHostNotFound raise from here.
5716 cn = self.rt.get_node_by_name(migration.source_node)
5717 instance.node = cn.hypervisor_hostname
5718 instance.compute_id = cn.id
5719 instance.save(expected_task_state=[task_states.RESIZE_REVERTING])
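# Illustrative sketch of the expected_task_state guard used by
# instance.save() above: the update only applies if the task_state still
# matches what the caller expects, otherwise an error is raised. This is a
# simplified stand-in, not objects.Instance.save(); the RuntimeError here
# stands in for the specific exception the real object layer raises.
class _GuardedInstance:
    def __init__(self, task_state):
        self.task_state = task_state

    def save(self, expected_task_state=None):
        if (expected_task_state is not None and
                self.task_state not in expected_task_state):
            raise RuntimeError('unexpected task state: %s' % self.task_state)
        # a real save would persist the changed fields here

_inst = _GuardedInstance(task_state='resize_reverting')
_inst.save(expected_task_state=['resize_reverting'])  # passes the guard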
5721 @wrap_exception()
5722 @reverts_task_state
5723 @wrap_instance_event(prefix='compute')
5724 @errors_out_migration
5725 @wrap_instance_fault
5726 def finish_revert_snapshot_based_resize_at_source(
5727 self, ctxt, instance, migration):
5728 """Reverts a snapshot-based resize at the source host.
5730 Spawn the guest and re-connect volumes/VIFs on the source host and
5731 revert the instance to use the old_flavor for resource usage reporting.
5733 Updates allocations in the placement service to move the source node
5734 allocations, held by the migration record, to the instance and drop
5735 the allocations held by the instance on the destination node.
5737 :param ctxt: nova auth request context targeted at the target cell
5738 :param instance: Instance object whose vm_state is "resized" and
5739 task_state is "resize_reverting".
5740 :param migration: Migration object whose status is "reverting".
5741 """
5743 @utils.synchronized(instance.uuid)
5744 def do_revert():
5745 LOG.info('Reverting resize on source host.', instance=instance)
5746 with self._error_out_instance_on_exception(ctxt, instance):
5747 self._finish_revert_snapshot_based_resize_at_source(
5748 ctxt, instance, migration)
5750 try:
5751 do_revert()
5752 finally:
5753 self._delete_stashed_flavor_info(instance)
5755 # Broadcast to all schedulers that the instance is on this host.
5756 # This is best effort so if anything fails just log it.
5757 try:
5758 self._update_scheduler_instance_info(ctxt, instance)
5759 except Exception as e:
5760 LOG.warning('finish_revert_snapshot_based_resize_at_source failed '
5761 'during post-processing. Error: %s', e,
5762 instance=instance)
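# Illustrative sketch of the per-instance lock pattern used above
# (@utils.synchronized(instance.uuid) around an inner do_revert()): all
# operations synchronizing on the same instance UUID are serialized on this
# host. Simplified stand-in using a plain threading lock registry, not
# nova.utils.synchronized / oslo.concurrency.
import collections
import functools
import threading

_LOCKS = collections.defaultdict(threading.Lock)

def _synchronized(name):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            with _LOCKS[name]:
                return func(*args, **kwargs)
        return wrapper
    return decorator

def _revert_for(instance_uuid):
    @_synchronized(instance_uuid)
    def do_revert():
        return 'reverted %s' % instance_uuid
    return do_revert()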
5764 def _finish_revert_snapshot_based_resize_at_source(
5765 self, ctxt, instance, migration):
5766 """Private version of finish_revert_snapshot_based_resize_at_source.
5768 This allows the main method to be decorated with error handlers.
5770 :param ctxt: nova auth request context targeted at the source cell
5771 :param instance: Instance object whose vm_state is "resized" and
5772 task_state is "resize_reverting".
5773 :param migration: Migration object whose status is "reverting".
5774 """
5775 # Get stashed old_vm_state information to determine if guest should
5776 # be powered on after spawn; we default to ACTIVE for backwards
5777 # compatibility if old_vm_state is not set
5778 old_vm_state = instance.system_metadata.get(
5779 'old_vm_state', vm_states.ACTIVE)
5781 # Revert the flavor and host/node fields to their previous values
5782 self._revert_instance_flavor_host_node(instance, migration)
5784 # Move the allocations against the source compute node resource
5785 # provider, held by the migration, to the instance which will drop
5786 # the destination compute node resource provider allocations held by
5787 # the instance. This puts the allocations against the source node
5788 # back to the old_flavor and owned by the instance.
5789 try:
5790 self._revert_allocation(ctxt, instance, migration)
5791 except exception.AllocationMoveFailed:
5792 # Log the error but do not re-raise because we want to continue to
5793 # process ports and volumes below.
5794 LOG.error('Reverting allocation in placement for migration '
5795 '%(migration_uuid)s failed. You may need to manually '
5796 'remove the allocations for the migration consumer '
5797 'against the source node resource provider '
5798 '%(source_provider)s and the allocations for the '
5799 'instance consumer against the destination node '
5800 'resource provider %(dest_provider)s and then run the '
5801 '"nova-manage placement heal_allocations" command.',
5802 {'instance_uuid': instance.uuid,
5803 'migration_uuid': migration.uuid,
5804 'source_provider': migration.source_node,
5805 'dest_provider': migration.dest_node},
5806 instance=instance)
5808 bdms = instance.get_bdms()
5809 # prep_snapshot_based_resize_at_source created empty volume attachments
5810 # that we need to update here to get the connection_info before calling
5811 # driver.finish_revert_migration which will connect the volumes to this
5812 # host.
5813 LOG.debug('Updating volume attachments for target host %s.',
5814 self.host, instance=instance)
5815 # TODO(mriedem): We should probably make _update_volume_attachments
5816 # (optionally) graceful to errors so we (1) try to process all
5817 # attachments and (2) continue to process networking below.
5818 self._update_volume_attachments(ctxt, instance, bdms)
5820 LOG.debug('Updating port bindings for source host %s.',
5821 self.host, instance=instance)
5822 # TODO(mriedem): Calculate provider mappings when we support
5823 # cross-cell resize/migrate with ports having resource requests.
5824 # NOTE(hanrong): we need to temporarily change migration.dest_compute
5825 # to the source host.
5826 # "network_api.migrate_instance_finish" sets up the network
5827 # for the instance on the destination host. For a revert resize
5828 # the instance goes back to the source host, so the network
5829 # setup for the instance should happen on the source host. So
5830 # set migration.dest_compute to the source host here.
5831 with utils.temporary_mutation(
5832 migration, dest_compute=migration.source_compute
5833 ):
5834 self.network_api.migrate_instance_finish(
5835 ctxt, instance, migration, provider_mappings=None)
5836 network_info = self.network_api.get_instance_nw_info(ctxt, instance)
5838 # Remember that prep_snapshot_based_resize_at_source destroyed the
5839 # guest but left the disks intact, so we cannot call spawn() here;
5840 # finish_revert_migration should do the job.
5841 block_device_info = self._get_instance_block_device_info(
5842 ctxt, instance, bdms=bdms)
5843 power_on = old_vm_state == vm_states.ACTIVE
5844 driver_error = None
5845 try:
5846 self.driver.finish_revert_migration(
5847 ctxt, instance, network_info, migration,
5848 block_device_info=block_device_info, power_on=power_on)
5849 except Exception as e:
5850 driver_error = e
5851 # Leave a hint about hard rebooting the guest and reraise so the
5852 # instance is put into ERROR state.
5853 with excutils.save_and_reraise_exception(logger=LOG):
5854 LOG.error('An error occurred during finish_revert_migration. '
5855 'The instance may need to be hard rebooted. Error: '
5856 '%s', driver_error, instance=instance)
5857 else:
5858 # Perform final cleanup of the instance in the database.
5859 instance.drop_migration_context()
5860 # If the original vm_state was STOPPED, set it back to STOPPED.
5861 vm_state = vm_states.ACTIVE if power_on else vm_states.STOPPED
5862 self._update_instance_after_spawn(instance, vm_state=vm_state)
5863 instance.save(expected_task_state=[task_states.RESIZE_REVERTING])
5864 finally:
5865 # Complete any volume attachments so the volumes are in-use. We
5866 # do this regardless of finish_revert_migration failing because
5867 # the instance is back on this host now and we do not want to leave
5868 # the volumes in a pending state in case the instance is hard
5869 # rebooted.
5870 LOG.debug('Completing volume attachments for instance on source '
5871 'host.', instance=instance)
5872 with excutils.save_and_reraise_exception(
5873 reraise=driver_error is not None, logger=LOG):
5874 self._complete_volume_attachments(ctxt, bdms)
5876 migration.status = 'reverted'
5877 migration.save()
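# Illustrative sketch of the stashed-old_vm_state logic used above: the
# original vm_state is stashed in system_metadata when the resize starts
# and read back on revert to decide whether to power the guest on and
# which vm_state to restore. The string literals stand in for the
# vm_states constants; this is a sketch, not the module's code.
def _target_state_after_revert(system_metadata):
    old_vm_state = system_metadata.get('old_vm_state', 'active')
    power_on = old_vm_state == 'active'
    vm_state = 'active' if power_on else 'stopped'
    return power_on, vm_state

assert _target_state_after_revert({}) == (True, 'active')
assert _target_state_after_revert({'old_vm_state': 'stopped'}) == (False, 'stopped')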
5879 @wrap_exception()
5880 @reverts_task_state
5881 @wrap_instance_event(prefix='compute')
5882 @errors_out_migration
5883 @wrap_instance_fault
5884 def revert_resize(self, context, instance, migration, request_spec):
5885 """Destroys the new instance on the destination machine.
5887 Reverts the model changes, and powers on the old instance on the
5888 source machine.
5890 """
5891 # NOTE(comstud): A revert_resize is essentially a resize back to
5892 # the old size, so we need to send a usage event here.
5893 compute_utils.notify_usage_exists(self.notifier, context, instance,
5894 self.host, current_period=True)
5896 with self._error_out_instance_on_exception(context, instance):
5897 # NOTE(tr3buchet): tear down networks on destination host
5898 self.network_api.setup_networks_on_host(context, instance,
5899 teardown=True)
5901 self.network_api.migrate_instance_start(context,
5902 instance,
5903 migration)
5905 network_info = self.network_api.get_instance_nw_info(context,
5906 instance)
5907 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
5908 context, instance.uuid)
5909 block_device_info = self._get_instance_block_device_info(
5910 context, instance, bdms=bdms)
5912 destroy_disks = not self._is_instance_storage_shared(
5913 context, instance, host=migration.source_compute)
5914 self.driver.destroy(context, instance, network_info,
5915 block_device_info, destroy_disks)
5917 self._terminate_volume_connections(context, instance, bdms)
5919 # Free up the new_flavor usage from the resource tracker for this
5920 # host.
5921 self.rt.drop_move_claim_at_dest(context, instance, migration)
5923 # RPC cast back to the source host to finish the revert there.
5924 self.compute_rpcapi.finish_revert_resize(context, instance,
5925 migration, migration.source_compute, request_spec)
5927 @wrap_exception()
5928 @reverts_task_state
5929 @wrap_instance_event(prefix='compute')
5930 @errors_out_migration
5931 @wrap_instance_fault
5932 def finish_revert_resize(self, context, instance, migration, request_spec):
5933 """Finishes the second half of reverting a resize on the source host.
5935 Bring the original source instance state back (active/shutoff) and
5936 revert the resized attributes in the database.
5938 """
5939 try:
5940 self._finish_revert_resize(
5941 context, instance, migration, request_spec)
5942 finally:
5943 self._delete_stashed_flavor_info(instance)
5945 def _finish_revert_resize(
5946 self, context, instance, migration, request_spec=None,
5947 ):
5948 """Inner version of finish_revert_resize."""
5949 with self._error_out_instance_on_exception(context, instance):
5950 bdms = self._update_bdm_for_swap_to_finish_resize(
5951 context, instance, confirm=False)
5953 self._notify_about_instance_usage(
5954 context, instance, "resize.revert.start")
5955 compute_utils.notify_about_instance_action(context, instance,
5956 self.host, action=fields.NotificationAction.RESIZE_REVERT,
5957 phase=fields.NotificationPhase.START, bdms=bdms)
5959 # Get stashed old_vm_state information to determine if guest should
5960 # be powered on after spawn; we default to ACTIVE for backwards
5961 # compatibility if old_vm_state is not set
5962 old_vm_state = instance.system_metadata.get(
5963 'old_vm_state', vm_states.ACTIVE)
5965 # Revert the flavor and host/node fields to their previous values
5966 self._revert_instance_flavor_host_node(instance, migration)
5968 try:
5969 source_allocations = self._revert_allocation(
5970 context, instance, migration)
5971 except exception.AllocationMoveFailed:
5972 LOG.error('Reverting allocation in placement for migration '
5973 '%(migration_uuid)s failed. The instance '
5974 '%(instance_uuid)s will be put into ERROR state but '
5975 'the allocation held by the migration is leaked.',
5976 {'instance_uuid': instance.uuid,
5977 'migration_uuid': migration.uuid})
5978 raise
5980 provider_mappings = self._fill_provider_mapping_based_on_allocs(
5981 context, source_allocations, request_spec)
5983 self.network_api.setup_networks_on_host(context, instance,
5984 migration.source_compute)
5985 # NOTE(hanrong): we need to change migration.dest_compute to the
5986 # source host temporarily. "network_api.migrate_instance_finish"
5987 # sets up the network for the instance on the destination host.
5988 # For a revert resize the instance goes back to the source host,
5989 # so the network setup for the instance should happen on the
5990 # source host. So set migration.dest_compute to the source host here.
5991 with utils.temporary_mutation(
5992 migration, dest_compute=migration.source_compute):
5993 self.network_api.migrate_instance_finish(context,
5994 instance,
5995 migration,
5996 provider_mappings)
5997 network_info = self.network_api.get_instance_nw_info(context,
5998 instance)
6000 # revert_resize deleted any volume attachments for the instance
6001 # and created new ones to be used on this host, but we
6002 # have to update those attachments with the host connector so the
6003 # BDM.connection_info will get set in the call to
6004 # _get_instance_block_device_info below with refresh_conn_info=True
6005 # and then the volumes can be re-connected via the driver on this
6006 # host.
6007 self._update_volume_attachments(context, instance, bdms)
6009 block_device_info = self._get_instance_block_device_info(
6010 context, instance, refresh_conn_info=True, bdms=bdms)
6012 power_on = old_vm_state != vm_states.STOPPED
6013 self.driver.finish_revert_migration(
6014 context, instance, network_info, migration, block_device_info,
6015 power_on)
6017 instance.drop_migration_context()
6018 instance.launched_at = timeutils.utcnow()
6019 instance.save(expected_task_state=task_states.RESIZE_REVERTING)
6021 # Complete any volume attachments so the volumes are in-use.
6022 self._complete_volume_attachments(context, bdms)
6024 # if the original vm state was STOPPED, set it back to STOPPED
6025 LOG.info("Updating instance to original state: '%s'",
6026 old_vm_state, instance=instance)
6027 if power_on:  # coverage: line 6027 didn't jump to line 6032 because the condition on line 6027 was always true
6028 instance.vm_state = vm_states.ACTIVE
6029 instance.task_state = None
6030 instance.save()
6031 else:
6032 instance.task_state = task_states.POWERING_OFF
6033 instance.save()
6034 self.stop_instance(context, instance=instance,
6035 clean_shutdown=True)
6037 self._notify_about_instance_usage(
6038 context, instance, "resize.revert.end")
6039 compute_utils.notify_about_instance_action(context, instance,
6040 self.host, action=fields.NotificationAction.RESIZE_REVERT,
6041 phase=fields.NotificationPhase.END, bdms=bdms)
6043 def _fill_provider_mapping_based_on_allocs(
6044 self, context, allocations, request_spec):
6045 """Fills and returns the request group - resource provider mapping
6046 based on the allocations passed in.
6048 :param context: The security context
6049 :param allocations: allocations dict keyed by RP UUID.
6050 :param request_spec: The RequestSpec object associated with the
6051 operation
6052 :returns: None if the request_spec is None. Otherwise a mapping
6053 between RequestGroup requester_id, currently Neutron port_id,
6054 and a list of resource provider UUIDs providing resource for
6055 that RequestGroup.
6056 """
6057 if request_spec:
6058 # NOTE(gibi): We need to re-calculate the resource provider -
6059 # port mapping as we have to have the neutron ports allocate
6060 # from the source compute after revert.
6061 scheduler_utils.fill_provider_mapping_based_on_allocation(
6062 context, self.reportclient, request_spec, allocations)
6063 provider_mappings = self._get_request_group_mapping(
6064 request_spec)
6065 else:
6066 # TODO(sbauza): Remove this conditional once we only support RPC
6067 # API 6.0
6068 # NOTE(gibi): The compute RPC is pinned to be older than 5.2
6069 # and therefore request_spec is not sent. We cannot calculate
6070 # the provider mappings. If the instance has ports with
6071 # resource request then the port update will fail in
6072 # _update_port_binding_for_instance() called via
6073 # migrate_instance_finish() in finish_revert_resize.
6074 provider_mappings = None
6075 return provider_mappings
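# Illustrative sketch of the return value described in the docstring above:
# a mapping from RequestGroup requester_id (currently a Neutron port UUID)
# to the list of resource provider UUIDs satisfying that group's request.
# The UUIDs below are made up for illustration only.
provider_mappings_example = {
    # requester_id (port UUID)            -> providers serving that group
    'c9a6f8e1-0000-0000-0000-000000000001': [
        'b1f7d0aa-0000-0000-0000-0000000000aa',  # e.g. a NIC RP with bandwidth
    ],
}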
6077 def _revert_allocation(self, context, instance, migration):
6078 """Revert an allocation that is held by migration to our instance."""
6080 # Fetch the original allocations that the instance had on the
6081 # source node, which are now held by the migration
6082 orig_alloc = self.reportclient.get_allocations_for_consumer(
6083 context, migration.uuid)
6084 if not orig_alloc:
6085 LOG.error('Did not find resource allocations for migration '
6086 '%s on source node %s. Unable to revert source node '
6087 'allocations back to the instance.',
6088 migration.uuid, migration.source_node, instance=instance)
6089 return False
6091 LOG.info('Swapping old allocation on %(rp_uuids)s held by migration '
6092 '%(mig)s for instance',
6093 {'rp_uuids': orig_alloc.keys(), 'mig': migration.uuid},
6094 instance=instance)
6095 # FIXME(gibi): This method is flawed in that it does not handle
6096 # allocations against sharing providers in any special way. This leads
6097 # to duplicate allocations against the sharing provider during
6098 # migration.
6099 # TODO(cdent): Should we be doing anything with return values here?
6100 self.reportclient.move_allocations(context, migration.uuid,
6101 instance.uuid)
6102 return orig_alloc
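# Illustrative sketch of what the allocation "swap" above means: before
# the revert, the source-node usage is held by the migration UUID consumer
# and the dest-node usage by the instance UUID consumer; move_allocations
# re-assigns the source-node allocation to the instance, and the dest-node
# allocation held by the instance is dropped as part of the move. The
# UUIDs and resource amounts below are made up for illustration only.
before = {
    'migration-uuid': {'source-rp-uuid': {'resources': {'VCPU': 2, 'MEMORY_MB': 2048}}},
    'instance-uuid':  {'dest-rp-uuid':   {'resources': {'VCPU': 4, 'MEMORY_MB': 4096}}},
}
after = {
    'instance-uuid':  {'source-rp-uuid': {'resources': {'VCPU': 2, 'MEMORY_MB': 2048}}},
}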
6104 def _prep_resize(
6105 self, context, image, instance, flavor, filter_properties, node,
6106 migration, request_spec, clean_shutdown=True,
6107 ):
6109 if not filter_properties:  # coverage: line 6109 didn't jump to line 6112 because the condition on line 6109 was always true
6110 filter_properties = {}
6112 if not instance.host:
6113 self._set_instance_obj_error_state(instance)
6114 msg = _('Instance has no source host')
6115 raise exception.MigrationError(reason=msg)
6117 same_host = instance.host == self.host
6118 # if the flavor IDs match, it's a migration; otherwise it's a resize
6119 if same_host and flavor.id == instance['instance_type_id']:
6120 # check whether the driver supports migrating to the same host
6121 if not self.driver.capabilities.get(
6122 'supports_migrate_to_same_host', False):
6123 # Raise InstanceFaultRollback so that the
6124 # _error_out_instance_on_exception context manager in
6125 # prep_resize will set the instance.vm_state properly.
6126 raise exception.InstanceFaultRollback(
6127 inner_exception=exception.UnableToMigrateToSelf(
6128 instance_id=instance.uuid, host=self.host))
6130 # NOTE(danms): Stash the new flavor to avoid having to
6131 # look it up in the database later
6132 instance.new_flavor = flavor
6133 # NOTE(mriedem): Stash the old vm_state so we can set the
6134 # resized/reverted instance back to the same state later.
6135 vm_state = instance.vm_state
6136 LOG.debug('Stashing vm_state: %s', vm_state, instance=instance)
6137 instance.system_metadata['old_vm_state'] = vm_state
6138 instance.save()
6140 if not isinstance(request_spec, objects.RequestSpec):
6141 # Prior to compute RPC API 5.1 conductor would pass a legacy dict
6142 # version of the request spec to compute and since Stein compute
6143 # could be sending that back to conductor on reschedule, so if we
6144 # got a dict convert it to an object.
6145 # TODO(mriedem): We can drop this compat code when we only support
6146 # compute RPC API >=6.0.
6147 request_spec = objects.RequestSpec.from_primitives(
6148 context, request_spec, filter_properties)
6149 # We don't have to set the new flavor on the request spec because
6150 # if we got here it was due to a reschedule from the compute and
6151 # the request spec would already have the new flavor in it from the
6152 # else block below.
6154 provider_mapping = self._get_request_group_mapping(request_spec)
6156 if provider_mapping:
6157 try:
6158 compute_utils.update_pci_request_with_placement_allocations(
6159 context,
6160 self.reportclient,
6161 instance.pci_requests.requests,
6162 provider_mapping,
6163 )
6164 except (exception.AmbiguousResourceProviderForPCIRequest,
6165 exception.UnexpectedResourceProviderNameForPCIRequest
6166 ) as e:
6167 raise exception.BuildAbortException(
6168 reason=str(e), instance_uuid=instance.uuid)
6170 limits = filter_properties.get('limits', {})
6171 allocs = self.reportclient.get_allocations_for_consumer(
6172 context, instance.uuid)
6173 with self.rt.resize_claim(
6174 context, instance, flavor, node, migration, allocs,
6175 image_meta=image, limits=limits,
6176 ) as claim:
6177 LOG.info('Migrating', instance=instance)
6178 # RPC cast to the source host to start the actual resize/migration.
6179 self.compute_rpcapi.resize_instance(
6180 context, instance, claim.migration, image,
6181 flavor, request_spec, clean_shutdown)
6183 def _send_prep_resize_notifications(
6184 self, context, instance, phase, flavor):
6185 """Send "resize.prep.*" notifications.
6187 :param context: nova auth request context
6188 :param instance: The instance being resized
6189 :param phase: The phase of the action (NotificationPhase enum)
6190 :param flavor: The (new) flavor for the resize (same as existing
6191 instance.flavor for a cold migration)
6192 """
6193 # Only send notify_usage_exists if it's the "start" phase.
6194 if phase == fields.NotificationPhase.START:
6195 compute_utils.notify_usage_exists(
6196 self.notifier, context, instance, self.host,
6197 current_period=True)
6199 # Send extra usage info about the flavor if it's the "end" phase for
6200 # the legacy unversioned notification.
6201 extra_usage_info = None
6202 if phase == fields.NotificationPhase.END:
6203 extra_usage_info = dict(
6204 new_instance_type=flavor.name,
6205 new_instance_type_id=flavor.id)
6206 self._notify_about_instance_usage(
6207 context, instance, "resize.prep.%s" % phase,
6208 extra_usage_info=extra_usage_info)
6210 # Send the versioned notification.
6211 compute_utils.notify_about_resize_prep_instance(
6212 context, instance, self.host, phase, flavor)
6214 @wrap_exception()
6215 @reverts_task_state
6216 @wrap_instance_event(prefix='compute')
6217 @wrap_instance_fault
6218 def prep_resize(self, context, image, instance, flavor,
6219 request_spec, filter_properties, node,
6220 clean_shutdown, migration, host_list):
6221 """Initiates the process of moving a running instance to another host.
6223 Possibly changes the VCPU, RAM and disk size in the process.
6225 This is initiated from conductor and runs on the destination host.
6227 The main purpose of this method is performing some checks on the
6228 destination host and making a claim for resources. If the claim fails
6229 then a reschedule to another host may be attempted which involves
6230 calling back to conductor to start the process over again.
6231 """
6232 if node is None:
6233 node = self._get_nodename(instance, refresh=True)
6235 # Pass instance_state=instance.vm_state because we can resize
6236 # a STOPPED server and we don't want to set it back to ACTIVE
6237 # in case _prep_resize fails.
6238 instance_state = instance.vm_state
6239 with self._error_out_instance_on_exception(
6240 context, instance, instance_state=instance_state), \
6241 errors_out_migration_ctxt(migration):
6243 self._send_prep_resize_notifications(
6244 context, instance, fields.NotificationPhase.START,
6245 flavor)
6246 try:
6247 scheduler_hints = self._get_scheduler_hints(filter_properties,
6248 request_spec)
6249 # Error out if this host cannot accept the new instance due
6250 # to anti-affinity. At this point the migration is already
6251 # in-progress, so this is the definitive moment to abort due to
6252 # the policy violation. Also, exceptions raised here are covered by
6253 # the cleanup methods in the except block.
6254 try:
6255 self._validate_instance_group_policy(context, instance,
6256 scheduler_hints)
6257 except exception.RescheduledException as e:
6258 raise exception.InstanceFaultRollback(inner_exception=e)
6260 self._prep_resize(context, image, instance,
6261 flavor, filter_properties,
6262 node, migration, request_spec,
6263 clean_shutdown)
6264 except exception.BuildAbortException:
6265 # NOTE(gibi): We failed
6266 # update_pci_request_with_placement_allocations so
6267 # there is no reason to re-schedule. Just revert the allocation
6268 # and fail the migration.
6269 with excutils.save_and_reraise_exception():
6270 self._revert_allocation(context, instance, migration)
6271 except Exception:
6272 # Since we hit a failure, we're either rescheduling or dead
6273 # and either way we need to cleanup any allocations created
6274 # by the scheduler for the destination node.
6275 self._revert_allocation(context, instance, migration)
6276 # try to re-schedule the resize elsewhere:
6277 exc_info = sys.exc_info()
6278 self._reschedule_resize_or_reraise(context, instance,
6279 exc_info, flavor, request_spec,
6280 filter_properties, host_list)
6281 finally:
6282 self._send_prep_resize_notifications(
6283 context, instance, fields.NotificationPhase.END,
6284 flavor)
6286 def _reschedule_resize_or_reraise(self, context, instance, exc_info,
6287 flavor, request_spec, filter_properties, host_list):
6288 """Try to re-schedule the resize or re-raise the original error to
6289 error out the instance.
6290 """
6291 if not filter_properties:
6292 filter_properties = {}
6294 rescheduled = False
6295 instance_uuid = instance.uuid
6297 try:
6298 retry = filter_properties.get('retry')
6299 if retry:
6300 LOG.debug('Rescheduling, attempt %d', retry['num_attempts'],
6301 instance_uuid=instance_uuid)
6303 # reset the task state
6304 task_state = task_states.RESIZE_PREP
6305 self._instance_update(context, instance, task_state=task_state)
6307 if exc_info:  # coverage: line 6307 didn't jump to line 6313 because the condition on line 6307 was always true
6308 # stringify to avoid circular ref problem in json
6309 # serialization
6310 retry['exc'] = traceback.format_exception_only(
6311 exc_info[0], exc_info[1])
6313 scheduler_hint = {'filter_properties': filter_properties}
6315 self.compute_task_api.resize_instance(
6316 context, instance, scheduler_hint, flavor,
6317 request_spec=request_spec, host_list=host_list)
6319 rescheduled = True
6320 else:
6321 # no retry information, do not reschedule.
6322 LOG.debug('Retry info not present, will not reschedule',
6323 instance_uuid=instance_uuid)
6324 rescheduled = False
6325 except Exception as error:
6326 rescheduled = False
6327 LOG.exception("Error trying to reschedule",
6328 instance_uuid=instance_uuid)
6329 compute_utils.add_instance_fault_from_exc(context,
6330 instance, error,
6331 exc_info=sys.exc_info())
6332 self._notify_about_instance_usage(context, instance,
6333 'resize.error', fault=error)
6334 compute_utils.notify_about_instance_action(
6335 context, instance, self.host,
6336 action=fields.NotificationAction.RESIZE,
6337 phase=fields.NotificationPhase.ERROR,
6338 exception=error,
6339 )
6341 if rescheduled:
6342 self._log_original_error(exc_info, instance_uuid)
6343 compute_utils.add_instance_fault_from_exc(context,
6344 instance, exc_info[1], exc_info=exc_info)
6345 self._notify_about_instance_usage(context, instance,
6346 'resize.error', fault=exc_info[1])
6347 compute_utils.notify_about_instance_action(
6348 context, instance, self.host,
6349 action=fields.NotificationAction.RESIZE,
6350 phase=fields.NotificationPhase.ERROR,
6351 exception=exc_info[1],
6352 )
6353 else:
6354 # not re-scheduling
6355 exc = exc_info[1] or exc_info[0]()
6356 if exc.__traceback__ is not exc_info[2]:  # coverage: line 6356 didn't jump to line 6357 because the condition on line 6356 was never true
6357 raise exc.with_traceback(exc_info[2])
6358 raise exc
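# Illustrative sketch of the re-raise pattern just above: when not
# rescheduling, the exception captured earlier via sys.exc_info() is
# re-raised with its original traceback so the failure context is not
# lost. A minimal, self-contained version of the same idiom.
import sys

def _capture():
    try:
        raise ValueError('original failure')
    except ValueError:
        return sys.exc_info()

def _reraise(exc_info):
    exc = exc_info[1] or exc_info[0]()
    if exc.__traceback__ is not exc_info[2]:
        raise exc.with_traceback(exc_info[2])
    raise exc

try:
    _reraise(_capture())
except ValueError as err:
    assert str(err) == 'original failure'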
6360 @messaging.expected_exceptions(exception.MigrationPreCheckError)
6361 @wrap_exception()
6362 @wrap_instance_event(prefix='compute')
6363 @wrap_instance_fault
6364 def prep_snapshot_based_resize_at_dest(
6365 self, ctxt, instance, flavor, nodename, migration, limits):
6366 """Performs pre-cross-cell resize resource claim on the dest host.
6368 This runs on the destination host in a cross-cell resize operation
6369 before the resize is actually started.
6371 Performs a resize_claim for resources that are not claimed in placement
6372 like PCI devices and NUMA topology.
6374 Note that this is different from same-cell prep_resize in that this:
6376 * Does not RPC cast to the source compute; that is orchestrated from
6377 conductor.
6378 * Does not reschedule on failure; conductor handles that since it is
6379 synchronously RPC calling this method. As such, the
6380 reverts_task_state decorator is not used on this method.
6382 :param ctxt: user auth request context
6383 :param instance: the instance being resized
6384 :param flavor: the flavor being resized to (unchanged for cold migrate)
6385 :param nodename: Name of the target compute node
6386 :param migration: nova.objects.Migration object for the operation
6387 :param limits: nova.objects.SchedulerLimits object of resource limits
6388 :returns: nova.objects.MigrationContext; the migration context created
6389 on the destination host during the resize_claim.
6390 :raises: nova.exception.MigrationPreCheckError if the pre-check
6391 validation fails for the given host selection
6392 """
6393 LOG.debug('Checking if we can cross-cell migrate instance to this '
6394 'host (%s).', self.host, instance=instance)
6395 self._send_prep_resize_notifications(
6396 ctxt, instance, fields.NotificationPhase.START, flavor)
6397 # TODO(mriedem): update_pci_request_with_placement_allocations
6398 # should be called here if the request spec has request group mappings,
6399 # e.g. for things like QoS ports with resource requests. Do it outside
6400 # the try/except so if it raises BuildAbortException we do not attempt
6401 # to reschedule.
6402 try:
6403 # Get the allocations within the try/except block in case we get
6404 # an error so MigrationPreCheckError is raised up.
6405 allocations = self.reportclient.get_allocs_for_consumer(
6406 ctxt, instance.uuid)['allocations']
6407 # Claim resources on this target host using the new flavor which
6408 # will create the MigrationContext object. Note that in the future
6409 # if we want to do other validation here we should do it within
6410 # the MoveClaim context so we can drop the claim if anything fails.
6411 self.rt.resize_claim(
6412 ctxt, instance, flavor, nodename, migration, allocations,
6413 image_meta=instance.image_meta, limits=limits)
6414 except Exception as ex:
6415 err = str(ex)
6416 LOG.warning(
6417 'Cross-cell resize pre-checks failed for this host (%s). '
6418 'Cleaning up. Failure: %s', self.host, err,
6419 instance=instance, exc_info=True)
6420 raise exception.MigrationPreCheckError(
6421 reason=(_("Pre-checks failed on host '%(host)s'. "
6422 "Error: %(error)s") %
6423 {'host': self.host, 'error': err}))
6424 finally:
6425 self._send_prep_resize_notifications(
6426 ctxt, instance, fields.NotificationPhase.END, flavor)
6428 # ResourceTracker.resize_claim() sets instance.migration_context.
6429 return instance.migration_context
6431 @messaging.expected_exceptions(exception.InstancePowerOffFailure)
6432 @wrap_exception()
6433 @reverts_task_state
6434 @wrap_instance_event(prefix='compute')
6435 @errors_out_migration
6436 @wrap_instance_fault
6437 def prep_snapshot_based_resize_at_source(
6438 self, ctxt, instance, migration, snapshot_id=None):
6439 """Prepares the instance at the source host for cross-cell resize
6441 Performs actions like powering off the guest, uploading snapshot data if
6442 the instance is not volume-backed, disconnecting volumes, unplugging
6443 VIFs and activating the destination host port bindings.
6445 :param ctxt: user auth request context targeted at source cell
6446 :param instance: nova.objects.Instance; the instance being resized.
6447 The expected instance.task_state is "resize_migrating" when calling
6448 this method, and the expected task_state upon successful completion
6449 is "resize_migrated".
6450 :param migration: nova.objects.Migration object for the operation.
6451 The expected migration.status is "pre-migrating" when calling this
6452 method and the expected status upon successful completion is
6453 "post-migrating".
6454 :param snapshot_id: ID of the image snapshot to upload if not a
6455 volume-backed instance
6456 :raises: nova.exception.InstancePowerOffFailure if stopping the
6457 instance fails
6458 """
6459 LOG.info('Preparing for snapshot based resize on source host %s.',
6460 self.host, instance=instance)
6461 # Note that if anything fails here, the migration-based allocations
6462 # created in conductor should be reverted by conductor as well,
6463 # see MigrationTask.rollback.
6464 self._prep_snapshot_based_resize_at_source(
6465 ctxt, instance, migration, snapshot_id=snapshot_id)
6467 @delete_image_on_error
6468 def _snapshot_for_resize(self, ctxt, image_id, instance):
6469 """Uploads snapshot for the instance during a snapshot-based resize
6471 If the snapshot operation fails the image will be deleted.
6473 :param ctxt: the nova auth request context for the resize operation
6474 :param image_id: the snapshot image ID
6475 :param instance: the instance to snapshot/resize
6476 """
6477 LOG.debug('Uploading snapshot data for image %s', image_id,
6478 instance=instance)
6479 # Note that we do not track the snapshot phase task states
6480 # during resize since we do not want to reflect those into the
6481 # actual instance.task_state.
6482 update_task_state = lambda *args, **kwargs: None
6483 with timeutils.StopWatch() as timer:
6484 self.driver.snapshot(ctxt, instance, image_id, update_task_state)
6485 LOG.debug('Took %0.2f seconds to snapshot the instance on '
6486 'the hypervisor.', timer.elapsed(), instance=instance)
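# Illustrative sketch of the delete-image-on-error behaviour described in
# the docstring above: if uploading the snapshot raises, the now useless
# snapshot image is removed so it does not leak in the image service.
# Simplified stand-in for the module's delete_image_on_error decorator;
# the delete_image callable is an assumption for illustration.
import functools

def _delete_image_on_error(delete_image):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(ctxt, image_id, instance, *args, **kwargs):
            try:
                return func(ctxt, image_id, instance, *args, **kwargs)
            except Exception:
                delete_image(ctxt, image_id)   # best-effort cleanup
                raise
        return wrapper
    return decorator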
6488 def _prep_snapshot_based_resize_at_source(
6489 self, ctxt, instance, migration, snapshot_id=None):
6490 """Private method for prep_snapshot_based_resize_at_source so calling
6491 code can handle errors and perform rollbacks as necessary.
6492 """
6493 # Fetch and update the instance.info_cache.
6494 network_info = self.network_api.get_instance_nw_info(ctxt, instance)
6495 # Get the BDMs attached to this instance on this source host.
6496 bdms = instance.get_bdms()
6497 # Send the resize.start notification.
6498 self._send_resize_instance_notifications(
6499 ctxt, instance, bdms, network_info, fields.NotificationPhase.START)
6500 # Update the migration status from "pre-migrating" to "migrating".
6501 migration.status = 'migrating'
6502 migration.save()
6504 # Since the instance is going to be left on the source host during the
6505 # resize, we need to power it off so we do not have the instance
6506 # potentially running in two places.
6507 LOG.debug('Stopping instance', instance=instance)
6508 try:
6509 self._power_off_instance(ctxt, instance)
6510 except Exception as e:
6511 LOG.exception('Failed to power off instance.', instance=instance)
6512 raise exception.InstancePowerOffFailure(reason=str(e))
6513 instance.power_state = self._get_power_state(instance)
6515 # If a snapshot image ID was provided, we need to snapshot the guest
6516 # disk image and upload it to the image service.
6517 if snapshot_id:
6518 self._snapshot_for_resize(ctxt, snapshot_id, instance)
6520 block_device_info = self._get_instance_block_device_info(
6521 ctxt, instance, bdms=bdms)
6523 # If something fails at this point the instance must go to ERROR
6524 # status for operator intervention or to reboot/rebuild the instance.
6525 with self._error_out_instance_on_exception(
6526 ctxt, instance, instance_state=vm_states.ERROR):
6528 # Destroy the guest on the source host which will disconnect
6529 # volumes and unplug VIFs. Note that we DO NOT destroy disks since
6530 # we want to leave those on the source host in case of a later
6531 # failure and disks are needed to recover the guest or in case the
6532 # resize is reverted.
6533 LOG.debug('Destroying guest on source host but retaining disks.',
6534 instance=instance)
6535 self.driver.destroy(
6536 ctxt, instance, network_info,
6537 block_device_info=block_device_info, destroy_disks=False)
6539 # At this point the volumes are disconnected from this source host.
6540 # Delete the old volume attachment records and create new empty
6541 # ones which will be used later if the resize is reverted.
6542 LOG.debug('Deleting volume attachments for the source host.',
6543 instance=instance)
6544 self._terminate_volume_connections(ctxt, instance, bdms)
6546 # At this point the VIFs are unplugged from this source host.
6547 # Activate the dest host port bindings created by conductor.
6548 self.network_api.migrate_instance_start(ctxt, instance, migration)
6550 # Update the migration status from "migrating" to "post-migrating".
6551 migration.status = 'post-migrating'
6552 migration.save()
6554 # At this point, the traditional resize_instance would update the
6555 # instance host/node values to point at the dest host/node because
6556 # that is where the disk is transferred during resize_instance, but
6557 # with cross-cell resize the instance is not yet at the dest host
6558 # so we do not make that update here.
6559 instance.task_state = task_states.RESIZE_MIGRATED
6560 instance.save(expected_task_state=task_states.RESIZE_MIGRATING)
6562 self._send_resize_instance_notifications(
6563 ctxt, instance, bdms, network_info,
6564 fields.NotificationPhase.END)
6565 self.instance_events.clear_events_for_instance(instance)
6567 @wrap_exception()
6568 @reverts_task_state
6569 @wrap_instance_event(prefix='compute')
6570 @wrap_instance_fault
6571 def resize_instance(self, context, instance, image,
6572 migration, flavor, clean_shutdown,
6573 request_spec):
6574 """Starts the migration of a running instance to another host.
6576 This is initiated from the destination host's ``prep_resize`` routine
6577 and runs on the source host.
6578 """
6579 try:
6580 self._resize_instance(context, instance, image, migration,
6581 flavor, clean_shutdown, request_spec)
6582 except Exception:
6583 with excutils.save_and_reraise_exception():
6584 self._revert_allocation(context, instance, migration)
6586 def _resize_instance(
6587 self, context, instance, image, migration, flavor,
6588 clean_shutdown, request_spec,
6589 ):
6590 # Pass instance_state=instance.vm_state because we can resize
6591 # a STOPPED server and we don't want to set it back to ACTIVE
6592 # in case migrate_disk_and_power_off raises InstanceFaultRollback.
6593 instance_state = instance.vm_state
6594 with self._error_out_instance_on_exception(
6595 context, instance, instance_state=instance_state), \
6596 errors_out_migration_ctxt(migration):
6597 network_info = self.network_api.get_instance_nw_info(context,
6598 instance)
6600 migration.status = 'migrating'
6601 migration.save()
6603 instance.task_state = task_states.RESIZE_MIGRATING
6604 instance.save(expected_task_state=task_states.RESIZE_PREP)
6606 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
6607 context, instance.uuid)
6608 self._send_resize_instance_notifications(
6609 context, instance, bdms, network_info,
6610 fields.NotificationPhase.START)
6612 block_device_info = self._get_instance_block_device_info(
6613 context, instance, bdms=bdms)
6615 timeout, retry_interval = self._get_power_off_values(
6616 instance, clean_shutdown)
6617 disk_info = self.driver.migrate_disk_and_power_off(
6618 context, instance, migration.dest_host,
6619 flavor, network_info,
6620 block_device_info,
6621 timeout, retry_interval)
6623 self._terminate_volume_connections(context, instance, bdms)
6625 self.network_api.migrate_instance_start(context,
6626 instance,
6627 migration)
6629 migration.status = 'post-migrating'
6630 migration.save()
6632 instance.host = migration.dest_compute
6633 instance.node = migration.dest_node
6634 # NOTE(danms): This could fail if the dest_compute_id was not set
6635 # by the sending host *and* migration.dest_node does not point to
6636 # a legit node on the dest_compute. Since that would be very bad,
6637 # let the ComputeHostNotFound raise from here.
6638 instance.compute_id = migration.get_dest_compute_id()
6639 # NOTE(gibi): as the instance is now tracked on the destination we
6640 # have to make sure that the source compute resource tracker can
6641 # track this instance as a migration. For that the resource tracker
6642 # needs to see the old_flavor set on the instance. Setting old_flavor
6643 # used to be done on the destination host in finish_resize but that
6644 # is racy with the source host's update_available_resource
6645 # periodic task.
6646 instance.old_flavor = instance.flavor
6647 instance.task_state = task_states.RESIZE_MIGRATED
6648 instance.save(expected_task_state=task_states.RESIZE_MIGRATING)
6650 # RPC cast to the destination host to finish the resize/migration.
6651 self.compute_rpcapi.finish_resize(context, instance,
6652 migration, image, disk_info, migration.dest_compute,
6653 request_spec)
6655 self._send_resize_instance_notifications(
6656 context, instance, bdms, network_info,
6657 fields.NotificationPhase.END)
6658 self.instance_events.clear_events_for_instance(instance)
6660 def _send_resize_instance_notifications(
6661 self, context, instance, bdms, network_info, phase):
6662 """Send "resize.(start|end)" notifications.
6664 :param context: nova auth request context
6665 :param instance: The instance being resized
6666 :param bdms: BlockDeviceMappingList for the BDMs associated with the
6667 instance
6668 :param network_info: NetworkInfo for the instance info cache of ports
6669 :param phase: The phase of the action (NotificationPhase enum, either
6670 ``start`` or ``end``)
6671 """
6672 action = fields.NotificationAction.RESIZE
6673 # Send the legacy unversioned notification.
6674 self._notify_about_instance_usage(
6675 context, instance, "%s.%s" % (action, phase),
6676 network_info=network_info)
6677 # Send the versioned notification.
6678 compute_utils.notify_about_instance_action(
6679 context, instance, self.host, action=action, phase=phase,
6680 bdms=bdms)
6682 def _terminate_volume_connections(self, context, instance, bdms):
6683 connector = None
6684 for bdm in bdms:
6685 if bdm.is_volume:
6686 if bdm.attachment_id:
6687 # NOTE(jdg): So here's the thing: the idea behind the new
6688 # attach APIs was to have a new code fork/path to follow.
6689 # We're not going to do that, so we have to do some extra
6690 # work in here to make this *behave* just like the old code.
6691 # Cinder doesn't allow disconnecting/reconnecting attachments
6692 # in the new attach code (you just delete the attachment and
6693 # get a new one), so we have to create a new attachment
6694 # without a connector (a reserve) and then delete the old
6695 # one. In other words, beware.
6696 attachment_id = self.volume_api.attachment_create(
6697 context, bdm.volume_id, instance.uuid)['id']
6698 self.volume_api.attachment_delete(context,
6699 bdm.attachment_id)
6700 bdm.attachment_id = attachment_id
6701 bdm.save()
6703 else:
6704 if connector is None:
6705 connector = self.driver.get_volume_connector(instance)
6706 self.volume_api.terminate_connection(context,
6707 bdm.volume_id,
6708 connector)
6710 @staticmethod
6711 def _set_instance_info(instance, flavor):
6712 instance.instance_type_id = flavor.id
6713 instance.memory_mb = flavor.memory_mb
6714 instance.vcpus = flavor.vcpus
6715 instance.root_gb = flavor.root_gb
6716 instance.ephemeral_gb = flavor.ephemeral_gb
6717 instance.flavor = flavor
6719 def _update_volume_attachments(self, context, instance, bdms):
6720 """Updates volume attachments using the virt driver host connector.
6722 :param context: nova.context.RequestContext - user request context
6723 :param instance: nova.objects.Instance
6724 :param bdms: nova.objects.BlockDeviceMappingList - the list of block
6725 device mappings for the given instance
6726 """
6727 if bdms:
6728 connector = None
6729 for bdm in bdms:
6730 if bdm.is_volume and bdm.attachment_id:
6731 if connector is None:  # coverage: line 6731 didn't jump to line 6733 because the condition on line 6731 was always true
6732 connector = self.driver.get_volume_connector(instance)
6733 self.volume_api.attachment_update(
6734 context, bdm.attachment_id, connector, bdm.device_name)
6736 def _complete_volume_attachments(self, context, bdms):
6737 """Completes volume attachments for the instance
6739 :param context: nova.context.RequestContext - user request context
6740 :param bdms: nova.objects.BlockDeviceMappingList - the list of block
6741 device mappings for the given instance
6742 """
6743 if bdms:
6744 for bdm in bdms:
6745 if bdm.is_volume and bdm.attachment_id:
6746 self.volume_api.attachment_complete(
6747 context, bdm.attachment_id)
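# Illustrative sketch of the volume attachment lifecycle the three helpers
# above walk through during a move operation:
#   1. _terminate_volume_connections: create a fresh attachment with no
#      connector (a "reserve") and delete the old, connected one;
#   2. _update_volume_attachments: update the reserved attachment with the
#      new host's connector, which populates its connection_info;
#   3. _complete_volume_attachments: mark the attachment complete so the
#      volume goes back to in-use.
# The calls mirror the volume_api usage above, but volume_api and bdm here
# are stand-ins passed in by the caller, not the real Cinder client.
def _move_volume_attachment(volume_api, ctxt, bdm, instance, connector):
    # 1. reserve a new attachment and drop the old one
    new_attachment = volume_api.attachment_create(
        ctxt, bdm.volume_id, instance.uuid)['id']
    volume_api.attachment_delete(ctxt, bdm.attachment_id)
    bdm.attachment_id = new_attachment
    # 2. bind it to the destination host
    volume_api.attachment_update(ctxt, bdm.attachment_id, connector,
                                 bdm.device_name)
    # 3. flip the volume to in-use
    volume_api.attachment_complete(ctxt, bdm.attachment_id)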
6749 def _finish_resize(self, context, instance, migration, disk_info,
6750 image_meta, bdms, request_spec):
6751 resize_instance = False # indicates disks have been resized
6752 old_instance_type_id = migration.old_instance_type_id
6753 new_instance_type_id = migration.new_instance_type_id
6754 old_flavor = instance.flavor # the current flavor is now old
6755 # NOTE(mriedem): Get the old_vm_state so we know if we should
6756 # power on the instance. If old_vm_state is not set we need to default
6757 # to ACTIVE for backwards compatibility
6758 old_vm_state = instance.system_metadata.get('old_vm_state',
6759 vm_states.ACTIVE)
6760 # NOTE(gibi): this is already set by resize_instance on the source
6761 # node before calling finish_resize on the destination, but during an
6762 # upgrade the source node may not yet have the fix for bug 1944759.
6763 # This assignment can be removed in the Z release.
6764 instance.old_flavor = old_flavor
6766 if old_instance_type_id != new_instance_type_id:  # coverage: line 6766 didn't jump to line 6775 because the condition on line 6766 was always true
6767 new_flavor = instance.new_flavor # this is set in _prep_resize
6768 # Set the flavor-related fields on the instance object including
6769 # making instance.flavor = new_flavor.
6770 self._set_instance_info(instance, new_flavor)
6771 for key in ('root_gb', 'swap', 'ephemeral_gb'):
6772 if old_flavor[key] != new_flavor[key]:
6773 resize_instance = True
6774 break
6775 instance.apply_migration_context()
6777 # NOTE(tr3buchet): setup networks on destination host
6778 self.network_api.setup_networks_on_host(context, instance,
6779 migration.dest_compute)
6780 provider_mappings = self._get_request_group_mapping(request_spec)
6782 # For neutron, migrate_instance_finish updates port bindings for this
6783 # host including any PCI devices claimed for SR-IOV ports.
6784 self.network_api.migrate_instance_finish(
6785 context, instance, migration, provider_mappings)
6787 network_info = self.network_api.get_instance_nw_info(context, instance)
6789 instance.task_state = task_states.RESIZE_FINISH
6790 instance.save(expected_task_state=task_states.RESIZE_MIGRATED)
6792 self._send_finish_resize_notifications(
6793 context, instance, bdms, network_info,
6794 fields.NotificationPhase.START)
6796 # We need to update any volume attachments using the destination
6797 # host connector so that we can update the BDM.connection_info
6798 # before calling driver.finish_migration otherwise the driver
6799 # won't know how to connect the volumes to this host.
6800 # Note that _get_instance_block_device_info with
6801 # refresh_conn_info=True will update the BDM.connection_info value
6802 # in the database so we must do this before calling that method.
6803 self._update_volume_attachments(context, instance, bdms)
6805 block_device_info = self._get_instance_block_device_info(
6806 context, instance, refresh_conn_info=True, bdms=bdms)
6808 # NOTE(mriedem): If the original vm_state was STOPPED, we don't
6809 # automatically power on the instance after it's migrated
6810 power_on = old_vm_state != vm_states.STOPPED
6812 # NOTE(sbauza): During a migration, the original allocation is against
6813 # the migration UUID while the target allocation (for the destination
6814 # node) is related to the instance UUID, so here we need to pass the
6815 # new ones.
6816 allocations = self.reportclient.get_allocs_for_consumer(
6817 context, instance.uuid)['allocations']
6819 try:
6820 self.driver.finish_migration(context, migration, instance,
6821 disk_info,
6822 network_info,
6823 image_meta, resize_instance,
6824 allocations,
6825 block_device_info, power_on)
6826 except Exception:
6827 # Note that we do not rollback port bindings to the source host
6828 # because resize_instance (on the source host) updated the
6829 # instance.host to point to *this* host (the destination host)
6830 # so the port bindings pointing at this host are correct even
6831 # though we failed to create the guest.
6832 with excutils.save_and_reraise_exception():
6833 # If we failed to create the guest on this host, reset the
6834 # instance flavor-related fields to the old flavor. An
6835 # error handler like reverts_task_state will save the changes.
6836 if old_instance_type_id != new_instance_type_id:  # coverage: line 6836 didn't jump to line 6840
6837 self._set_instance_info(instance, old_flavor)
6839 # Now complete any volume attachments that were previously updated.
6840 self._complete_volume_attachments(context, bdms)
6842 migration.status = 'finished'
6843 migration.save()
6845 instance.vm_state = vm_states.RESIZED
6846 instance.task_state = None
6847 instance.launched_at = timeutils.utcnow()
6848 instance.save(expected_task_state=task_states.RESIZE_FINISH)
6850 return network_info
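# Illustrative sketch of the resize_instance flag computed near the top of
# _finish_resize above: disks only need to be resized when one of the
# disk-related flavor fields changed between the old and new flavor. The
# dicts below stand in for Flavor objects for illustration only.
def _needs_disk_resize(old_flavor, new_flavor):
    return any(old_flavor[key] != new_flavor[key]
               for key in ('root_gb', 'swap', 'ephemeral_gb'))

assert _needs_disk_resize(
    {'root_gb': 20, 'swap': 0, 'ephemeral_gb': 0},
    {'root_gb': 40, 'swap': 0, 'ephemeral_gb': 0})
assert not _needs_disk_resize(
    {'root_gb': 20, 'swap': 0, 'ephemeral_gb': 0},
    {'root_gb': 20, 'swap': 0, 'ephemeral_gb': 0})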
6852 @wrap_exception()
6853 @reverts_task_state
6854 @wrap_instance_event(prefix='compute')
6855 @errors_out_migration
6856 @wrap_instance_fault
6857 def finish_resize(self, context, disk_info, image, instance,
6858 migration, request_spec):
6859 """Completes the migration process.
6861 Sets up the newly transferred disk and turns on the instance at its
6862 new host machine.
6864 """
6865 try:
6866 self._finish_resize_helper(context, disk_info, image, instance,
6867 migration, request_spec)
6868 except Exception:
6869 with excutils.save_and_reraise_exception():
6870 # At this point, resize_instance (which runs on the source) has
6871 # already updated the instance host/node values to point to
6872 # this (the dest) compute, so we need to leave the allocations
6873 # against the dest node resource provider intact and drop the
6874 # allocations against the source node resource provider. If the
6875 # user tries to recover the server by hard rebooting it, it
6876 # will happen on this host so that's where the allocations
6877 # should go. Note that this is the same method called from
6878 # confirm_resize to cleanup the source node allocations held
6879 # by the migration record.
6880 LOG.info('Deleting allocations for old flavor on source node '
6881 '%s after finish_resize failure. You may be able to '
6882 'recover the instance by hard rebooting it.',
6883 migration.source_compute, instance=instance)
6884 self._delete_allocation_after_move(
6885 context, instance, migration)
6887 def _finish_resize_helper(self, context, disk_info, image, instance,
6888 migration, request_spec):
6889 """Completes the migration process.
6891 The caller must revert the instance's allocations if the migration
6892 process failed.
6893 """
6894 bdms = self._update_bdm_for_swap_to_finish_resize(context, instance)
6896 with self._error_out_instance_on_exception(context, instance):
6897 image_meta = objects.ImageMeta.from_dict(image)
6898 network_info = self._finish_resize(context, instance, migration,
6899 disk_info, image_meta, bdms,
6900 request_spec)
6902 # TODO(melwitt): We should clean up instance console tokens here. The
6903 # instance is on a new host and will need to establish a new console
6904 # connection.
6905 self._update_scheduler_instance_info(context, instance)
6906 self._send_finish_resize_notifications(
6907 context, instance, bdms, network_info,
6908 fields.NotificationPhase.END)
6910 def _send_finish_resize_notifications(
6911 self, context, instance, bdms, network_info, phase):
6912 """Send notifications for the finish_resize flow.
6914 :param context: nova auth request context
6915 :param instance: The instance being resized
6916 :param bdms: BlockDeviceMappingList for the BDMs associated with the
6917 instance
6918 :param network_info: NetworkInfo for the instance info cache of ports
6919 :param phase: The phase of the action (NotificationPhase enum, either
6920 ``start`` or ``end``)
6921 """
6922 # Send the legacy unversioned notification.
6923 self._notify_about_instance_usage(
6924 context, instance, "finish_resize.%s" % phase,
6925 network_info=network_info)
6926 # Send the versioned notification.
6927 compute_utils.notify_about_instance_action(
6928 context, instance, self.host,
6929 action=fields.NotificationAction.RESIZE_FINISH, phase=phase,
6930 bdms=bdms)
6932 @wrap_exception()
6933 @reverts_task_state
6934 @wrap_instance_event(prefix='compute')
6935 @errors_out_migration
6936 @wrap_instance_fault
6937 def finish_snapshot_based_resize_at_dest(
6938 self, ctxt, instance, migration, snapshot_id):
6939 """Finishes the snapshot-based resize at the destination compute.
6941 Sets up block devices and networking on the destination compute and
6942 spawns the guest.
6944 :param ctxt: nova auth request context targeted at the target cell DB
6945 :param instance: The Instance object being resized with the
6946 ``migration_context`` field set. Upon successful completion of this
6947 method the vm_state should be "resized", the task_state should be
6948 None, and migration context, host/node and flavor-related fields
6949 should be set on the instance.
6950 :param migration: The Migration object for this resize operation. Upon
6951 successful completion of this method the migration status should
6952 be "finished".
6953 :param snapshot_id: ID of the image snapshot created for a
6954 non-volume-backed instance, else None.
6955 """
6956 LOG.info('Finishing snapshot based resize on destination host %s.',
6957 self.host, instance=instance)
6958 with self._error_out_instance_on_exception(ctxt, instance):
6959 # Note that if anything fails here, the migration-based allocations
6960 # created in conductor should be reverted by conductor as well,
6961 # see MigrationTask.rollback.
6962 self._finish_snapshot_based_resize_at_dest(
6963 ctxt, instance, migration, snapshot_id)
6965 def _finish_snapshot_based_resize_at_dest(
6966 self, ctxt, instance, migration, snapshot_id):
6967 """Private variant of finish_snapshot_based_resize_at_dest so the
6968 caller can handle reverting resource allocations on failure and perform
6969 other generic error handling.
6970 """
6971 # Figure out the image metadata to use when spawning the guest.
6972 origin_image_ref = instance.image_ref
6973 if snapshot_id:
6974 instance.image_ref = snapshot_id
6975 image_meta = objects.ImageMeta.from_image_ref(
6976 ctxt, self.image_api, snapshot_id)
6977 else:
6978 # Just use what is already on the volume-backed instance.
6979 image_meta = instance.image_meta
6981 resize = migration.migration_type == 'resize'
6982 instance.old_flavor = instance.flavor
6983 if resize:  # coverage: line 6983 didn't jump to line 6991 because the condition on line 6983 was always true
6984 flavor = instance.new_flavor
6985 # If we are resizing to a new flavor we need to set the
6986 # flavor-related fields on the instance.
6987 # NOTE(mriedem): This is likely where storing old/new_flavor on
6988 # the MigrationContext would make this cleaner.
6989 self._set_instance_info(instance, flavor)
6991 instance.apply_migration_context()
6992 instance.task_state = task_states.RESIZE_FINISH
6993 instance.save(expected_task_state=task_states.RESIZE_MIGRATED)
6995 # This seems a bit late to be sending the start notification but
6996 # it is what traditional resize has always done as well and it does
6997 # contain the changes to the instance with the new_flavor and
6998 # task_state.
6999 bdms = instance.get_bdms()
7000 network_info = instance.get_network_info()
7001 self._send_finish_resize_notifications(
7002 ctxt, instance, bdms, network_info,
7003 fields.NotificationPhase.START)
7005 # Setup volumes and networking and spawn the guest in the hypervisor.
7006 self._finish_snapshot_based_resize_at_dest_spawn(
7007 ctxt, instance, migration, image_meta, bdms)
7009 # If we spawned from a temporary snapshot image we can delete that now,
7010 # similar to how unshelve works.
7011 if snapshot_id:
7012 instance.image_ref = origin_image_ref
7013 compute_utils.delete_image(
7014 ctxt, instance, self.image_api, snapshot_id)
7016 migration.status = 'finished'
7017 migration.save()
7019 self._update_instance_after_spawn(instance, vm_state=vm_states.RESIZED)
7020 # Setting the host/node values will make the ResourceTracker continue
7021 # to track usage for this instance on this host.
7022 instance.host = migration.dest_compute
7023 # NOTE(danms): This could fail if we somehow ended up on the wrong
7024 # host without the desired node. That's a big problem, so let the
7025 # ComputeHostNotFound raise from here.
7026 cn = self.rt.get_node_by_name(migration.dest_node)
7027 instance.compute_id = cn.id
7028 instance.node = cn.hypervisor_hostname
7029 instance.save(expected_task_state=task_states.RESIZE_FINISH)
7031 # Broadcast to all schedulers that the instance is on this host.
7032 self._update_scheduler_instance_info(ctxt, instance)
7033 self._send_finish_resize_notifications(
7034 ctxt, instance, bdms, network_info,
7035 fields.NotificationPhase.END)
7037 def _finish_snapshot_based_resize_at_dest_spawn(
7038 self, ctxt, instance, migration, image_meta, bdms):
7039 """Sets up volumes and networking and spawns the guest on the dest host
7041 If the instance was stopped when the resize was initiated the guest
7042 will be created but remain in a shutdown power state.
7044 If the spawn fails, port bindings are rolled back to the source host
7045 and volume connections are terminated for this dest host.
7047 :param ctxt: nova auth request context
7048 :param instance: Instance object being migrated
7049 :param migration: Migration object for the operation
7050 :param image_meta: ImageMeta object used during driver.spawn
7051 :param bdms: BlockDeviceMappingList of BDMs for the instance
7052 """
7053 # Update the volume attachments using this host's connector.
7054 # That will update the BlockDeviceMapping.connection_info which
7055 # will be used to connect the volumes on this host during spawn().
7056 block_device_info = self._prep_block_device(ctxt, instance, bdms)
7058 allocations = self.reportclient.get_allocations_for_consumer(
7059 ctxt, instance.uuid)
7061 # We do not call self.network_api.setup_networks_on_host here because
7062 # for neutron that sets up the port migration profile which is only
7063 # used during live migration with DVR. Yes it is gross knowing what
7064 # that method does internally. We could change this when bug 1814837
7065 # is fixed if setup_networks_on_host is made smarter by passing the
7066 # migration record and the method checks the migration_type.
7068 # Activate the port bindings for this host.
7069 # FIXME(mriedem): We're going to have the same issue as bug 1813789
7070 # here because this will update the port bindings and send the
7071 # network-vif-plugged event and that means when driver.spawn waits for
7072 # it we might have already gotten the event and neutron won't send
7073 # another one so we could timeout.
7074 # TODO(mriedem): Calculate provider mappings when we support cross-cell
7075 # resize/migrate with ports having resource requests.
7076 self.network_api.migrate_instance_finish(
7077 ctxt, instance, migration, provider_mappings=None)
7078 network_info = self.network_api.get_instance_nw_info(ctxt, instance)
7080 # If the original vm_state was STOPPED, we do not automatically
7081 # power on the instance after it is migrated.
7082 power_on = instance.system_metadata['old_vm_state'] == vm_states.ACTIVE
7083 try:
7084 # NOTE(mriedem): If this instance uses a config drive, it will get
7085 # rebuilt here which means any personality files will be lost,
7086 # similar to unshelve. If the instance is not using a config drive
7087 # and getting metadata from the metadata API service, personality
7088 # files would be lost regardless of the move operation.
7089 self.driver.spawn(
7090 ctxt, instance, image_meta, injected_files=[],
7091 admin_password=None, allocations=allocations,
7092 network_info=network_info, block_device_info=block_device_info,
7093 power_on=power_on)
7094 except Exception:
7095 with excutils.save_and_reraise_exception(logger=LOG):
7096 # Rollback port bindings to the source host.
7097 try:
7098 # This is gross but migrate_instance_start looks at the
7099 # migration.dest_compute to determine where to activate the
7100 # port bindings and we want the source compute port
7101 # bindings to be re-activated. Remember at this point the
7102 # instance.host is still pointing at the source compute.
7103 # TODO(mriedem): Maybe we should be calling
7104 # setup_instance_network_on_host here to deal with pci
7105 # devices?
7106 with utils.temporary_mutation(
7107 migration, dest_compute=migration.source_compute):
7108 self.network_api.migrate_instance_start(
7109 ctxt, instance, migration)
7110 except Exception:
7111 LOG.exception(
7112 'Failed to activate port bindings on the source '
7113 'host: %s', migration.source_compute,
7114 instance=instance)
7116 # Rollback volume connections on this host.
7117 for bdm in bdms:
7118 if bdm.is_volume:
7119 try:
7120 self._remove_volume_connection(
7121 ctxt, bdm, instance, delete_attachment=True)
7122 except Exception:
7123 LOG.exception('Failed to remove volume connection '
7124 'on this host %s for volume %s.',
7125 self.host, bdm.volume_id,
7126 instance=instance)
7128 @wrap_exception()
7129 @wrap_instance_fault
7130 def add_fixed_ip_to_instance(self, context, network_id, instance):
7131 """Calls network_api to add new fixed_ip to instance
7132 then injects the new network info and resets instance networking.
7134 """
7135 self._notify_about_instance_usage(
7136 context, instance, "create_ip.start")
7138 network_info = self.network_api.add_fixed_ip_to_instance(context,
7139 instance,
7140 network_id)
7141 self._inject_network_info(instance, network_info)
7143 # NOTE(russellb) We just want to bump updated_at. See bug 1143466.
7144 instance.updated_at = timeutils.utcnow()
7145 instance.save()
7147 self._notify_about_instance_usage(
7148 context, instance, "create_ip.end", network_info=network_info)
7150 @wrap_exception()
7151 @wrap_instance_fault
7152 def remove_fixed_ip_from_instance(self, context, address, instance):
7153 """Calls network_api to remove existing fixed_ip from instance
7154 by injecting the altered network info and resetting
7155 instance networking.
7156 """
7157 self._notify_about_instance_usage(
7158 context, instance, "delete_ip.start")
7160 network_info = self.network_api.remove_fixed_ip_from_instance(context,
7161 instance,
7162 address)
7163 self._inject_network_info(instance, network_info)
7165 # NOTE(russellb) We just want to bump updated_at. See bug 1143466.
7166 instance.updated_at = timeutils.utcnow()
7167 instance.save()
7169 self._notify_about_instance_usage(
7170 context, instance, "delete_ip.end", network_info=network_info)
7172 @wrap_exception()
7173 @reverts_task_state
7174 @wrap_instance_event(prefix='compute')
7175 @wrap_instance_fault
7176 def pause_instance(self, context, instance):
7177 """Pause an instance on this host."""
7178 context = context.elevated()
7179 LOG.info('Pausing', instance=instance)
7180 self._notify_about_instance_usage(context, instance, 'pause.start')
7181 compute_utils.notify_about_instance_action(context, instance,
7182 self.host, action=fields.NotificationAction.PAUSE,
7183 phase=fields.NotificationPhase.START)
7184 self.driver.pause(instance)
7185 instance.power_state = self._get_power_state(instance)
7186 instance.vm_state = vm_states.PAUSED
7187 instance.task_state = None
7188 instance.save(expected_task_state=task_states.PAUSING)
7189 self._notify_about_instance_usage(context, instance, 'pause.end')
7190 compute_utils.notify_about_instance_action(context, instance,
7191 self.host, action=fields.NotificationAction.PAUSE,
7192 phase=fields.NotificationPhase.END)
7194 @wrap_exception()
7195 @reverts_task_state
7196 @wrap_instance_event(prefix='compute')
7197 @wrap_instance_fault
7198 def unpause_instance(self, context, instance):
7199 """Unpause a paused instance on this host."""
7200 context = context.elevated()
7201 LOG.info('Unpausing', instance=instance)
7202 self._notify_about_instance_usage(context, instance, 'unpause.start')
7203 compute_utils.notify_about_instance_action(context, instance,
7204 self.host, action=fields.NotificationAction.UNPAUSE,
7205 phase=fields.NotificationPhase.START)
7206 self.driver.unpause(instance)
7207 instance.power_state = self._get_power_state(instance)
7208 instance.vm_state = vm_states.ACTIVE
7209 instance.task_state = None
7210 instance.save(expected_task_state=task_states.UNPAUSING)
7211 self._notify_about_instance_usage(context, instance, 'unpause.end')
7212 compute_utils.notify_about_instance_action(context, instance,
7213 self.host, action=fields.NotificationAction.UNPAUSE,
7214 phase=fields.NotificationPhase.END)
7216 @wrap_exception()
7217 def host_power_action(self, context, action):
7218 """Reboots, shuts down or powers up the host."""
7219 return self.driver.host_power_action(action)
7221 @wrap_exception()
7222 def host_maintenance_mode(self, context, host, mode):
7223 """Start/Stop host maintenance window. On start, it triggers
7224 guest VMs evacuation.
7225 """
7226 return self.driver.host_maintenance_mode(host, mode)
7228 def _update_compute_provider_status(self, context, enabled):
7229 """Adds or removes the COMPUTE_STATUS_DISABLED trait for this host.
7231 For each ComputeNode managed by this service, adds or removes the
7232 COMPUTE_STATUS_DISABLED trait to/from the associated resource provider
7233 in Placement.
7235 :param context: nova auth RequestContext
7236 :param enabled: True if the node is enabled in which case the trait
7237 would be removed, False if the node is disabled in which case
7238 the trait would be added.
7239 :raises: ComputeHostNotFound if there are no compute nodes found in
7240 the ResourceTracker for this service.
7241 """
7242 # Get the compute node(s) on this host. Remember that ironic can be
7243 # managing more than one compute node.
7244 nodes = self.rt.compute_nodes.values()
7245 if not nodes:
7246 raise exception.ComputeHostNotFound(host=self.host)
7247 # For each node, we want to add (or remove) the COMPUTE_STATUS_DISABLED
7248 # trait on the related resource provider in placement so the scheduler
7249 # (pre-)filters the provider based on its status.
7250 for node in nodes:
7251 try:
7252 self.virtapi.update_compute_provider_status(
7253 context, node.uuid, enabled)
7254 except (exception.ResourceProviderTraitRetrievalFailed,
7255 exception.ResourceProviderUpdateConflict,
7256 exception.ResourceProviderUpdateFailed,
7257 exception.TraitRetrievalFailed) as e:
7258 # This is best effort so just log a warning and continue.
7259 LOG.warning('An error occurred while updating '
7260 'COMPUTE_STATUS_DISABLED trait on compute node '
7261 'resource provider %s. The trait will be '
7262 'synchronized when the update_available_resource '
7263 'periodic task runs. Error: %s',
7264 node.uuid, e.format_message())
7265 except Exception:
7266 LOG.exception('An error occurred while updating '
7267 'COMPUTE_STATUS_DISABLED trait on compute node '
7268 'resource provider %s. The trait will be '
7269 'synchronized when the '
7270 'update_available_resource periodic task runs.',
7271 node.uuid)
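# Summary of the toggle performed in the loop above (illustrative note, not
# part of the original source): for every compute node resource provider
# managed by this host,
#   enabled=True  -> the COMPUTE_STATUS_DISABLED trait is removed
#   enabled=False -> the COMPUTE_STATUS_DISABLED trait is added
# so the scheduler pre-filter can skip disabled providers.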
7273 @wrap_exception()
7274 def set_host_enabled(self, context, enabled):
7275 """Sets the specified host's ability to accept new instances.
7277 This method will add or remove the COMPUTE_STATUS_DISABLED trait
7278 to/from the associated compute node resource provider(s) for this
7279 compute service.
7280 """
7281 try:
7282 self._update_compute_provider_status(context, enabled)
7283 except exception.ComputeHostNotFound:
7284 LOG.warning('Unable to add/remove trait COMPUTE_STATUS_DISABLED. '
7285 'No ComputeNode(s) found for host: %s', self.host)
7287 try:
7288 return self.driver.set_host_enabled(enabled)
7289 except NotImplementedError:
7290 # Only the xenapi driver implements set_host_enabled but we don't
7291 # want NotImplementedError to get raised back to the API. We still
7292 # need to honor the compute RPC API contract and return 'enabled'
7293 # or 'disabled' though.
7294 return 'enabled' if enabled else 'disabled'
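# Illustrative RPC contract for set_host_enabled (sketch with hypothetical
# arguments, not part of the original source): the caller always receives a
# string, e.g.
#   self.set_host_enabled(ctxt, True)   # -> 'enabled'
#   self.set_host_enabled(ctxt, False)  # -> 'disabled'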
7296 @wrap_exception()
7297 def get_host_uptime(self, context):
7298 """Returns the result of calling "uptime" on the target host."""
7299 return self.driver.get_host_uptime()
7301 @wrap_exception()
7302 @wrap_instance_fault
7303 def get_diagnostics(self, context, instance):
7304 """Retrieve diagnostics for an instance on this host."""
7305 current_power_state = self._get_power_state(instance)
7306 if current_power_state == power_state.RUNNING: 7306 ↛ 7310 line 7306 didn't jump to line 7310 because the condition on line 7306 was always true
7307 LOG.info("Retrieving diagnostics", instance=instance)
7308 return self.driver.get_diagnostics(instance)
7309 else:
7310 raise exception.InstanceInvalidState(
7311 attr='power state',
7312 instance_uuid=instance.uuid,
7313 state=power_state.STATE_MAP[instance.power_state],
7314 method='get_diagnostics')
7316 @wrap_exception()
7317 @wrap_instance_fault
7318 def get_instance_diagnostics(self, context, instance):
7319 """Retrieve diagnostics for an instance on this host."""
7320 current_power_state = self._get_power_state(instance)
7321 if current_power_state == power_state.RUNNING: 7321 ↛ 7325 line 7321 didn't jump to line 7325 because the condition on line 7321 was always true
7322 LOG.info("Retrieving diagnostics", instance=instance)
7323 return self.driver.get_instance_diagnostics(instance)
7324 else:
7325 raise exception.InstanceInvalidState(
7326 attr='power state',
7327 instance_uuid=instance.uuid,
7328 state=power_state.STATE_MAP[instance.power_state],
7329 method='get_instance_diagnostics')
7331 @wrap_exception()
7332 @reverts_task_state
7333 @wrap_instance_event(prefix='compute')
7334 @wrap_instance_fault
7335 def suspend_instance(self, context, instance):
7336 """Suspend the given instance."""
7337 context = context.elevated()
7339 # Store the old state
7340 instance.system_metadata['old_vm_state'] = instance.vm_state
7341 self._notify_about_instance_usage(context, instance, 'suspend.start')
7342 compute_utils.notify_about_instance_action(context, instance,
7343 self.host, action=fields.NotificationAction.SUSPEND,
7344 phase=fields.NotificationPhase.START)
7345 with self._error_out_instance_on_exception(context, instance,
7346 instance_state=instance.vm_state):
7347 self.driver.suspend(context, instance)
7348 instance.power_state = self._get_power_state(instance)
7349 instance.vm_state = vm_states.SUSPENDED
7350 instance.task_state = None
7351 instance.save(expected_task_state=task_states.SUSPENDING)
7352 self._notify_about_instance_usage(context, instance, 'suspend.end')
7353 compute_utils.notify_about_instance_action(context, instance,
7354 self.host, action=fields.NotificationAction.SUSPEND,
7355 phase=fields.NotificationPhase.END)
7357 @wrap_exception()
7358 @reverts_task_state
7359 @wrap_instance_event(prefix='compute')
7360 @wrap_instance_fault
7361 def resume_instance(self, context, instance):
7362 """Resume the given suspended instance."""
7363 context = context.elevated()
7364 LOG.info('Resuming', instance=instance)
7366 self._notify_about_instance_usage(context, instance, 'resume.start')
7368 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
7369 context, instance.uuid)
7370 block_device_info = self._get_instance_block_device_info(
7371 context, instance, bdms=bdms)
7373 # This allows passing share_info to the resume operation for
7374 # future use. However, this scenario is currently not possible
7375 # because suspending an instance with a share is not permitted
7376 # by libvirt. As a result, the suspend action involving a share
7377 # is blocked by the API.
7378 share_info = self._get_share_info(context, instance)
7380 compute_utils.notify_about_instance_action(context, instance,
7381 self.host, action=fields.NotificationAction.RESUME,
7382 phase=fields.NotificationPhase.START, bdms=bdms)
7384 network_info = self.network_api.get_instance_nw_info(context, instance)
7386 with self._error_out_instance_on_exception(context, instance,
7387 instance_state=instance.vm_state):
7388 self.driver.resume(context, instance, network_info,
7389 block_device_info, share_info)
7391 instance.power_state = self._get_power_state(instance)
7393 # We default to the ACTIVE state for backwards compatibility
7394 instance.vm_state = instance.system_metadata.pop('old_vm_state',
7395 vm_states.ACTIVE)
7397 instance.task_state = None
7398 instance.save(expected_task_state=task_states.RESUMING)
7399 self._notify_about_instance_usage(context, instance, 'resume.end')
7400 compute_utils.notify_about_instance_action(context, instance,
7401 self.host, action=fields.NotificationAction.RESUME,
7402 phase=fields.NotificationPhase.END, bdms=bdms)
7404 @wrap_exception()
7405 @reverts_task_state
7406 @wrap_instance_event(prefix='compute')
7407 @wrap_instance_fault
7408 def shelve_instance(self, context, instance, image_id,
7409 clean_shutdown, accel_uuids):
7410 """Shelve an instance.
7412 This should be used when you want to take a snapshot of the instance.
7413 It also adds system_metadata that can be used by a periodic task to
7414 offload the shelved instance after a period of time.
7416 :param context: request context
7417 :param instance: an Instance object
7418 :param image_id: an image id to snapshot to.
7419 :param clean_shutdown: give the GuestOS a chance to stop
7420 :param accel_uuids: the accelerators uuids for the instance
7421 """
7423 @utils.synchronized(instance.uuid)
7424 def do_shelve_instance():
7425 self._shelve_instance(context, instance, image_id, clean_shutdown,
7426 accel_uuids)
7427 do_shelve_instance()
7429 def _shelve_instance(self, context, instance, image_id,
7430 clean_shutdown, accel_uuids=None):
7431 LOG.info('Shelving', instance=instance)
7432 offload = CONF.shelved_offload_time == 0
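# Illustrative note (not part of the original source): shelved_offload_time
# of 0 means the instance is offloaded from the hypervisor immediately after
# the shelve snapshot completes (see the _shelve_offload_instance call at the
# end of this method); any other value defers offloading, which is not
# handled here.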
7433 if offload:
7434 # Get the BDMs early so we can pass them into versioned
7435 # notifications since _shelve_offload_instance needs the
7436 # BDMs anyway.
7437 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
7438 context, instance.uuid)
7439 else:
7440 bdms = None
7441 compute_utils.notify_usage_exists(self.notifier, context, instance,
7442 self.host, current_period=True)
7443 self._notify_about_instance_usage(context, instance, 'shelve.start')
7444 compute_utils.notify_about_instance_action(context, instance,
7445 self.host, action=fields.NotificationAction.SHELVE,
7446 phase=fields.NotificationPhase.START, bdms=bdms)
7448 def update_task_state(task_state, expected_state=task_states.SHELVING):
7449 shelving_state_map = {
7450 task_states.IMAGE_PENDING_UPLOAD:
7451 task_states.SHELVING_IMAGE_PENDING_UPLOAD,
7452 task_states.IMAGE_UPLOADING:
7453 task_states.SHELVING_IMAGE_UPLOADING,
7454 task_states.SHELVING: task_states.SHELVING}
7455 task_state = shelving_state_map[task_state]
7456 expected_state = shelving_state_map[expected_state]
7457 instance.task_state = task_state
7458 instance.save(expected_task_state=expected_state)
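# Worked example of the mapping applied by update_task_state above
# (illustrative, not part of the original source): when the driver reports
# IMAGE_PENDING_UPLOAD during the snapshot, the instance task_state is saved
# as SHELVING_IMAGE_PENDING_UPLOAD with SHELVING as the expected prior state.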
7459 # Do not attempt a clean shutdown of a paused guest since some
7460 # hypervisors will fail the clean shutdown if the guest is not
7461 # running.
7462 if instance.power_state == power_state.PAUSED:
7463 clean_shutdown = False
7464 self._power_off_instance(context, instance, clean_shutdown)
7465 self.driver.snapshot(context, instance, image_id, update_task_state)
7467 instance.system_metadata['shelved_at'] = timeutils.utcnow().isoformat()
7468 instance.system_metadata['shelved_image_id'] = image_id
7469 instance.system_metadata['shelved_host'] = self.host
7470 instance.vm_state = vm_states.SHELVED
7471 instance.task_state = None
7472 if offload:
7473 instance.task_state = task_states.SHELVING_OFFLOADING
7474 instance.power_state = self._get_power_state(instance)
7475 instance.save(expected_task_state=[
7476 task_states.SHELVING,
7477 task_states.SHELVING_IMAGE_UPLOADING])
7479 self._notify_about_instance_usage(context, instance, 'shelve.end')
7480 compute_utils.notify_about_instance_action(context, instance,
7481 self.host, action=fields.NotificationAction.SHELVE,
7482 phase=fields.NotificationPhase.END, bdms=bdms)
7484 if offload:
7485 self._shelve_offload_instance(
7486 context, instance, clean_shutdown=False, bdms=bdms,
7487 accel_uuids=accel_uuids)
7489 @wrap_exception()
7490 @reverts_task_state
7491 @wrap_instance_event(prefix='compute')
7492 @wrap_instance_fault
7493 def shelve_offload_instance(self, context, instance, clean_shutdown,
7494 accel_uuids):
7495 """Remove a shelved instance from the hypervisor.
7497 This frees up those resources for use by other instances, but may lead
7498 to slower unshelve times for this instance. This method is used by
7499 volume backed instances since restoring them doesn't involve the
7500 potentially large download of an image.
7502 :param context: request context
7503 :param instance: nova.objects.instance.Instance
7504 :param clean_shutdown: give the GuestOS a chance to stop
7505 :param accel_uuids: the accelerators uuids for the instance
7506 """
7508 @utils.synchronized(instance.uuid)
7509 def do_shelve_offload_instance():
7510 self._shelve_offload_instance(context, instance, clean_shutdown,
7511 accel_uuids=accel_uuids)
7512 do_shelve_offload_instance()
7514 def _shelve_offload_instance(self, context, instance, clean_shutdown,
7515 bdms=None, accel_uuids=None):
7516 LOG.info('Shelve offloading', instance=instance)
7517 if bdms is None:
7518 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
7519 context, instance.uuid)
7520 self._notify_about_instance_usage(context, instance,
7521 'shelve_offload.start')
7522 compute_utils.notify_about_instance_action(context, instance,
7523 self.host, action=fields.NotificationAction.SHELVE_OFFLOAD,
7524 phase=fields.NotificationPhase.START, bdms=bdms)
7526 self._power_off_instance(context, instance, clean_shutdown)
7527 current_power_state = self._get_power_state(instance)
7528 network_info = self.network_api.get_instance_nw_info(context, instance)
7530 ports_id = [vif['id'] for vif in network_info]
7531 self.network_api.unbind_ports(context, ports_id, detach=False)
7533 block_device_info = self._get_instance_block_device_info(context,
7534 instance,
7535 bdms=bdms)
7536 self.driver.destroy(context, instance, network_info,
7537 block_device_info)
7539 # the instance is going to be removed from the host so we want to
7540 # terminate all the connections with the volume server and the host
7541 self._terminate_volume_connections(context, instance, bdms)
7543 # NOTE(brinzhang): Free up the accelerator resource occupied
7544 # in the cyborg service.
7545 if accel_uuids:
7546 cyclient = cyborg.get_client(context)
7547 cyclient.delete_arqs_for_instance(instance.uuid)
7549 # Free up the resource allocations in the placement service.
7550 # This should happen *before* the vm_state is changed to
7551 # SHELVED_OFFLOADED in case client-side code is polling the API to
7552 # schedule more instances (or unshelve) once this server is offloaded.
7553 self.rt.delete_allocation_for_shelve_offloaded_instance(context,
7554 instance)
7556 instance.power_state = current_power_state
7557 # NOTE(mriedem): The vm_state has to be set before updating the
7558 # resource tracker, see vm_states.allow_resource_removal(). The
7559 # host/node values cannot be nulled out until after updating the
7560 # resource tracker though.
7561 instance.vm_state = vm_states.SHELVED_OFFLOADED
7562 instance.task_state = None
7563 instance.save(expected_task_state=[task_states.SHELVING,
7564 task_states.SHELVING_OFFLOADING])
7566 # NOTE(ndipanov): Free resources from the resource tracker
7567 self._update_resource_tracker(context, instance)
7569 # NOTE(sfinucan): RPC calls should no longer be attempted against this
7570 # instance, so ensure any calls result in errors
7571 self._nil_out_instance_obj_host_and_node(instance)
7572 instance.save(expected_task_state=None)
7574 # TODO(melwitt): We should clean up instance console tokens here. The
7575 # instance has no host at this point and will need to establish a new
7576 # console connection in the future after it is unshelved.
7577 self._delete_scheduler_instance_info(context, instance.uuid)
7578 self._notify_about_instance_usage(context, instance,
7579 'shelve_offload.end')
7580 compute_utils.notify_about_instance_action(context, instance,
7581 self.host, action=fields.NotificationAction.SHELVE_OFFLOAD,
7582 phase=fields.NotificationPhase.END, bdms=bdms)
7584 @wrap_exception()
7585 @reverts_task_state
7586 @wrap_instance_event(prefix='compute')
7587 @wrap_instance_fault
7588 def unshelve_instance(self, context, instance, image,
7589 filter_properties, node, request_spec, accel_uuids):
7590 """Unshelve the instance.
7592 :param context: request context
7593 :param instance: a nova.objects.instance.Instance object
7594 :param image: an image to build from. If None we assume a
7595 volume backed instance.
7596 :param filter_properties: dict containing limits, retry info etc.
7597 :param node: target compute node
7598 :param request_spec: the RequestSpec object used to schedule the
7599 instance
7600 :param accel_uuids: the accelerators uuids for the instance
7601 """
7602 if filter_properties is None: 7602 ↛ 7603 line 7602 didn't jump to line 7603 because the condition on line 7602 was never true
7603 filter_properties = {}
7605 @utils.synchronized(instance.uuid)
7606 def do_unshelve_instance():
7607 self._unshelve_instance(
7608 context, instance, image, filter_properties, node,
7609 request_spec, accel_uuids)
7610 do_unshelve_instance()
7612 def _unshelve_instance_key_scrub(self, instance):
7613 """Remove data from the instance that may cause side effects."""
7614 cleaned_keys = dict(
7615 key_data=instance.key_data,
7616 auto_disk_config=instance.auto_disk_config)
7617 instance.key_data = None
7618 instance.auto_disk_config = False
7619 return cleaned_keys
7621 def _unshelve_instance_key_restore(self, instance, keys):
7622 """Restore previously scrubbed keys before saving the instance."""
7623 instance.update(keys)
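# Illustrative round trip for the two helpers above (sketch with hypothetical
# values, not part of the original source):
#   keys = self._unshelve_instance_key_scrub(instance)
#   # instance.key_data is now None and instance.auto_disk_config is False
#   ... spawn the guest ...
#   self._unshelve_instance_key_restore(instance, keys)
#   # the original key_data / auto_disk_config values are back on the object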
7625 def _unshelve_instance(self, context, instance, image, filter_properties,
7626 node, request_spec, accel_uuids):
7627 LOG.info('Unshelving', instance=instance)
7628 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
7629 context, instance.uuid)
7631 self._notify_about_instance_usage(context, instance, 'unshelve.start')
7632 compute_utils.notify_about_instance_action(context, instance,
7633 self.host, action=fields.NotificationAction.UNSHELVE,
7634 phase=fields.NotificationPhase.START, bdms=bdms)
7636 instance.task_state = task_states.SPAWNING
7637 instance.save()
7639 block_device_info = self._prep_block_device(context, instance, bdms)
7640 scrubbed_keys = self._unshelve_instance_key_scrub(instance)
7642 if node is None: 7642 ↛ 7643 line 7642 didn't jump to line 7643 because the condition on line 7642 was never true
7643 node = self._get_nodename(instance)
7645 limits = filter_properties.get('limits', {})
7647 allocations = self.reportclient.get_allocations_for_consumer(
7648 context, instance.uuid)
7650 shelved_image_ref = instance.image_ref
7651 if image:
7652 instance.image_ref = image['id']
7653 image_meta = objects.ImageMeta.from_dict(image)
7654 else:
7655 image_meta = objects.ImageMeta.from_dict(
7656 utils.get_image_from_system_metadata(
7657 instance.system_metadata))
7659 provider_mappings = self._get_request_group_mapping(request_spec)
7661 try:
7662 if provider_mappings:
7663 compute_utils.update_pci_request_with_placement_allocations(
7664 context,
7665 self.reportclient,
7666 instance.pci_requests.requests,
7667 provider_mappings,
7668 )
7670 accel_info = []
7671 if accel_uuids:
7672 try:
7673 accel_info = self._get_bound_arq_resources(
7674 context, instance, accel_uuids)
7675 except (Exception, eventlet.timeout.Timeout) as exc:
7676 LOG.exception('Failure getting accelerator requests '
7677 'with the exception: %s', exc,
7678 instance=instance)
7679 self._build_resources_cleanup(instance, None)
7680 raise
7682 with self.rt.instance_claim(context, instance, node, allocations,
7683 limits):
7684 self.network_api.setup_instance_network_on_host(
7685 context, instance, self.host,
7686 provider_mappings=provider_mappings)
7687 network_info = self.network_api.get_instance_nw_info(
7688 context, instance)
7690 self.driver.spawn(context, instance, image_meta,
7691 injected_files=[],
7692 admin_password=None,
7693 allocations=allocations,
7694 network_info=network_info,
7695 block_device_info=block_device_info,
7696 accel_info=accel_info)
7697 except Exception:
7698 with excutils.save_and_reraise_exception(logger=LOG):
7699 LOG.exception('Instance failed to spawn',
7700 instance=instance)
7701 # Set the image_ref back to initial image_ref because instance
7702 # object might have been saved with image['id']
7703 # https://bugs.launchpad.net/nova/+bug/1934094
7704 instance.image_ref = shelved_image_ref
7705 # Cleanup allocations created by the scheduler on this host
7706 # since we failed to spawn the instance. We do this both if
7707 # the instance claim failed with ComputeResourcesUnavailable
7708 # or if we did claim but the spawn failed, because aborting the
7709 # instance claim will not remove the allocations.
7710 self.reportclient.delete_allocation_for_instance(
7711 context, instance.uuid, force=True)
7712 # FIXME: Umm, shouldn't we be rolling back port bindings too?
7713 self._terminate_volume_connections(context, instance, bdms)
7714 # The reverts_task_state decorator on unshelve_instance will
7715 # eventually save these updates.
7716 self._nil_out_instance_obj_host_and_node(instance)
7718 if image:
7719 instance.image_ref = shelved_image_ref
7720 self._delete_snapshot_of_shelved_instance(context, instance,
7721 image['id'])
7723 self._unshelve_instance_key_restore(instance, scrubbed_keys)
7724 self._update_instance_after_spawn(instance)
7725 # Delete system_metadata for a shelved instance
7726 compute_utils.remove_shelved_keys_from_system_metadata(instance)
7728 instance.save(expected_task_state=task_states.SPAWNING)
7729 self._update_scheduler_instance_info(context, instance)
7730 self._notify_about_instance_usage(context, instance, 'unshelve.end')
7731 compute_utils.notify_about_instance_action(context, instance,
7732 self.host, action=fields.NotificationAction.UNSHELVE,
7733 phase=fields.NotificationPhase.END, bdms=bdms)
7735 def _inject_network_info(self, instance, network_info):
7736 """Inject network info for the given instance."""
7737 LOG.debug('Inject network info', instance=instance)
7738 LOG.debug('network_info to inject: |%s|', network_info,
7739 instance=instance)
7741 self.driver.inject_network_info(instance, network_info)
7743 @wrap_instance_fault
7744 def inject_network_info(self, context, instance):
7745 """Inject network info, but don't return the info."""
7746 network_info = self.network_api.get_instance_nw_info(context, instance)
7747 self._inject_network_info(instance, network_info)
7749 @messaging.expected_exceptions(NotImplementedError,
7750 exception.ConsoleNotAvailable,
7751 exception.InstanceNotFound)
7752 @wrap_exception()
7753 @wrap_instance_fault
7754 def get_console_output(self, context, instance, tail_length):
7755 """Send the console output for the given instance."""
7756 context = context.elevated()
7757 LOG.info("Get console output", instance=instance)
7758 output = self.driver.get_console_output(context, instance)
7760 if type(output) is str: 7760 ↛ 7763 line 7760 didn't jump to line 7763 because the condition on line 7760 was always true
7761 output = output.encode("latin-1")
7763 if tail_length is not None:
7764 output = self._tail_log(output, tail_length)
7766 return output.decode('ascii', 'replace')
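# Illustrative sketch of the encode/decode handling above (hypothetical
# console text, not part of the original source): non-ASCII bytes survive the
# latin-1 encode but are substituted on the final ascii decode, e.g.
#   'h\xe9llo'.encode('latin-1').decode('ascii', 'replace')  # -> 'h\ufffdllo'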
7768 def _tail_log(self, log, length):
7769 try:
7770 length = int(length)
7771 except ValueError:
7772 length = 0
7774 if length == 0: 7774 ↛ 7775 line 7774 didn't jump to line 7775 because the condition on line 7774 was never true
7775 return b''
7776 else:
7777 return b'\n'.join(log.split(b'\n')[-int(length):])
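# Illustrative behaviour of _tail_log above (hypothetical data, not part of
# the original source): only the last `length` newline-separated entries are
# kept, and a non-numeric length falls back to an empty result, e.g.
#   self._tail_log(b'one\ntwo\nthree\nfour', 2)  # -> b'three\nfour'
#   self._tail_log(b'one\ntwo', 'garbage')       # -> b''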
7779 @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
7780 exception.InstanceNotReady,
7781 exception.InstanceNotFound,
7782 exception.ConsoleTypeUnavailable,
7783 NotImplementedError)
7784 @wrap_exception()
7785 @wrap_instance_fault
7786 def get_vnc_console(self, context, console_type, instance):
7787 """Return connection information for a vnc console."""
7788 context = context.elevated()
7789 LOG.debug("Getting vnc console", instance=instance)
7791 if not CONF.vnc.enabled:
7792 raise exception.ConsoleTypeUnavailable(console_type=console_type)
7794 if console_type == 'novnc':
7795 # For essex, novncproxy_base_url must include the full path
7796 # including the html file (like http://myhost/vnc_auto.html)
7797 access_url_base = CONF.vnc.novncproxy_base_url
7798 else:
7799 raise exception.ConsoleTypeInvalid(console_type=console_type)
7801 try:
7802 # Retrieve connect info from driver, and then decorate with our
7803 # access info token
7804 console = self.driver.get_vnc_console(context, instance)
7805 console_auth = objects.ConsoleAuthToken(
7806 context=context,
7807 console_type=console_type,
7808 host=console.host,
7809 port=console.port,
7810 internal_access_path=console.internal_access_path,
7811 instance_uuid=instance.uuid,
7812 access_url_base=access_url_base,
7813 )
7814 console_auth.authorize(CONF.consoleauth.token_ttl)
7815 connect_info = console.get_connection_info(
7816 console_auth.token, console_auth.access_url)
7818 except exception.InstanceNotFound:
7819 if instance.vm_state != vm_states.BUILDING: 7819 ↛ 7820 line 7819 didn't jump to line 7820 because the condition on line 7819 was never true
7820 raise
7821 raise exception.InstanceNotReady(instance_id=instance.uuid)
7823 return connect_info
7825 @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
7826 exception.InstanceNotReady,
7827 exception.InstanceNotFound,
7828 exception.ConsoleTypeUnavailable,
7829 NotImplementedError)
7830 @wrap_exception()
7831 @wrap_instance_fault
7832 def get_spice_console(self, context, console_type, instance):
7833 """Return connection information for a spice console."""
7834 context = context.elevated()
7835 LOG.debug("Getting spice console", instance=instance)
7837 if not CONF.spice.enabled:
7838 raise exception.ConsoleTypeUnavailable(console_type=console_type)
7840 if console_type not in ['spice-html5', 'spice-direct']:
7841 raise exception.ConsoleTypeInvalid(console_type=console_type)
7843 try:
7844 # Retrieve connect info from driver, and then decorate with our
7845 # access info token
7846 console = self.driver.get_spice_console(context, instance)
7847 fields = {
7848 'context': context,
7849 'console_type': console_type,
7850 'host': console.host,
7851 'port': console.port,
7852 'tls_port': console.tlsPort,
7853 'instance_uuid': instance.uuid
7854 }
7855 if console_type == 'spice-html5': 7855 ↛ 7858 line 7855 didn't jump to line 7858 because the condition on line 7855 was always true
7856 fields['internal_access_path'] = console.internal_access_path
7857 fields['access_url_base'] = CONF.spice.html5proxy_base_url
7858 if console_type == 'spice-direct': 7858 ↛ 7859 line 7858 didn't jump to line 7859 because the condition on line 7858 was never true
7859 fields['internal_access_path'] = None
7860 fields['access_url_base'] = \
7861 CONF.spice.spice_direct_proxy_base_url
7863 console_auth = objects.ConsoleAuthToken(**fields)
7864 console_auth.authorize(CONF.consoleauth.token_ttl)
7865 connect_info = console.get_connection_info(
7866 console_auth.token, console_auth.access_url)
7868 except exception.InstanceNotFound:
7869 if instance.vm_state != vm_states.BUILDING: 7869 ↛ 7870 line 7869 didn't jump to line 7870 because the condition on line 7869 was never true
7870 raise
7871 raise exception.InstanceNotReady(instance_id=instance.uuid)
7873 return connect_info
7875 # TODO(gmann): HyperV virt driver has been removed in Nova 29.0.0
7876 # but we need to keep this method to avoid an RPC error when an old
7877 # controller is used with a new compute. This can be removed in RPC API 7.0
7878 @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
7879 exception.InstanceNotReady,
7880 exception.InstanceNotFound,
7881 exception.ConsoleTypeUnavailable,
7882 NotImplementedError)
7883 @wrap_exception()
7884 @wrap_instance_fault
7885 def get_rdp_console(self, context, console_type, instance):
7886 """Return connection information for a RDP console."""
7888 msg = ("RDP console is only applicable to the HyperV virt driver, "
7889 "which has been removed in Nova 29.0.0")
7890 raise NotImplementedError(msg)
7892 @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
7893 exception.InstanceNotReady,
7894 exception.InstanceNotFound,
7895 exception.ConsoleTypeUnavailable,
7896 NotImplementedError)
7897 @wrap_exception()
7898 @wrap_instance_fault
7899 def get_mks_console(self, context, console_type, instance):
7900 """Return connection information for a MKS console."""
7901 context = context.elevated()
7902 LOG.debug("Getting MKS console", instance=instance)
7904 if not CONF.mks.enabled: 7904 ↛ 7905 line 7904 didn't jump to line 7905 because the condition on line 7904 was never true
7905 raise exception.ConsoleTypeUnavailable(console_type=console_type)
7907 if console_type != 'webmks': 7907 ↛ 7908 line 7907 didn't jump to line 7908 because the condition on line 7907 was never true
7908 raise exception.ConsoleTypeInvalid(console_type=console_type)
7910 try:
7911 # Retrieve connect info from driver, and then decorate with our
7912 # access info token
7913 console = self.driver.get_mks_console(context, instance)
7914 console_auth = objects.ConsoleAuthToken(
7915 context=context,
7916 console_type=console_type,
7917 host=console.host,
7918 port=console.port,
7919 internal_access_path=console.internal_access_path,
7920 instance_uuid=instance.uuid,
7921 access_url_base=CONF.mks.mksproxy_base_url,
7922 )
7923 console_auth.authorize(CONF.consoleauth.token_ttl)
7924 connect_info = console.get_connection_info(
7925 console_auth.token, console_auth.access_url)
7927 except exception.InstanceNotFound:
7928 if instance.vm_state != vm_states.BUILDING:
7929 raise
7930 raise exception.InstanceNotReady(instance_id=instance.uuid)
7932 return connect_info
7934 @messaging.expected_exceptions(
7935 exception.ConsoleTypeInvalid,
7936 exception.InstanceNotReady,
7937 exception.InstanceNotFound,
7938 exception.ConsoleTypeUnavailable,
7939 exception.SocketPortRangeExhaustedException,
7940 exception.ImageSerialPortNumberInvalid,
7941 exception.ImageSerialPortNumberExceedFlavorValue,
7942 NotImplementedError)
7943 @wrap_exception()
7944 @wrap_instance_fault
7945 def get_serial_console(self, context, console_type, instance):
7946 """Returns connection information for a serial console."""
7948 LOG.debug("Getting serial console", instance=instance)
7950 if not CONF.serial_console.enabled: 7950 ↛ 7951 line 7950 didn't jump to line 7951 because the condition on line 7950 was never true
7951 raise exception.ConsoleTypeUnavailable(console_type=console_type)
7953 context = context.elevated()
7955 try:
7956 # Retrieve connect info from driver, and then decorate with our
7957 # access info token
7958 console = self.driver.get_serial_console(context, instance)
7959 console_auth = objects.ConsoleAuthToken(
7960 context=context,
7961 console_type=console_type,
7962 host=console.host,
7963 port=console.port,
7964 internal_access_path=console.internal_access_path,
7965 instance_uuid=instance.uuid,
7966 access_url_base=CONF.serial_console.base_url,
7967 )
7968 console_auth.authorize(CONF.consoleauth.token_ttl)
7969 connect_info = console.get_connection_info(
7970 console_auth.token, console_auth.access_url)
7972 except exception.InstanceNotFound:
7973 if instance.vm_state != vm_states.BUILDING:
7974 raise
7975 raise exception.InstanceNotReady(instance_id=instance.uuid)
7977 return connect_info
7979 @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
7980 exception.InstanceNotReady,
7981 exception.InstanceNotFound)
7982 @wrap_exception()
7983 @wrap_instance_fault
7984 def validate_console_port(self, ctxt, instance, port, console_type):
7985 if console_type in ["spice-html5", "spice-direct"]:
7986 console_info = self.driver.get_spice_console(ctxt, instance)
7987 elif console_type == "serial":
7988 console_info = self.driver.get_serial_console(ctxt, instance)
7989 elif console_type == "webmks":
7990 console_info = self.driver.get_mks_console(ctxt, instance)
7991 else:
7992 console_info = self.driver.get_vnc_console(ctxt, instance)
7994 # Some drivers may return an int on console_info.port but the port
7995 # variable in this method is a string, so cast to be sure we are
7996 # comparing the correct types.
7997 return str(console_info.port) == port
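# Illustrative sketch (hypothetical values, not part of the original source):
# a driver may report the port as an int while the API passes it as a string
# over RPC, so the comparison above normalises both sides, e.g.
#   str(5900) == "5900"   # -> True, the console port validates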
7999 @wrap_exception()
8000 @reverts_task_state
8001 @wrap_instance_fault
8002 def reserve_block_device_name(self, context, instance, device,
8003 volume_id, disk_bus, device_type, tag,
8004 multiattach):
8005 if (tag and not
8006 self.driver.capabilities.get('supports_tagged_attach_volume',
8007 False)):
8008 raise exception.VolumeTaggedAttachNotSupported()
8010 if (multiattach and not
8011 self.driver.capabilities.get('supports_multiattach', False)):
8012 raise exception.MultiattachNotSupportedByVirtDriver(
8013 volume_id=volume_id)
8015 @utils.synchronized(instance.uuid)
8016 def do_reserve():
8017 bdms = (
8018 objects.BlockDeviceMappingList.get_by_instance_uuid(
8019 context, instance.uuid))
8021 # Now that we have the lock check that we haven't raced another
8022 # request and ensure there is no existing attachment
8023 if any(b for b in bdms if b.volume_id == volume_id):
8024 msg = _("volume %s already attached") % volume_id
8025 raise exception.InvalidVolume(reason=msg)
8027 # NOTE(ndipanov): We need to explicitly set all the fields on the
8028 # object so that obj_load_attr does not fail
8029 new_bdm = objects.BlockDeviceMapping(
8030 context=context,
8031 source_type='volume', destination_type='volume',
8032 instance_uuid=instance.uuid, boot_index=None,
8033 volume_id=volume_id,
8034 device_name=device, guest_format=None,
8035 disk_bus=disk_bus, device_type=device_type, tag=tag)
8037 new_bdm.device_name = self._get_device_name_for_instance(
8038 instance, bdms, new_bdm)
8040 # NOTE(vish): create bdm here to avoid race condition
8041 new_bdm.create()
8042 return new_bdm
8044 return do_reserve()
8046 @wrap_exception()
8047 @wrap_instance_event(prefix='compute')
8048 @wrap_instance_fault
8049 def attach_volume(self, context, instance, bdm):
8050 """Attach a volume to an instance."""
8051 driver_bdm = driver_block_device.convert_volume(bdm)
8053 @utils.synchronized(instance.uuid)
8054 def do_attach_volume(context, instance, driver_bdm):
8055 try:
8056 return self._attach_volume(context, instance, driver_bdm)
8057 except Exception:
8058 with excutils.save_and_reraise_exception():
8059 bdm.destroy()
8061 do_attach_volume(context, instance, driver_bdm)
8063 def _attach_volume(self, context, instance, bdm):
8064 context = context.elevated()
8065 LOG.info('Attaching volume %(volume_id)s to %(mountpoint)s',
8066 {'volume_id': bdm.volume_id,
8067 'mountpoint': bdm['mount_device']},
8068 instance=instance)
8069 compute_utils.notify_about_volume_attach_detach(
8070 context, instance, self.host,
8071 action=fields.NotificationAction.VOLUME_ATTACH,
8072 phase=fields.NotificationPhase.START,
8073 volume_id=bdm.volume_id)
8074 try:
8075 bdm.attach(context, instance, self.volume_api, self.driver,
8076 do_driver_attach=True)
8077 except Exception as e:
8078 with excutils.save_and_reraise_exception():
8079 LOG.exception("Failed to attach %(volume_id)s "
8080 "at %(mountpoint)s",
8081 {'volume_id': bdm.volume_id,
8082 'mountpoint': bdm['mount_device']},
8083 instance=instance)
8084 if bdm['attachment_id']:
8085 # Try to delete the attachment to make the volume
8086 # available again. Note that DriverVolumeBlockDevice
8087 # may have already deleted the attachment so ignore
8088 # VolumeAttachmentNotFound.
8089 try:
8090 self.volume_api.attachment_delete(
8091 context, bdm['attachment_id'])
8092 except exception.VolumeAttachmentNotFound as exc:
8093 LOG.debug('Ignoring VolumeAttachmentNotFound: %s',
8094 exc, instance=instance)
8095 else:
8096 self.volume_api.unreserve_volume(context, bdm.volume_id)
8097 compute_utils.notify_about_volume_attach_detach(
8098 context, instance, self.host,
8099 action=fields.NotificationAction.VOLUME_ATTACH,
8100 phase=fields.NotificationPhase.ERROR,
8101 exception=e,
8102 volume_id=bdm.volume_id)
8104 info = {'volume_id': bdm.volume_id}
8105 self._notify_about_instance_usage(
8106 context, instance, "volume.attach", extra_usage_info=info)
8107 compute_utils.notify_about_volume_attach_detach(
8108 context, instance, self.host,
8109 action=fields.NotificationAction.VOLUME_ATTACH,
8110 phase=fields.NotificationPhase.END,
8111 volume_id=bdm.volume_id)
8113 def _notify_volume_usage_detach(self, context, instance, bdm):
8114 if CONF.volume_usage_poll_interval <= 0:
8115 return
8117 mp = bdm.device_name
8118 # Handle bootable volumes which will not contain /dev/
8119 if '/dev/' in mp: 8119 ↛ 8121 line 8119 didn't jump to line 8121 because the condition on line 8119 was always true
8120 mp = mp[5:]
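# e.g. a device name of '/dev/vdb' becomes 'vdb' here (illustrative example,
# not part of the original source)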
8121 try:
8122 vol_stats = self.driver.block_stats(instance, mp)
8123 if vol_stats is None:
8124 return
8125 except NotImplementedError:
8126 return
8128 LOG.debug("Updating volume usage cache with totals", instance=instance)
8129 rd_req, rd_bytes, wr_req, wr_bytes, flush_ops = vol_stats
8130 vol_usage = objects.VolumeUsage(context)
8131 vol_usage.volume_id = bdm.volume_id
8132 vol_usage.instance_uuid = instance.uuid
8133 vol_usage.project_id = instance.project_id
8134 vol_usage.user_id = instance.user_id
8135 vol_usage.availability_zone = instance.availability_zone
8136 vol_usage.curr_reads = rd_req
8137 vol_usage.curr_read_bytes = rd_bytes
8138 vol_usage.curr_writes = wr_req
8139 vol_usage.curr_write_bytes = wr_bytes
8140 vol_usage.save(update_totals=True)
8141 self.notifier.info(context, 'volume.usage', vol_usage.to_dict())
8142 compute_utils.notify_about_volume_usage(context, vol_usage, self.host)
8144 def _detach_volume(self, context, bdm, instance, destroy_bdm=True,
8145 attachment_id=None):
8146 """Detach a volume from an instance.
8148 :param context: security context
8149 :param bdm: nova.objects.BlockDeviceMapping volume bdm to detach
8150 :param instance: the Instance object to detach the volume from
8151 :param destroy_bdm: if True, the corresponding BDM entry will be marked
8152 as deleted. Disabling this is useful for operations
8153 like rebuild, when we don't want to destroy BDM
8154 :param attachment_id: The volume attachment_id for the given instance
8155 and volume.
8156 """
8157 volume_id = bdm.volume_id
8158 compute_utils.notify_about_volume_attach_detach(
8159 context, instance, self.host,
8160 action=fields.NotificationAction.VOLUME_DETACH,
8161 phase=fields.NotificationPhase.START,
8162 volume_id=volume_id)
8164 self._notify_volume_usage_detach(context, instance, bdm)
8166 LOG.info('Detaching volume %(volume_id)s',
8167 {'volume_id': volume_id}, instance=instance)
8169 driver_bdm = driver_block_device.convert_volume(bdm)
8170 driver_bdm.detach(context, instance, self.volume_api, self.driver,
8171 attachment_id=attachment_id, destroy_bdm=destroy_bdm)
8173 info = dict(volume_id=volume_id)
8174 self._notify_about_instance_usage(
8175 context, instance, "volume.detach", extra_usage_info=info)
8176 compute_utils.notify_about_volume_attach_detach(
8177 context, instance, self.host,
8178 action=fields.NotificationAction.VOLUME_DETACH,
8179 phase=fields.NotificationPhase.END,
8180 volume_id=volume_id)
8182 if 'tag' in bdm and bdm.tag:
8183 self._delete_disk_metadata(instance, bdm)
8184 if destroy_bdm:
8185 bdm.destroy()
8187 def _delete_disk_metadata(self, instance, bdm):
8188 for device in instance.device_metadata.devices:
8189 if isinstance(device, objects.DiskMetadata): 8189 ↛ 8188 line 8189 didn't jump to line 8188 because the condition on line 8189 was always true
8190 if 'serial' in device:
8191 if device.serial == bdm.volume_id: 8191 ↛ 8188 line 8191 didn't jump to line 8188 because the condition on line 8191 was always true
8192 instance.device_metadata.devices.remove(device)
8193 instance.save()
8194 break
8195 else:
8196 # NOTE(artom) We log the entire device object because all
8197 # fields are nullable and may not be set
8198 LOG.warning('Unable to determine whether to clean up '
8199 'device metadata for disk %s', device,
8200 instance=instance)
8202 @wrap_exception()
8203 @wrap_instance_event(prefix='compute')
8204 @wrap_instance_fault
8205 def detach_volume(self, context, volume_id, instance, attachment_id):
8206 """Detach a volume from an instance.
8208 :param context: security context
8209 :param volume_id: the volume id
8210 :param instance: the Instance object to detach the volume from
8211 :param attachment_id: The volume attachment_id for the given instance
8212 and volume.
8214 """
8215 @utils.synchronized(instance.uuid)
8216 def do_detach_volume(context, volume_id, instance, attachment_id):
8217 bdm = objects.BlockDeviceMapping.get_by_volume_and_instance(
8218 context, volume_id, instance.uuid)
8219 self._detach_volume(context, bdm, instance,
8220 attachment_id=attachment_id)
8222 do_detach_volume(context, volume_id, instance, attachment_id)
8224 def _init_volume_connection(self, context, new_volume,
8225 old_volume_id, connector, bdm,
8226 new_attachment_id, mountpoint):
8227 new_volume_id = new_volume['id']
8228 if new_attachment_id is None:
8229 # We're dealing with an old-style attachment so initialize the
8230 # connection so we can get the connection_info.
8231 new_cinfo = self.volume_api.initialize_connection(context,
8232 new_volume_id,
8233 connector)
8234 else:
8235 # Check for multiattach on the new volume and if True, check to
8236 # see if the virt driver supports multiattach.
8237 # TODO(mriedem): This is copied from DriverVolumeBlockDevice
8238 # and should be consolidated into some common code at some point.
8239 vol_multiattach = new_volume.get('multiattach', False)
8240 virt_multiattach = self.driver.capabilities.get(
8241 'supports_multiattach', False)
8242 if vol_multiattach and not virt_multiattach:
8243 raise exception.MultiattachNotSupportedByVirtDriver(
8244 volume_id=new_volume_id)
8246 # This is a new style attachment and the API created the new
8247 # volume attachment and passed the id to the compute over RPC.
8248 # At this point we need to update the new volume attachment with
8249 # the host connector, which will give us back the new attachment
8250 # connection_info.
8251 new_cinfo = self.volume_api.attachment_update(
8252 context, new_attachment_id, connector,
8253 mountpoint)['connection_info']
8255 if vol_multiattach:
8256 # This will be used by the volume driver to determine the
8257 # proper disk configuration.
8258 new_cinfo['multiattach'] = True
8260 old_cinfo = jsonutils.loads(bdm['connection_info'])
8261 if old_cinfo and 'serial' not in old_cinfo: 8261 ↛ 8266 line 8261 didn't jump to line 8266 because the condition on line 8261 was always true
8262 old_cinfo['serial'] = old_volume_id
8263 # NOTE(lyarwood): serial is not always present in the returned
8264 # connection_info so set it if it is missing as we do in
8265 # DriverVolumeBlockDevice.attach().
8266 if 'serial' not in new_cinfo: 8266 ↛ 8268 line 8266 didn't jump to line 8268 because the condition on line 8266 was always true
8267 new_cinfo['serial'] = new_volume_id
8268 return (old_cinfo, new_cinfo)
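# Decision summary for _init_volume_connection above (illustrative note, not
# part of the original source):
#   new_attachment_id is None -> legacy flow: initialize_connection() is
#     called with this host's connector to obtain new_cinfo.
#   new_attachment_id is set  -> new-style (v3.44) flow: attachment_update()
#     is called with the connector and mountpoint, after checking that the
#     virt driver supports multiattach when the new volume is multiattach.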
8270 def _swap_volume(self, context, instance, bdm, connector,
8271 old_volume_id, new_volume, resize_to,
8272 new_attachment_id, is_cinder_migration):
8273 new_volume_id = new_volume['id']
8274 mountpoint = bdm['device_name']
8275 failed = False
8276 new_cinfo = None
8277 try:
8278 old_cinfo, new_cinfo = self._init_volume_connection(
8279 context, new_volume, old_volume_id, connector,
8280 bdm, new_attachment_id, mountpoint)
8281 # NOTE(lyarwood): The Libvirt driver, the only virt driver
8282 # currently implementing swap_volume, will modify the contents of
8283 # new_cinfo when connect_volume is called. This is then saved to
8284 # the BDM in swap_volume for future use outside of this flow.
8285 msg = ("swap_volume: Calling driver volume swap with "
8286 "connection infos: new: %(new_cinfo)s; "
8287 "old: %(old_cinfo)s" %
8288 {'new_cinfo': new_cinfo, 'old_cinfo': old_cinfo})
8289 # Both new and old info might contain password
8290 LOG.debug(strutils.mask_password(msg), instance=instance)
8292 self.driver.swap_volume(context, old_cinfo, new_cinfo, instance,
8293 mountpoint, resize_to)
8294 if new_attachment_id:
8295 self.volume_api.attachment_complete(context, new_attachment_id)
8296 msg = ("swap_volume: Driver volume swap returned, new "
8297 "connection_info is now : %(new_cinfo)s" %
8298 {'new_cinfo': new_cinfo})
8299 LOG.debug(strutils.mask_password(msg))
8300 except Exception as ex:
8301 failed = True
8302 with excutils.save_and_reraise_exception():
8303 compute_utils.notify_about_volume_swap(
8304 context, instance, self.host,
8305 fields.NotificationPhase.ERROR,
8306 old_volume_id, new_volume_id, ex)
8307 if new_cinfo:
8308 msg = ("Failed to swap volume %(old_volume_id)s "
8309 "for %(new_volume_id)s")
8310 LOG.exception(msg, {'old_volume_id': old_volume_id,
8311 'new_volume_id': new_volume_id},
8312 instance=instance)
8313 else:
8314 msg = ("Failed to connect to volume %(volume_id)s "
8315 "with volume at %(mountpoint)s")
8316 LOG.exception(msg, {'volume_id': new_volume_id,
8317 'mountpoint': bdm['device_name']},
8318 instance=instance)
8320 # The API marked the volume as 'detaching' for the old volume
8321 # so we need to roll that back so the volume goes back to
8322 # 'in-use' state.
8323 self.volume_api.roll_detaching(context, old_volume_id)
8325 if new_attachment_id is None:
8326 # The API reserved the new volume so it would be in
8327 # 'attaching' status, so we need to unreserve it so it
8328 # goes back to 'available' status.
8329 self.volume_api.unreserve_volume(context, new_volume_id)
8330 else:
8331 # This is a new style attachment for the new volume, which
8332 # was created in the API. We just need to delete it here
8333 # to put the new volume back into 'available' status.
8334 self.volume_api.attachment_delete(
8335 context, new_attachment_id)
8336 finally:
8337 # TODO(mriedem): This finally block is terribly confusing and is
8338 # trying to do too much. We should consider removing the finally
8339 # block and move whatever needs to happen on success and failure
8340 # into the blocks above for clarity, even if it means a bit of
8341 # redundant code.
8342 conn_volume = new_volume_id if failed else old_volume_id
8343 if new_cinfo:
8344 LOG.debug("swap_volume: removing Cinder connection "
8345 "for volume %(volume)s", {'volume': conn_volume},
8346 instance=instance)
8347 if bdm.attachment_id is None:
8348 # This is the pre-3.44 flow (i.e. no new-style volume
8349 # attachment record) so just terminate the connection.
8350 self.volume_api.terminate_connection(context,
8351 conn_volume,
8352 connector)
8353 else:
8354 # This is a new style volume attachment. If we failed, then
8355 # the new attachment was already deleted above in the
8356 # exception block and we have nothing more to do here. If
8357 # swap_volume was successful in the driver, then we need to
8358 # "detach" the original attachment by deleting it.
8359 if not failed:
8360 self.volume_api.attachment_delete(
8361 context, bdm.attachment_id)
8363 # Need to make some decisions based on whether this was
8364 # a Cinder initiated migration or not. The callback to
8365 # migration completion isn't needed in the case of a
8366 # nova initiated simple swap of two volumes via the
8367 # "volume-update" call, so skip that. The new attachment
8368 # scenarios will give us a new attachment record and
8369 # that's what we want.
8370 if bdm.attachment_id and not is_cinder_migration:
8371 # we don't callback to cinder
8372 comp_ret = {'save_volume_id': new_volume_id}
8373 else:
8374 # NOTE(lyarwood): The following call to
8375 # os-migrate-volume-completion returns a dict containing
8376 # save_volume_id, this volume id has two possible values :
8377 # 1. old_volume_id if we are migrating (retyping) volumes
8378 # 2. new_volume_id if we are swapping between two existing
8379 # volumes
8380 # This volume id is later used to update the volume_id and
8381 # connection_info['serial'] of the BDM.
8382 comp_ret = self.volume_api.migrate_volume_completion(
8383 context,
8384 old_volume_id,
8385 new_volume_id,
8386 error=failed)
8387 LOG.debug("swap_volume: Cinder migrate_volume_completion "
8388 "returned: %(comp_ret)s", {'comp_ret': comp_ret},
8389 instance=instance)
8391 return (comp_ret, new_cinfo)
8393 @wrap_exception()
8394 @wrap_instance_event(prefix='compute')
8395 @wrap_instance_fault
8396 def swap_volume(self, context, old_volume_id, new_volume_id, instance,
8397 new_attachment_id):
8398 """Replace the old volume with the new volume within the active server
8400 :param context: User request context
8401 :param old_volume_id: Original volume id
8402 :param new_volume_id: New volume id being swapped to
8403 :param instance: Instance with original_volume_id attached
8404 :param new_attachment_id: ID of the new attachment for new_volume_id
8405 """
8406 @utils.synchronized(instance.uuid)
8407 def _do_locked_swap_volume(context, old_volume_id, new_volume_id,
8408 instance, new_attachment_id):
8409 self._do_swap_volume(context, old_volume_id, new_volume_id,
8410 instance, new_attachment_id)
8411 _do_locked_swap_volume(context, old_volume_id, new_volume_id, instance,
8412 new_attachment_id)
8414 def _do_swap_volume(self, context, old_volume_id, new_volume_id,
8415 instance, new_attachment_id):
8416 """Replace the old volume with the new volume within the active server
8418 :param context: User request context
8419 :param old_volume_id: Original volume id
8420 :param new_volume_id: New volume id being swapped to
8421 :param instance: Instance with original_volume_id attached
8422 :param new_attachment_id: ID of the new attachment for new_volume_id
8423 """
8424 context = context.elevated()
8425 compute_utils.notify_about_volume_swap(
8426 context, instance, self.host,
8427 fields.NotificationPhase.START,
8428 old_volume_id, new_volume_id)
8430 bdm = objects.BlockDeviceMapping.get_by_volume_and_instance(
8431 context, old_volume_id, instance.uuid)
8432 connector = self.driver.get_volume_connector(instance)
8434 resize_to = 0
8435 old_volume = self.volume_api.get(context, old_volume_id)
8436 # Yes this is a tightly-coupled state check of what's going on inside
8437 # cinder, but we need this while we still support old (v1/v2) and
8438 # new style attachments (v3.44). Once we drop support for old style
8439 # attachments we could think about cleaning up the cinder-initiated
8440 # swap volume API flows.
8441 is_cinder_migration = False
8442 if 'migration_status' in old_volume:
8443 is_cinder_migration = old_volume['migration_status'] == 'migrating'
8444 old_vol_size = old_volume['size']
8445 new_volume = self.volume_api.get(context, new_volume_id)
8446 new_vol_size = new_volume['size']
8447 if new_vol_size > old_vol_size:
8448 resize_to = new_vol_size
8450 LOG.info('Swapping volume %(old_volume)s for %(new_volume)s',
8451 {'old_volume': old_volume_id, 'new_volume': new_volume_id},
8452 instance=instance)
8453 comp_ret, new_cinfo = self._swap_volume(context,
8454 instance,
8455 bdm,
8456 connector,
8457 old_volume_id,
8458 new_volume,
8459 resize_to,
8460 new_attachment_id,
8461 is_cinder_migration)
8463 # NOTE(lyarwood): Update the BDM with the modified new_cinfo and
8464 # correct volume_id returned by Cinder.
8465 save_volume_id = comp_ret['save_volume_id']
8467 # NOTE(lyarwood): Overwrite the possibly stale serial and volume_id in
8468 # the connection_info with the volume_id returned from Cinder. This
8469 # could be the case during a volume migration where the new_cinfo here
8470 # refers to the temporary volume *before* Cinder renames it to the
8471 # original volume UUID at the end of the migration.
8472 new_cinfo['serial'] = save_volume_id
8473 new_cinfo['volume_id'] = save_volume_id
8474 if 'data' in new_cinfo:
8475 new_cinfo['data']['volume_id'] = save_volume_id
8476 values = {
8477 'connection_info': jsonutils.dumps(new_cinfo),
8478 'source_type': 'volume',
8479 'destination_type': 'volume',
8480 'snapshot_id': None,
8481 'volume_id': save_volume_id,
8482 'no_device': None}
8484 if resize_to:
8485 values['volume_size'] = resize_to
8487 if new_attachment_id is not None:
8488 # This was a volume swap for a new-style attachment so we
8489 # need to update the BDM attachment_id for the new attachment.
8490 values['attachment_id'] = new_attachment_id
8492 LOG.debug("swap_volume: Updating volume %(volume_id)s BDM record with "
8493 "%(updates)s", {'volume_id': bdm.volume_id,
8494 'updates': values},
8495 instance=instance)
8496 bdm.update(values)
8497 bdm.save()
8499 compute_utils.notify_about_volume_swap(
8500 context, instance, self.host,
8501 fields.NotificationPhase.END,
8502 old_volume_id, new_volume_id)
8504 @wrap_exception()
8505 def remove_volume_connection(
8506 self, context, volume_id, instance,
8507 delete_attachment=False):
8508 """Remove the volume connection on this host
8510 Detach the volume from this instance on this host, and if this is
8511 the cinder v2 flow, call cinder to terminate the connection.
8512 """
8513 try:
8514 # NOTE(mriedem): If the BDM was just passed directly we would not
8515 # need to do this DB query, but this is an RPC interface so
8516 # changing that requires some care.
8517 bdm = objects.BlockDeviceMapping.get_by_volume_and_instance(
8518 context, volume_id, instance.uuid)
8519 # NOTE(mriedem): Normally we would pass delete_attachment=True to
8520 # _remove_volume_connection to delete a v3 style volume attachment,
8521 # but this method is RPC called from _rollback_live_migration which
8522 # already deletes the attachment, so because of that tight coupling
8523 # we cannot simply delete a v3 style attachment here without
8524 # needing to do some behavior modification of that
8525 # _rollback_live_migration flow which gets messy.
8526 self._remove_volume_connection(
8527 context, bdm, instance, delete_attachment)
8528 except exception.NotFound:
8529 pass
8531 def _remove_volume_connection(self, context, bdm, instance,
8532 delete_attachment=False):
8533 """Remove the volume connection on this host
8535 Detach the volume from this instance on this host.
8537 :param context: nova auth request context
8538 :param bdm: BlockDeviceMapping object for a volume attached to the
8539 instance
8540 :param instance: Instance object with a volume attached represented
8541 by ``bdm``
8542 :param delete_attachment: If ``bdm.attachment_id`` is not None the
8543 attachment was made as a cinder v3 style attachment and if True,
8544 then deletes the volume attachment, otherwise just terminates
8545 the connection for a cinder legacy style connection.
8546 """
8547 driver_bdm = driver_block_device.convert_volume(bdm)
8548 driver_bdm.driver_detach(context, instance,
8549 self.volume_api, self.driver)
8550 if bdm.attachment_id is None:
8551 # cinder v2 api flow
8552 connector = self.driver.get_volume_connector(instance)
8553 self.volume_api.terminate_connection(context, bdm.volume_id,
8554 connector)
8555 elif delete_attachment:
8556 # cinder v3 api flow
8557 self.volume_api.attachment_delete(context, bdm.attachment_id)
8559 def _deallocate_port_resource_for_instance(
8560 self,
8561 context: nova.context.RequestContext,
8562 instance: 'objects.Instance',
8563 port_id: str,
8564 port_allocation: ty.Dict[str, ty.Dict[str, ty.Dict[str, int]]],
8565 ) -> None:
8567 if not port_allocation:
8568 return
8570 try:
8571 client = self.reportclient
8572 client.remove_resources_from_instance_allocation(
8573 context, instance.uuid, port_allocation)
8574 except Exception as ex:
8575 # We always raise here as it is not a race condition where
8576            # somebody has already deleted the port we want to clean up.
8577 # Here we see that the port exists, the allocation exists,
8578 # but we cannot clean it up so we will actually leak
8579 # allocations.
8580 with excutils.save_and_reraise_exception():
8581 LOG.warning(
8582 'Failed to remove resource allocation of port %(port_id)s '
8583 'for instance. Error: %(error)s',
8584 {'port_id': port_id, 'error': ex},
8585 instance=instance)
8587 def _deallocate_port_for_instance(
8588 self, context, instance, port_id, raise_on_failure=False,
8589 pci_device=None):
8590 try:
8591 result = self.network_api.deallocate_port_for_instance(
8592 context, instance, port_id)
8593 __, port_allocation = result
8594 except Exception as ex:
8595 with excutils.save_and_reraise_exception(
8596 reraise=raise_on_failure):
8597 LOG.warning('Failed to deallocate port %(port_id)s '
8598 'for instance. Error: %(error)s',
8599 {'port_id': port_id, 'error': ex},
8600 instance=instance)
8601 else:
8602            if pci_device:
8603 self.rt.unclaim_pci_devices(context, pci_device, instance)
8604 instance.remove_pci_device_and_request(pci_device)
8606 # Deallocate the resources in placement that were used by the
8607 # detached port.
8608 self._deallocate_port_resource_for_instance(
8609 context, instance, port_id, port_allocation)
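    # A minimal sketch (not part of this module) of the port_allocation
    # mapping handed to _deallocate_port_resource_for_instance above; the
    # provider UUID, resource class and amount are illustrative only:
    #
    #     port_allocation = {
    #         'rp-uuid-of-the-nic': {
    #             'resources': {'NET_BW_EGR_KILOBIT_PER_SEC': 1000,
    #                           'NET_BW_IGR_KILOBIT_PER_SEC': 1000},
    #         },
    #     }
    #
    # i.e. resource provider uuid -> 'resources' -> resource class -> amount,
    # matching the ty.Dict[str, ty.Dict[str, ty.Dict[str, int]]] annotation.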
8611 def _claim_pci_device_for_interface_attach(
8612 self,
8613 context: nova.context.RequestContext,
8614 instance: 'objects.Instance',
8615 pci_reqs: 'objects.InstancePCIRequests',
8616 ) -> ty.Optional['objects.PciDevice']:
8617 """Claim PCI devices if there are PCI requests
8619 :param context: nova.context.RequestContext
8620 :param instance: the objects.Instance to where the interface is being
8621 attached
8622 :param pci_reqs: A InstancePCIRequests object describing the
8623 needed PCI devices
8624 :raises InterfaceAttachPciClaimFailed: if the PCI device claim fails
8625 :returns: An objects.PciDevice describing the claimed PCI device for
8626 the interface or None if no device is requested
8627 """
8629        if not pci_reqs.requests:
8630 return None
8632 try:
8633 devices = self.rt.claim_pci_devices(
8634 context, pci_reqs, instance.numa_topology)
8635 except exception.PciDeviceRequestFailed:
8636 LOG.info('Failed to claim PCI devices during interface attach '
8637 'for PCI request %s', pci_reqs, instance=instance)
8638 raise exception.InterfaceAttachPciClaimFailed(
8639 instance_uuid=instance.uuid)
8641        # NOTE(gibi): We assume that at most one PCI device is attached per
8642 # interface attach request.
8643 device = devices[0]
8644 instance.pci_devices.objects.append(device)
8646 return device
8648 def _allocate_port_resource_for_instance(
8649 self,
8650 context: nova.context.RequestContext,
8651 instance: 'objects.Instance',
8652 pci_reqs: 'objects.InstancePCIRequests',
8653 request_groups: ty.List['objects.RequestGroup'],
8654 request_level_params: 'objects.RequestLevelParams',
8655 ) -> ty.Tuple[ty.Optional[ty.Dict[str, ty.List[str]]],
8656 ty.Optional[ty.Dict[str, ty.Dict[str, ty.Dict[str, int]]]]]:
8657 """Allocate resources for the request in placement
8659 :param context: nova.context.RequestContext
8660 :param instance: the objects.Instance to where the interface is being
8661 attached
8662 :param pci_reqs: A list of InstancePCIRequest objects describing the
8663 needed PCI devices
8664 :param request_groups: A list of RequestGroup objects describing the
8665 resources the port requests from placement
8666 :param request_level_params: A RequestLevelParams object describing the
8667            non-group-specific requests of the port.
8668 :raises InterfaceAttachResourceAllocationFailed: if we failed to
8669 allocate resource in placement for the request
8670 :returns: A tuple of provider mappings and allocated resources or
8671 (None, None) if no resource allocation was needed for the request
8672 """
8674 if not request_groups:
8675 return None, None
8677 # restrict the resource request to the current compute node. The
8678 # compute node uuid is the uuid of the root provider of the node in
8679 # placement
8680 compute_node_uuid = objects.ComputeNode.get_by_nodename(
8681 context, instance.node).uuid
8682        # We can have multiple request groups; it would be enough to restrict
8683        # only one of them to the compute tree but for symmetry we restrict
8684        # all of them.
8685 for request_group in request_groups:
8686 request_group.in_tree = compute_node_uuid
8688 # NOTE(gibi): group policy is mandatory in a resource request if there
8689 # are multiple groups. The policy can only come from the flavor today
8690        # and a new flavor is not provided with an interface attach request, and
8691        # the instance's current flavor might not have a policy. Still, we are
8692        # attaching a single port where currently the two possible groups
8693        # (one for bandwidth and one for packet rate) will always be allocated
8694        # from different providers. So both possible policies (none, isolated)
8695        # are always fulfilled for this single port. We still have to specify
8696        # one, so we specify the least restrictive now.
8697 rr = scheduler_utils.ResourceRequest.from_request_groups(
8698 request_groups, request_level_params, group_policy='none')
8699 res = self.reportclient.get_allocation_candidates(context, rr)
8700 alloc_reqs, provider_sums, version = res
8702 if not alloc_reqs:
8703 # no allocation candidates available, we run out of free resources
8704 raise exception.InterfaceAttachResourceAllocationFailed(
8705 instance_uuid=instance.uuid)
8707 # select one of the candidates and update the instance
8708 # allocation
8709 # TODO(gibi): We could loop over all possible candidates
8710 # if the first one selected here does not work due to race or due
8711 # to not having free PCI devices. However the latter is only
8712 # detected later in the interface attach code path.
8713 alloc_req = alloc_reqs[0]
8714 resources = alloc_req['allocations']
8715 provider_mappings = alloc_req['mappings']
8716 try:
8717 self.reportclient.add_resources_to_instance_allocation(
8718 context, instance.uuid, resources)
8719 except exception.AllocationUpdateFailed as e:
8720 # We lost a race. We could retry another candidate
8721 raise exception.InterfaceAttachResourceAllocationFailed(
8722 instance_uuid=instance.uuid) from e
8723 except (
8724 exception.ConsumerAllocationRetrievalFailed,
8725 keystone_exception.ClientException,
8726 ) as e:
8727 # These are non-recoverable errors so we should not retry
8728 raise exception.InterfaceAttachResourceAllocationFailed(
8729 instance_uuid=instance.uuid) from e
8731 try:
8732 compute_utils.update_pci_request_with_placement_allocations(
8733 context,
8734 self.reportclient,
8735 pci_reqs.requests,
8736 provider_mappings,
8737 )
8738 except (
8739 exception.AmbiguousResourceProviderForPCIRequest,
8740 exception.UnexpectedResourceProviderNameForPCIRequest
8741 ):
8742            # These are programming errors, so we clean up and re-raise to let
8743            # the request fail.
8744 with excutils.save_and_reraise_exception():
8745 self.reportclient.remove_resources_from_instance_allocation(
8746 context, instance.uuid, resources)
8748 return provider_mappings, resources
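    # A minimal sketch (not part of this module) of the tuple returned above,
    # with illustrative UUIDs; the shapes follow this method's type
    # annotations:
    #
    #     provider_mappings = {
    #         # request group id -> resource providers satisfying that group
    #         'port-uuid-group': ['rp-uuid-of-the-nic'],
    #     }
    #     resources = {
    #         # resource provider uuid -> resources allocated from it
    #         'rp-uuid-of-the-nic': {
    #             'resources': {'NET_BW_EGR_KILOBIT_PER_SEC': 1000},
    #         },
    #     }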
8750 # TODO(mriedem): There are likely race failures which can result in
8751 # NotFound and QuotaError exceptions getting traced as well.
8752 @messaging.expected_exceptions(
8753 # Do not log a traceback for user errors. We use Invalid generically
8754 # since this method can raise lots of different exceptions:
8755 # AttachInterfaceNotSupported
8756 # NetworkInterfaceTaggedAttachNotSupported
8757 # NetworkAmbiguous
8758 # PortNotUsable
8759 # PortInUse
8760 # PortNotUsableDNS
8761 # AttachSRIOVPortNotSupported
8762 # NetworksWithQoSPolicyNotSupported
8763 # InterfaceAttachResourceAllocationFailed
8764 exception.Invalid)
8765 @wrap_exception()
8766 @wrap_instance_event(prefix='compute')
8767 @wrap_instance_fault
8768 def attach_interface(self, context, instance, network_id, port_id,
8769 requested_ip, tag):
8770 """Use hotplug to add an network adapter to an instance."""
8771 lockname = 'interface-%s-%s' % (instance.uuid, port_id)
8773 @utils.synchronized(lockname)
8774 def do_attach_interface(context, instance, network_id, port_id,
8775 requested_ip, tag):
8776 return self._attach_interface(context, instance, network_id,
8777 port_id, requested_ip, tag)
8779 return do_attach_interface(context, instance, network_id, port_id,
8780 requested_ip, tag)
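    # Both attach_interface and detach_interface synchronize on the same
    # 'interface-<instance.uuid>-<port_id>' lock name, so concurrent attach
    # and detach requests for the same port on the same instance are
    # serialized. A minimal sketch of the pattern (names are illustrative):
    #
    #     lockname = 'interface-%s-%s' % (instance_uuid, port_id)
    #
    #     @utils.synchronized(lockname)
    #     def do_work():
    #         ...  # only one greenthread per lock name runs this at a time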
8782 def _attach_interface(self, context, instance, network_id, port_id,
8783 requested_ip, tag):
8784        if not self.driver.capabilities.get('supports_attach_interface',
8785 False):
8786 raise exception.AttachInterfaceNotSupported(
8787 instance_uuid=instance.uuid)
8788 if (tag and not
8789 self.driver.capabilities.get('supports_tagged_attach_interface',
8790 False)):
8791 raise exception.NetworkInterfaceTaggedAttachNotSupported()
8793 compute_utils.notify_about_instance_action(
8794 context, instance, self.host,
8795 action=fields.NotificationAction.INTERFACE_ATTACH,
8796 phase=fields.NotificationPhase.START)
8798 bind_host_id = self.driver.network_binding_host_id(context, instance)
8800 requested_networks = objects.NetworkRequestList(
8801 objects=[
8802 objects.NetworkRequest(
8803 network_id=network_id,
8804 port_id=port_id,
8805 address=requested_ip,
8806 tag=tag,
8807 )
8808 ]
8809 )
8811        if len(requested_networks) != 1:
8812 LOG.warning(
8813 "Interface attach only supports one interface per attach "
8814 "request", instance=instance)
8815 raise exception.InterfaceAttachFailed(instance_uuid=instance.uuid)
8817 pci_numa_affinity_policy = hardware.get_pci_numa_policy_constraint(
8818 instance.flavor, instance.image_meta)
8819 pci_reqs = objects.InstancePCIRequests(
8820 requests=[], instance_uuid=instance.uuid)
8821 _, request_groups, req_lvl_params = (
8822 self.network_api.create_resource_requests(
8823 context,
8824 requested_networks,
8825 pci_reqs,
8826 affinity_policy=pci_numa_affinity_policy
8827 )
8828 )
8830 result = self._allocate_port_resource_for_instance(
8831 context, instance, pci_reqs, request_groups, req_lvl_params)
8832 provider_mappings, resources = result
8834 try:
8835 pci_device = self._claim_pci_device_for_interface_attach(
8836 context, instance, pci_reqs)
8837 except exception.InterfaceAttachPciClaimFailed:
8838 with excutils.save_and_reraise_exception():
8839                if resources:
8840 # TODO(gibi): Instead of giving up we could try another
8841 # allocation candidate from _allocate_resources() if any
8842 self._deallocate_port_resource_for_instance(
8843 context, instance, port_id, resources)
8845 instance.pci_requests.requests.extend(pci_reqs.requests)
8847 network_info = self.network_api.allocate_for_instance(
8848 context,
8849 instance,
8850 requested_networks,
8851 bind_host_id=bind_host_id,
8852 resource_provider_mapping=provider_mappings,
8853 )
8855        if len(network_info) != 1:
8856 LOG.error('allocate_for_instance returned %(ports)s '
8857 'ports', {'ports': len(network_info)})
8858 # TODO(elod.illes): an instance.interface_attach.error notification
8859 # should be sent here
8860 raise exception.InterfaceAttachFailed(
8861 instance_uuid=instance.uuid)
8862 image_meta = objects.ImageMeta.from_instance(instance)
8864 try:
8865 self.driver.attach_interface(context, instance, image_meta,
8866 network_info[0])
8867 except exception.NovaException as ex:
8868 port_id = network_info[0].get('id')
8869 LOG.warning("attach interface failed , try to deallocate "
8870 "port %(port_id)s, reason: %(msg)s",
8871 {'port_id': port_id, 'msg': ex},
8872 instance=instance)
8873 self._deallocate_port_for_instance(
8874 context, instance, port_id, pci_device=pci_device)
8876 compute_utils.notify_about_instance_action(
8877 context, instance, self.host,
8878 action=fields.NotificationAction.INTERFACE_ATTACH,
8879 phase=fields.NotificationPhase.ERROR,
8880 exception=ex)
8882 raise exception.InterfaceAttachFailed(
8883 instance_uuid=instance.uuid)
8885 if pci_device:
8886 # NOTE(gibi): The _claim_pci_device_for_interface_attach() call
8887 # found a pci device but it only marked the device as claimed. The
8888 # periodic update_available_resource would move the device to
8889            # allocated state. But as driver.attach_interface() has
8890            # succeeded we now know that the interface is also allocated
8891            # to (used by) the instance. So make sure the pci tracker also
8892 # tracks this device as allocated. This way we can avoid a possible
8893 # race condition when a detach arrives for a device that is only
8894 # in claimed state.
8895 self.rt.allocate_pci_devices_for_instance(context, instance)
8897 instance.save()
8899 compute_utils.notify_about_instance_action(
8900 context, instance, self.host,
8901 action=fields.NotificationAction.INTERFACE_ATTACH,
8902 phase=fields.NotificationPhase.END)
8904 return network_info[0]
8906 @wrap_exception()
8907 @wrap_instance_event(prefix='compute')
8908 @wrap_instance_fault
8909 def detach_interface(self, context, instance, port_id):
8910 """Detach a network adapter from an instance."""
8911 lockname = 'interface-%s-%s' % (instance.uuid, port_id)
8913 @utils.synchronized(lockname)
8914 def do_detach_interface(context, instance, port_id):
8915 self._detach_interface(context, instance, port_id)
8917 do_detach_interface(context, instance, port_id)
8919 def _detach_interface(self, context, instance, port_id):
8920 # NOTE(aarents): we need to refresh info cache from DB here,
8921 # as previous detach/attach lock holder just updated it.
8922 compute_utils.refresh_info_cache_for_instance(context, instance)
8923 network_info = instance.info_cache.network_info
8924 condemned = None
8925 for vif in network_info:
8926            if vif['id'] == port_id:
8927 condemned = vif
8928 break
8929 if condemned is None:
8930 raise exception.PortNotFound(_("Port %s is not "
8931 "attached") % port_id)
8933 pci_req = pci_req_module.get_instance_pci_request_from_vif(
8934 context, instance, condemned)
8936 pci_device = None
8937 if pci_req:
8938 pci_devices = [pci_device
8939 for pci_device in instance.pci_devices.objects
8940 if pci_device.request_id == pci_req.request_id]
8942 if not pci_devices:
8943 LOG.warning(
8944 "Detach interface failed, port_id=%(port_id)s, "
8945 "reason: PCI device not found for PCI request %(pci_req)s",
8946 {'port_id': port_id, 'pci_req': pci_req})
8947 raise exception.InterfaceDetachFailed(
8948 instance_uuid=instance.uuid)
8950 pci_device = pci_devices[0]
8952 compute_utils.notify_about_instance_action(
8953 context, instance, self.host,
8954 action=fields.NotificationAction.INTERFACE_DETACH,
8955 phase=fields.NotificationPhase.START)
8957 try:
8958 self.driver.detach_interface(context, instance, condemned)
8959 except exception.NovaException as ex:
8960 # If the instance was deleted before the interface was detached,
8961 # just log it at debug.
8962 log_level = (logging.DEBUG
8963 if isinstance(ex, exception.InstanceNotFound)
8964 else logging.WARNING)
8965 LOG.log(log_level,
8966 "Detach interface failed, port_id=%(port_id)s, reason: "
8967 "%(msg)s", {'port_id': port_id, 'msg': ex},
8968 instance=instance)
8969 raise exception.InterfaceDetachFailed(instance_uuid=instance.uuid)
8970 else:
8971 self._deallocate_port_for_instance(
8972 context, instance, port_id, raise_on_failure=True,
8973 pci_device=pci_device)
8975 instance.save()
8977 compute_utils.notify_about_instance_action(
8978 context, instance, self.host,
8979 action=fields.NotificationAction.INTERFACE_DETACH,
8980 phase=fields.NotificationPhase.END)
8982 def _get_compute_info(self, context, host):
8983 return objects.ComputeNode.get_first_node_by_host_for_old_compat(
8984 context, host)
8986 @wrap_exception()
8987 def check_instance_shared_storage(self, ctxt, data):
8988 """Check if the instance files are shared
8990 :param ctxt: security context
8991 :param data: result of driver.check_instance_shared_storage_local
8993        Returns True if the instance disks are located on shared storage and
8994 False otherwise.
8995 """
8996 return self.driver.check_instance_shared_storage_remote(ctxt, data)
8998 def _dest_can_numa_live_migrate(self, dest_check_data, migration):
8999 # TODO(artom) If we have a libvirt driver we expect it to set
9000 # dst_supports_numa_live_migration, but we have to remove it if we
9001 # did not get a migration from the conductor, indicating that it
9002 # cannot send RPC 5.3. This check can be removed in RPC 6.0.
9003 if ('dst_supports_numa_live_migration' in dest_check_data and
9004 dest_check_data.dst_supports_numa_live_migration and
9005 not migration):
9006 delattr(dest_check_data, 'dst_supports_numa_live_migration')
9007 return dest_check_data
9009 @wrap_exception()
9010 @wrap_instance_event(prefix='compute')
9011 @wrap_instance_fault
9012 def check_can_live_migrate_destination(self, ctxt, instance,
9013 block_migration, disk_over_commit,
9014 migration, limits):
9015 """Check if it is possible to execute live migration.
9017 This runs checks on the destination host, and then calls
9018 back to the source host to check the results.
9020 :param context: security context
9021 :param instance: dict of instance data
9022 :param block_migration: if true, prepare for block migration
9023 if None, calculate it in driver
9024 :param disk_over_commit: if true, allow disk over commit
9025 if None, ignore disk usage checking
9026 :param migration: objects.Migration object for this live migration.
9027 :param limits: objects.SchedulerLimits object for this live migration.
9028 :returns: a LiveMigrateData object (hypervisor-dependent)
9029 """
9031 # Error out if this host cannot accept the new instance due
9032 # to anti-affinity. This check at this moment is not very accurate, as
9033 # multiple requests may be happening concurrently and miss the lock,
9034 # but when it works it provides a better user experience by failing
9035        # earlier. Also, it should be safe to explode here; the error becomes
9036        # NoValidHost and the instance status remains ACTIVE.
9037 try:
9038 self._validate_instance_group_policy(ctxt, instance)
9039 except exception.RescheduledException as e:
9040 msg = ("Failed to validate instance group policy "
9041 "due to: {}".format(e))
9042 raise exception.MigrationPreCheckError(reason=msg)
9044 src_compute_info = obj_base.obj_to_primitive(
9045 self._get_compute_info(ctxt, instance.host))
9046 dst_compute_info = obj_base.obj_to_primitive(
9047 self._get_compute_info(ctxt, self.host))
9048 dest_check_data = self.driver.check_can_live_migrate_destination(ctxt,
9049 instance, src_compute_info, dst_compute_info,
9050 block_migration, disk_over_commit)
9051 dest_check_data = self._dest_can_numa_live_migrate(dest_check_data,
9052 migration)
9053 LOG.debug('destination check data is %s', dest_check_data)
9054 try:
9055 allocs = self.reportclient.get_allocations_for_consumer(
9056 ctxt, instance.uuid)
9057 migrate_data = self.compute_rpcapi.check_can_live_migrate_source(
9058 ctxt, instance, dest_check_data)
9059 if ('src_supports_numa_live_migration' in migrate_data and
9060 migrate_data.src_supports_numa_live_migration):
9061 migrate_data = self._live_migration_claim(
9062 ctxt, instance, migrate_data, migration, limits, allocs)
9063            elif 'dst_supports_numa_live_migration' in dest_check_data:
9064 LOG.info('Destination was ready for NUMA live migration, '
9065 'but source is either too old, or is set to an '
9066 'older upgrade level.', instance=instance)
9068 # At this point, we know that this compute node (destination)
9069 # potentially has enough live-migratable PCI devices, based on the
9070 # fact that the scheduler selected this host as the destination.
9071 # The claim code below is the decisive step to move from a
9072 # potentially correct to a known to be correct destination.
9073 flavored_pci_reqs = [
9074 pci_req
9075 for pci_req in instance.pci_requests.requests
9076 if pci_req.source == objects.InstancePCIRequest.FLAVOR_ALIAS
9077 ]
9079 migrate_data.pci_dev_map_src_dst = self._flavor_based_pci_claim(
9080 ctxt, instance, flavored_pci_reqs
9081 )
9083 if self.network_api.has_port_binding_extension(ctxt):
9084 # Create migrate_data vifs if not provided by driver.
9085                if 'vifs' not in migrate_data:
9086 migrate_data.vifs = (
9087 migrate_data_obj.
9088 VIFMigrateData.create_skeleton_migrate_vifs(
9089 instance.get_network_info()))
9090 # Claim PCI devices for VIFs on destination (if needed)
9091 port_id_to_pci = self._claim_pci_for_instance_vifs(
9092 ctxt, instance)
9093 # Update migrate VIFs with the newly claimed PCI devices
9094 self._update_migrate_vifs_profile_with_pci(
9095 migrate_data.vifs, port_id_to_pci)
9096 # This should always be the last check as we persist some internal
9097 # dictionary value in the libvirt driver.
9098 migrate_data = self.driver.check_source_migrate_data_at_dest(
9099 ctxt, instance, migrate_data, migration, limits, allocs)
9100 finally:
9101 self.driver.cleanup_live_migration_destination_check(ctxt,
9102 dest_check_data)
9103 return migrate_data
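    # In summary, the destination-side checks above may extend migrate_data
    # with:
    #   * dst_numa_info (via _live_migration_claim) for NUMA-aware live
    #     migrations,
    #   * pci_dev_map_src_dst, mapping source PCI addresses to the devices
    #     claimed here for flavor-based PCI requests,
    #   * vifs (skeleton VIFMigrateData) plus per-VIF PCI claims when the
    #     port binding extension is available,
    # before the driver gets a final look in check_source_migrate_data_at_dest.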
9105 def _flavor_based_pci_claim(self, ctxt, instance, pci_requests):
9106 if not pci_requests:
9107 # Return an empty dict as migrate_data.pci_dev_map_src_dst
9108 # cannot be None
9109 return {}
9111        # Create an InstancePCIRequests object and claim against the PCI
9112        # resource tracker for our dest instance
9113 pci_requests = objects.InstancePCIRequests(
9114 requests=pci_requests,
9115 instance_uuid=instance.uuid)
9117 src_devs = objects.PciDeviceList.get_by_instance_uuid(
9118 ctxt, instance.uuid
9119 )
9121 claimed_dst_devs = self._claim_from_pci_reqs(
9122 ctxt, instance, pci_requests
9123 )
9125 pci_dev_map_src_dst = {}
9126 for req_id in (
9127 pci_req.request_id for pci_req in pci_requests.requests
9128 ):
9129 req_src_addr = [
9130 dev.address
9131 for dev in src_devs
9132 if dev.request_id == req_id
9133 ]
9134 req_claimed_dst__addr = [
9135 dev.address
9136 for dev in claimed_dst_devs
9137 if dev.request_id == req_id
9138 ]
9140 # This still depends on ordering, but only within the scope
9141 # of the same InstancePCIRequest. The only case where multiple
9142 # devices per compute node are allocated for a single request
9143 # is when req.count > 1. In that case, the allocated devices are
9144 # interchangeable since they match the same specification from
9145 # the same InstancePCIRequest.
9146 #
9147 # Therefore, this ordering dependency is acceptable.
9148 pci_dev_map_src_dst.update(dict(
9149 zip(req_src_addr, req_claimed_dst__addr)
9150 ))
9152 return pci_dev_map_src_dst
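    # A minimal sketch (not part of this module) of the mapping built above,
    # assuming one InstancePCIRequest with count=2 and illustrative addresses:
    #
    #     req_src_addr          = ['0000:81:00.1', '0000:81:00.2']  # source devs
    #     req_claimed_dst__addr = ['0000:02:00.3', '0000:02:00.4']  # dest claims
    #     pci_dev_map_src_dst.update(
    #         dict(zip(req_src_addr, req_claimed_dst__addr)))
    #     # -> {'0000:81:00.1': '0000:02:00.3', '0000:81:00.2': '0000:02:00.4'}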
9154 def _claim_from_pci_reqs(self, ctxt, instance, pci_requests):
9155        # If we are called during a live migration with NUMA topology
9156        # support, the PCI claim needs to consider the destination NUMA
9157        # topology, which is then stored in the migration_context.
9158 dest_topo = None
9159        if instance.migration_context:
9160 dest_topo = instance.migration_context.new_numa_topology
9162 claimed_pci_devices_objs = self.rt.claim_pci_devices(
9163 ctxt, pci_requests, dest_topo)
9165 for pci_dev in claimed_pci_devices_objs:
9166 LOG.debug("PCI device: %s Claimed on destination node",
9167 pci_dev.address)
9168 return claimed_pci_devices_objs
9170 def _live_migration_claim(self, ctxt, instance, migrate_data,
9171 migration, limits, allocs):
9172 """Runs on the destination and does a resources claim, if necessary.
9173 Currently, only NUMA live migrations require it.
9175 :param ctxt: Request context
9176 :param instance: The Instance being live migrated
9177 :param migrate_data: The MigrateData object for this live migration
9178 :param migration: The Migration object for this live migration
9179 :param limits: The SchedulerLimits object for this live migration
9180 :returns: migrate_data with dst_numa_info set if necessary
9181 """
9182 try:
9183 # NOTE(artom) We might have gotten here from _find_destination() in
9184 # the conductor live migrate task. At that point,
9185 # migration.dest_node is not set yet (nor should it be, we're still
9186 # looking for a destination, after all). Therefore, we cannot use
9187 # migration.dest_node here and must use self._get_nodename().
9188 claim = self.rt.live_migration_claim(
9189 ctxt, instance, self._get_nodename(instance), migration,
9190 limits, allocs)
9191 LOG.debug('Created live migration claim.', instance=instance)
9192 except exception.ComputeResourcesUnavailable as e:
9193 raise exception.MigrationPreCheckError(
9194 reason=e.format_message())
9195 return self.driver.post_claim_migrate_data(ctxt, instance,
9196 migrate_data, claim)
9198 def _source_can_numa_live_migrate(self, ctxt, dest_check_data,
9199 source_check_data):
9200 # TODO(artom) Our virt driver may have told us that it supports NUMA
9201 # live migration. However, the following other conditions must be met
9202 # for a NUMA live migration to happen:
9203 # 1. We got a True dst_supports_numa_live_migration in
9204 # dest_check_data, indicating that the dest virt driver supports
9205 # NUMA live migration and that the conductor can send RPC 5.3 and
9206 # that the destination compute manager can receive it.
9207 # 2. Ourselves, the source, can send RPC 5.3. There's no
9208 # sentinel/parameter for this, so we just ask our rpcapi directly.
9209 # If any of these are not met, we need to remove the
9210 # src_supports_numa_live_migration flag from source_check_data to avoid
9211 # incorrectly initiating a NUMA live migration.
9212 # All of this can be removed in RPC 6.0/objects 2.0.
9213 can_numa_live_migrate = (
9214 'dst_supports_numa_live_migration' in dest_check_data and
9215 dest_check_data.dst_supports_numa_live_migration and
9216 self.compute_rpcapi.supports_numa_live_migration(ctxt))
9217 if ('src_supports_numa_live_migration' in source_check_data and
9218 source_check_data.src_supports_numa_live_migration and
9219 not can_numa_live_migrate):
9220 delattr(source_check_data, 'src_supports_numa_live_migration')
9221 return source_check_data
9223 @wrap_exception()
9224 @wrap_instance_event(prefix='compute')
9225 @wrap_instance_fault
9226 def check_can_live_migrate_source(self, ctxt, instance, dest_check_data):
9227 """Check if it is possible to execute live migration.
9229 This checks if the live migration can succeed, based on the
9230 results from check_can_live_migrate_destination.
9232 :param ctxt: security context
9233 :param instance: dict of instance data
9234 :param dest_check_data: result of check_can_live_migrate_destination
9235 :returns: a LiveMigrateData object
9236 """
9237 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
9238 ctxt, instance.uuid)
9239 is_volume_backed = compute_utils.is_volume_backed_instance(
9240 ctxt, instance, bdms)
9241 dest_check_data.is_volume_backed = is_volume_backed
9242 block_device_info = self._get_instance_block_device_info(
9243 ctxt, instance, refresh_conn_info=False, bdms=bdms)
9244 result = self.driver.check_can_live_migrate_source(ctxt, instance,
9245 dest_check_data,
9246 block_device_info)
9247 result = self._source_can_numa_live_migrate(ctxt, dest_check_data,
9248 result)
9249 LOG.debug('source check data is %s', result)
9250 return result
9252 @wrap_exception()
9253 @wrap_instance_event(prefix='compute')
9254 @wrap_instance_fault
9255 def pre_live_migration(self, context, instance, disk, migrate_data):
9256 """Preparations for live migration at dest host.
9258 :param context: security context
9259 :param instance: dict of instance data
9260 :param disk: disk info of instance
9261 :param migrate_data: A dict or LiveMigrateData object holding data
9262 required for live migration without shared
9263 storage.
9264 :returns: migrate_data containing additional migration info
9265 """
9266 LOG.debug('pre_live_migration data is %s', migrate_data)
9268 # Error out if this host cannot accept the new instance due
9269 # to anti-affinity. At this point the migration is already in-progress,
9270 # so this is the definitive moment to abort due to the policy
9271 # violation. Also, it should be safe to explode here. The instance
9272 # status remains ACTIVE, migration status failed.
9273 self._validate_instance_group_policy(context, instance)
9275 migrate_data.old_vol_attachment_ids = {}
9276 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
9277 context, instance.uuid)
9278 network_info = self.network_api.get_instance_nw_info(context, instance)
9279 self._notify_about_instance_usage(
9280 context, instance, "live_migration.pre.start",
9281 network_info=network_info)
9282 compute_utils.notify_about_instance_action(
9283 context, instance, self.host,
9284 action=fields.NotificationAction.LIVE_MIGRATION_PRE,
9285 phase=fields.NotificationPhase.START, bdms=bdms)
9287 connector = None
9288 try:
9289 for bdm in bdms:
9290 if bdm.is_volume and bdm.attachment_id is not None:
9291 # This bdm uses the new cinder v3.44 API.
9292 # We will create a new attachment for this
9293 # volume on this migration destination host. The old
9294 # attachment will be deleted on the source host
9295 # when the migration succeeds. The old attachment_id
9296 # is stored in dict with the key being the bdm.volume_id
9297 # so it can be restored on rollback.
9298 #
9299 # Also note that attachment_update is not needed as we
9300 # are providing the connector in the create call.
9301 if connector is None:
9302 connector = self.driver.get_volume_connector(instance)
9303 attach_ref = self.volume_api.attachment_create(
9304 context, bdm.volume_id, bdm.instance_uuid,
9305 connector=connector, mountpoint=bdm.device_name)
9307 # save current attachment so we can detach it on success,
9308 # or restore it on a rollback.
9309 # NOTE(mdbooth): This data is no longer used by the source
9310 # host since change Ibe9215c0. We can't remove it until we
9311 # are sure the source host has been upgraded.
9312 migrate_data.old_vol_attachment_ids[bdm.volume_id] = \
9313 bdm.attachment_id
9315 # update the bdm with the new attachment_id.
9316 bdm.attachment_id = attach_ref['id']
9317 bdm.save()
9319 block_device_info = self._get_instance_block_device_info(
9320 context, instance, refresh_conn_info=True,
9321 bdms=bdms)
9323 # The driver pre_live_migration will plug vifs on the host
9324 migrate_data = self.driver.pre_live_migration(context,
9325 instance,
9326 block_device_info,
9327 network_info,
9328 disk,
9329 migrate_data)
9330 LOG.debug('driver pre_live_migration data is %s', migrate_data)
9331 # driver.pre_live_migration is what plugs vifs on the destination
9332 # host so now we can set the wait_for_vif_plugged flag in the
9333 # migrate_data object which the source compute will use to
9334 # determine if it should wait for a 'network-vif-plugged' event
9335 # from neutron before starting the actual guest transfer in the
9336 # hypervisor
9337 using_multiple_port_bindings = (
9338 'vifs' in migrate_data and migrate_data.vifs)
9339 migrate_data.wait_for_vif_plugged = (
9340 CONF.compute.live_migration_wait_for_vif_plug and
9341 using_multiple_port_bindings
9342 )
9344 # NOTE(tr3buchet): setup networks on destination host
9345 self.network_api.setup_networks_on_host(context, instance,
9346 self.host)
9348 # NOTE(lyarwood): The above call to driver.pre_live_migration
9349 # can result in the virt drivers attempting to stash additional
9350 # metadata into the connection_info of the underlying bdm.
9351 # Ensure this is saved to the database by calling .save() against
9352 # the driver BDMs we passed down via block_device_info.
9353 for driver_bdm in block_device_info['block_device_mapping']:
9354 driver_bdm.save()
9356 except Exception:
9357 # If we raise, migrate_data with the updated attachment ids
9358 # will not be returned to the source host for rollback.
9359 # So we need to rollback new attachments here.
9360 with excutils.save_and_reraise_exception():
9361 old_attachments = migrate_data.old_vol_attachment_ids
9362 for bdm in bdms:
9363 if (bdm.is_volume and bdm.attachment_id is not None and
9364 bdm.volume_id in old_attachments):
9365 self.volume_api.attachment_delete(context,
9366 bdm.attachment_id)
9367 bdm.attachment_id = old_attachments[bdm.volume_id]
9368 bdm.save()
9370 # Volume connections are complete, tell cinder that all the
9371 # attachments have completed.
9372 for bdm in bdms:
9373 if bdm.is_volume and bdm.attachment_id is not None:
9374 self.volume_api.attachment_complete(context,
9375 bdm.attachment_id)
9377 self._notify_about_instance_usage(
9378 context, instance, "live_migration.pre.end",
9379 network_info=network_info)
9380 compute_utils.notify_about_instance_action(
9381 context, instance, self.host,
9382 action=fields.NotificationAction.LIVE_MIGRATION_PRE,
9383 phase=fields.NotificationPhase.END, bdms=bdms)
9385 LOG.debug('pre_live_migration result data is %s', migrate_data)
9386 return migrate_data
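    # A minimal sketch (not part of this module) of the attachment
    # bookkeeping above for one cinder v3.44 style volume BDM, with
    # illustrative UUIDs:
    #
    #     # new attachment created on this (destination) host
    #     attach_ref = {'id': 'dest-attachment-uuid'}
    #     # old attachment remembered so the source can roll back if needed
    #     migrate_data.old_vol_attachment_ids = {
    #         'vol-uuid': 'src-attachment-uuid'}
    #     # the BDM now points at the destination attachment
    #     bdm.attachment_id = 'dest-attachment-uuid'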
9388 @staticmethod
9389 def _neutron_failed_live_migration_callback(event_name, instance):
9390 msg = ('Neutron reported failure during live migration '
9391 'with %(event)s for instance %(uuid)s')
9392 msg_args = {'event': event_name, 'uuid': instance.uuid}
9393 if CONF.vif_plugging_is_fatal:
9394 raise exception.VirtualInterfacePlugException(msg % msg_args)
9395 LOG.error(msg, msg_args)
9397 @staticmethod
9398 def _get_neutron_events_for_live_migration(instance):
9399 # We don't generate events if CONF.vif_plugging_timeout=0
9400 # meaning that the operator disabled using them.
9401 if CONF.vif_plugging_timeout:
9402 return (instance.get_network_info()
9403 .get_live_migration_plug_time_events())
9404 else:
9405 return []
9407 def _cleanup_pre_live_migration(self, context, dest, instance,
9408 migration, migrate_data, source_bdms):
9409 """Helper method for when pre_live_migration fails
9411 Sets the migration status to "error" and rolls back the live migration
9412 setup on the destination host.
9414 :param context: The user request context.
9415 :type context: nova.context.RequestContext
9416 :param dest: The live migration destination hostname.
9417 :type dest: str
9418 :param instance: The instance being live migrated.
9419 :type instance: nova.objects.Instance
9420 :param migration: The migration record tracking this live migration.
9421 :type migration: nova.objects.Migration
9422 :param migrate_data: Data about the live migration, populated from
9423 the destination host.
9424 :type migrate_data: Subclass of nova.objects.LiveMigrateData
9425 :param source_bdms: BDMs prior to modification by the destination
9426 compute host. Set by _do_live_migration and not
9427 part of the callback interface, so this is never
9428 None
9429 """
9430 self._set_migration_status(migration, 'error')
9431 # Make sure we set this for _rollback_live_migration()
9432 # so it can find it, as expected if it was called later
9433 migrate_data.migration = migration
9434 self._rollback_live_migration(context, instance, dest,
9435 migrate_data=migrate_data,
9436 source_bdms=source_bdms,
9437 pre_live_migration=True)
9439 def _do_pre_live_migration_from_source(self, context, dest, instance,
9440 block_migration, migration,
9441 migrate_data, source_bdms):
9442 """Prepares for pre-live-migration on the source host and calls dest
9444 Will setup a callback networking event handler (if configured) and
9445 then call the dest host's pre_live_migration method to prepare the
9446 dest host for live migration (plugs vifs, connect volumes, etc).
9448 _rollback_live_migration (on the source) will be called if
9449 pre_live_migration (on the dest) fails.
9451 :param context: nova auth request context for this operation
9452 :param dest: name of the destination compute service host
9453 :param instance: Instance object being live migrated
9454 :param block_migration: If true, prepare for block migration.
9455 :param migration: Migration object tracking this operation
9456 :param migrate_data: MigrateData object for this operation populated
9457 by the destination host compute driver as part of the
9458 check_can_live_migrate_destination call.
9459 :param source_bdms: BlockDeviceMappingList of BDMs currently attached
9460 to the instance from the source host.
9461 :returns: MigrateData object which is a modified version of the
9462 ``migrate_data`` argument from the compute driver on the dest
9463 host during the ``pre_live_migration`` call.
9464 :raises: MigrationError if waiting for the network-vif-plugged event
9465 timed out and is fatal.
9466 """
9467 class _BreakWaitForInstanceEvent(Exception):
9468 """Used as a signal to stop waiting for the network-vif-plugged
9469 event when we discover that
9470 [compute]/live_migration_wait_for_vif_plug is not set on the
9471 destination.
9472 """
9473 pass
9475 events = self._get_neutron_events_for_live_migration(instance)
9476 try:
9477 if ('block_migration' in migrate_data and
9478 migrate_data.block_migration):
9479 block_device_info = self._get_instance_block_device_info(
9480 context, instance, bdms=source_bdms)
9481 disk = self.driver.get_instance_disk_info(
9482 instance, block_device_info=block_device_info)
9483 else:
9484 disk = None
9486 deadline = CONF.vif_plugging_timeout
9487 error_cb = self._neutron_failed_live_migration_callback
9488 # In order to avoid a race with the vif plugging that the virt
9489 # driver does on the destination host, we register our events
9490 # to wait for before calling pre_live_migration. Then if the
9491 # dest host reports back that we shouldn't wait, we can break
9492 # out of the context manager using _BreakWaitForInstanceEvent.
9493 with self.virtapi.wait_for_instance_event(
9494 instance, events, deadline=deadline,
9495 error_callback=error_cb):
9496 with timeutils.StopWatch() as timer:
9497 # TODO(mriedem): The "block_migration" parameter passed
9498 # here is not actually used in pre_live_migration but it
9499 # is not optional in the RPC interface either.
9500 migrate_data = self.compute_rpcapi.pre_live_migration(
9501 context, instance,
9502 block_migration, disk, dest, migrate_data)
9503 LOG.info('Took %0.2f seconds for pre_live_migration on '
9504 'destination host %s.',
9505 timer.elapsed(), dest, instance=instance)
9506 wait_for_vif_plugged = (
9507 'wait_for_vif_plugged' in migrate_data and
9508 migrate_data.wait_for_vif_plugged)
9509 if events and not wait_for_vif_plugged:
9510 raise _BreakWaitForInstanceEvent
9511 except _BreakWaitForInstanceEvent:
9512            if events:
9513 LOG.debug('Not waiting for events after pre_live_migration: '
9514 '%s. ', events, instance=instance)
9515 except exception.VirtualInterfacePlugException:
9516 with excutils.save_and_reraise_exception():
9517 LOG.exception('Failed waiting for network virtual interfaces '
9518 'to be plugged on the destination host %s.',
9519 dest, instance=instance)
9520 self._cleanup_pre_live_migration(
9521 context, dest, instance, migration, migrate_data,
9522 source_bdms)
9523 except eventlet.timeout.Timeout:
9524 # We only get here if wait_for_vif_plugged is True which means
9525 # live_migration_wait_for_vif_plug=True on the destination host.
9526 msg = (
9527 'Timed out waiting for events: %(events)s. If these timeouts '
9528 'are a persistent issue it could mean the networking backend '
9529 'on host %(dest)s does not support sending these events '
9530 'unless there are port binding host changes which does not '
9531 'happen at this point in the live migration process. You may '
9532 'need to disable the live_migration_wait_for_vif_plug option '
9533 'on host %(dest)s.')
9534 subs = {'events': events, 'dest': dest}
9535 LOG.warning(msg, subs, instance=instance)
9536 if CONF.vif_plugging_is_fatal:
9537 self._cleanup_pre_live_migration(
9538 context, dest, instance, migration, migrate_data,
9539 source_bdms)
9540 raise exception.MigrationError(reason=msg % subs)
9541 except Exception:
9542 with excutils.save_and_reraise_exception():
9543 LOG.exception('Pre live migration failed at %s',
9544 dest, instance=instance)
9545 self._cleanup_pre_live_migration(
9546 context, dest, instance, migration, migrate_data,
9547 source_bdms)
9548 return migrate_data
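    # The _BreakWaitForInstanceEvent trick above is a generic Python pattern:
    # raising a private exception is one common way to leave a context
    # manager early from inside its body. A standalone sketch of the idea
    # (wait_for_events and do_preparation are illustrative names only):
    #
    #     class _Break(Exception):
    #         pass
    #
    #     try:
    #         with wait_for_events():          # any context manager
    #             result = do_preparation()
    #             if not result.should_wait:
    #                 raise _Break             # __exit__ still runs
    #     except _Break:
    #         pass                             # continue without waiting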
9550 def _do_live_migration(self, context, dest, instance, block_migration,
9551 migration, migrate_data):
9552 # NOTE(danms): We should enhance the RT to account for migrations
9553 # and use the status field to denote when the accounting has been
9554 # done on source/destination. For now, this is just here for status
9555 # reporting
9556 self._set_migration_status(migration, 'preparing')
9557 source_bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
9558 context, instance.uuid)
9560 migrate_data = self._do_pre_live_migration_from_source(
9561 context, dest, instance, block_migration, migration, migrate_data,
9562 source_bdms)
9564 # Set migrate_data.migration because that is how _post_live_migration
9565 # and _rollback_live_migration get the migration object for cleanup.
9566 # Yes this is gross but changing the _post_live_migration and
9567 # _rollback_live_migration interfaces would also mean changing how the
9568 # virt drivers call them from the driver.live_migration method, i.e.
9569 # we would have to pass the migration object through the driver (or
9570 # consider using a partial but some do not like that pattern).
9571 migrate_data.migration = migration
9573 # NOTE(Kevin_Zheng): Pop the migration from the waiting queue
9574        # if it exists in the queue, then we are good to move on; if
9575        # not, some other process must have aborted it and we should
9576        # roll back.
9577 try:
9578 self._waiting_live_migrations.pop(instance.uuid)
9579 except KeyError:
9580 LOG.debug('Migration %s aborted by another process, rollback.',
9581 migration.uuid, instance=instance)
9582 self._rollback_live_migration(context, instance, dest,
9583 migrate_data, 'cancelled',
9584 source_bdms=source_bdms)
9585 self._notify_live_migrate_abort_end(context, instance)
9586 return
9588 self._set_migration_status(migration, 'running')
9590 # NOTE(mdbooth): pre_live_migration will update connection_info and
9591 # attachment_id on all volume BDMS to reflect the new destination
9592 # host attachment. We fetch BDMs before that to retain connection_info
9593 # and attachment_id relating to the source host for post migration
9594 # cleanup.
9595 post_live_migration = functools.partial(
9596 self._post_live_migration_update_host, source_bdms=source_bdms
9597 )
9598 rollback_live_migration = functools.partial(
9599 self._rollback_live_migration, source_bdms=source_bdms)
9601 LOG.debug('live_migration data is %s', migrate_data)
9602 try:
9603 self.driver.live_migration(context, instance, dest,
9604 post_live_migration,
9605 rollback_live_migration,
9606 block_migration, migrate_data)
9607 except Exception:
9608 LOG.exception('Live migration failed.', instance=instance)
9609 with excutils.save_and_reraise_exception():
9610 # Put instance and migration into error state,
9611                # as it's almost certainly too late to roll back
9612 self._set_migration_status(migration, 'error')
9613 # first refresh instance as it may have got updated by
9614 # post_live_migration_at_destination
9615 instance.refresh()
9616 self._set_instance_obj_error_state(instance,
9617 clean_task_state=True)
9619 @wrap_exception()
9620 @wrap_instance_event(prefix='compute')
9621 @errors_out_migration
9622 @wrap_instance_fault
9623 def live_migration(self, context, dest, instance, block_migration,
9624 migration, migrate_data):
9625 """Executing live migration.
9627 :param context: security context
9628 :param dest: destination host
9629 :param instance: a nova.objects.instance.Instance object
9630 :param block_migration: if true, prepare for block migration
9631        :param migration: a nova.objects.Migration object
9632 :param migrate_data: implementation specific params
9634 """
9635 self._set_migration_status(migration, 'queued')
9636 # NOTE(Kevin_Zheng): Submit the live_migration job to the pool and
9637        # put the returned Future object into a dict keyed by instance.uuid
9638 # in order to be able to track and abort it in the future.
9639 self._waiting_live_migrations[instance.uuid] = (None, None)
9640 try:
9641 future = nova.utils.pass_context(
9642 self._live_migration_executor.submit,
9643 self._do_live_migration, context, dest, instance,
9644 block_migration, migration, migrate_data)
9645 self._waiting_live_migrations[instance.uuid] = (migration, future)
9646 except RuntimeError:
9647 # GreenThreadPoolExecutor.submit will raise RuntimeError if the
9648 # pool is shutdown, which happens in
9649 # _cleanup_live_migrations_in_pool.
9650 LOG.info('Migration %s failed to submit as the compute service '
9651 'is shutting down.', migration.uuid, instance=instance)
9652 raise exception.LiveMigrationNotSubmitted(
9653 migration_uuid=migration.uuid, instance_uuid=instance.uuid)
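    # The queued-migration tracking above follows a standard futures pattern
    # (nova uses a futurist green thread pool; the sketch below uses the
    # stdlib executor purely for illustration, with illustrative names):
    #
    #     from concurrent import futures
    #
    #     executor = futures.ThreadPoolExecutor(max_workers=1)
    #     waiting = {}
    #     future = executor.submit(do_live_migration, *args)
    #     waiting[instance_uuid] = (migration, future)
    #     ...
    #     # later, in the abort path:
    #     migration, future = waiting.pop(instance_uuid)
    #     if future.cancel():
    #         pass  # still queued: never ran, only control-plane cleanup needed
    #     else:
    #         pass  # already running: _do_live_migration handles the abort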
9655 @wrap_exception()
9656 @wrap_instance_event(prefix='compute')
9657 @wrap_instance_fault
9658 def live_migration_force_complete(self, context, instance):
9659 """Force live migration to complete.
9661 :param context: Security context
9662 :param instance: The instance that is being migrated
9663 """
9665 self._notify_about_instance_usage(
9666 context, instance, 'live.migration.force.complete.start')
9667 compute_utils.notify_about_instance_action(
9668 context, instance, self.host,
9669 action=fields.NotificationAction.LIVE_MIGRATION_FORCE_COMPLETE,
9670 phase=fields.NotificationPhase.START)
9671 self.driver.live_migration_force_complete(instance)
9672 self._notify_about_instance_usage(
9673 context, instance, 'live.migration.force.complete.end')
9674 compute_utils.notify_about_instance_action(
9675 context, instance, self.host,
9676 action=fields.NotificationAction.LIVE_MIGRATION_FORCE_COMPLETE,
9677 phase=fields.NotificationPhase.END)
9679 def _notify_live_migrate_abort_end(self, context, instance):
9680 self._notify_about_instance_usage(
9681 context, instance, 'live.migration.abort.end')
9682 compute_utils.notify_about_instance_action(
9683 context, instance, self.host,
9684 action=fields.NotificationAction.LIVE_MIGRATION_ABORT,
9685 phase=fields.NotificationPhase.END)
9687 @wrap_exception()
9688 @wrap_instance_event(prefix='compute')
9689 @wrap_instance_fault
9690 def live_migration_abort(self, context, instance, migration_id):
9691 """Abort an in-progress live migration.
9693 :param context: Security context
9694 :param instance: The instance that is being migrated
9695 :param migration_id: ID of in-progress live migration
9697 """
9698 self._notify_about_instance_usage(
9699 context, instance, 'live.migration.abort.start')
9700 compute_utils.notify_about_instance_action(
9701 context, instance, self.host,
9702 action=fields.NotificationAction.LIVE_MIGRATION_ABORT,
9703 phase=fields.NotificationPhase.START)
9704        # NOTE(Kevin_Zheng): Pop the migration out of the queue; this might
9705 # lead to 3 scenarios:
9706 # 1. The selected migration is still in queue, and the future.cancel()
9707 # succeed, then the abort action is succeed, mark the migration
9708 # status to 'cancelled'.
9709 # 2. The selected migration is still in queue, but the future.cancel()
9710        #    failed; that means _do_live_migration() has started executing and
9711        #    the migration status is 'preparing', so we just pop it from the
9712        #    queue, and the migration process will handle it later. And the
9713 # migration status couldn't be 'running' in this scenario because
9714 # if _do_live_migration has started executing and we've already
9715 # popped it from the queue and set the migration status to
9716 # 'running' at this point, popping it here will raise KeyError at
9717 # which point we check if it's running and if so, we abort the old
9718 # way.
9719 # 3. The selected migration is not in the queue, then the migration
9720 # status is 'running', let the driver handle it.
9721 try:
9722 migration, future = (
9723 self._waiting_live_migrations.pop(instance.uuid))
9724            if future and future.cancel():
9725 # If we got here, we've successfully dropped a queued
9726 # migration from the queue, so _do_live_migration won't run
9727 # and we only need to revert minor changes introduced by Nova
9728 # control plane (port bindings, resource allocations and
9729 # instance's PCI devices), restore VM's state, set the
9730 # migration's status to cancelled and send the notification.
9731 # If Future.cancel() fails, it means _do_live_migration is
9732 # running and the migration status is preparing, and
9733 # _do_live_migration() itself will attempt to pop the queued
9734 # migration, hit a KeyError, and rollback, set the migration
9735 # to cancelled and send the live.migration.abort.end
9736 # notification.
9737 self._revert_allocation(context, instance, migration)
9738 try:
9739 # This call will delete any inactive destination host
9740 # port bindings.
9741 self.network_api.setup_networks_on_host(
9742 context, instance, host=migration.dest_compute,
9743 teardown=True)
9744 except exception.PortBindingDeletionFailed as e:
9745 # Removing the inactive port bindings from the destination
9746 # host is not critical so just log an error but don't fail.
9747 LOG.error(
9748 'Network cleanup failed for destination host %s '
9749 'during live migration rollback. You may need to '
9750 'manually clean up resources in the network service. '
9751 'Error: %s', migration.dest_compute, str(e))
9752 except Exception:
9753 with excutils.save_and_reraise_exception():
9754 LOG.exception(
9755 'An error occurred while cleaning up networking '
9756 'during live migration rollback.',
9757 instance=instance)
9758 instance.task_state = None
9759 instance.save(expected_task_state=[task_states.MIGRATING])
9760 self._set_migration_status(migration, 'cancelled')
9761 except KeyError:
9762 migration = objects.Migration.get_by_id(context, migration_id)
9763 if migration.status != 'running':
9764 raise exception.InvalidMigrationState(
9765 migration_id=migration_id, instance_uuid=instance.uuid,
9766 state=migration.status, method='abort live migration')
9767 self.driver.live_migration_abort(instance)
9768 self._notify_live_migrate_abort_end(context, instance)
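# NOTE(editor): Illustrative sketch, not part of this module. The three
# abort scenarios described above hinge on Future.cancel() semantics:
# cancel() only succeeds while the submitted job is still queued; once a
# worker has started it (the 'preparing' case), cancel() returns False and
# the running job must notice the abort itself. A minimal stand-alone
# demonstration using the stdlib executor instead of nova's futurist pool;
# waiting_migrations is a hypothetical stand-in for
# self._waiting_live_migrations.
import time
from concurrent.futures import ThreadPoolExecutor

def _do_live_migration_stub(uuid):
    time.sleep(0.1)  # stand-in for the real migration work
    return uuid

executor = ThreadPoolExecutor(max_workers=1)
waiting_migrations = {
    # only one worker, so the second submission stays queued
    'uuid-1': executor.submit(_do_live_migration_stub, 'uuid-1'),
    'uuid-2': executor.submit(_do_live_migration_stub, 'uuid-2'),
}

future = waiting_migrations.pop('uuid-2')
if future.cancel():
    # scenario 1: still queued, the abort succeeds immediately
    print('uuid-2 cancelled while queued')
else:
    # scenario 2: already executing, fall back to aborting the running job
    print('uuid-2 already running, falling back to driver abort')
executor.shutdown(wait=True)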
9770 def _live_migration_cleanup_flags(self, migrate_data, migr_ctxt=None):
9771 """Determine whether disks, instance path or other resources
9772 need to be cleaned up after live migration (at source on success,
9773 at destination on rollback)
9775 Block migration needs an empty image at the destination host before the
9776 migration starts, so if any failure occurs those empty images have to be deleted.
9778 Volume-backed live migration without shared storage also needs to delete the
9779 newly created instance-xxx dir on the destination as part of its
9780 rollback process.
9782 There may be other resources which need cleanup; currently this is
9783 limited to vPMEM and mdev devices with the libvirt driver.
9785 :param migrate_data: implementation specific data
9786 :param migr_ctxt: specific resources stored in migration_context
9787 :returns: (bool, bool) -- do_cleanup, destroy_disks
9788 """
9789 # NOTE(pkoniszewski): block migration specific params are set inside
9790 # migrate_data objects for drivers that expose block live migration
9791 # information (i.e. Libvirt, HyperV). For other drivers cleanup is not
9792 # needed.
9793 do_cleanup = False
9794 destroy_disks = False
9795 if isinstance(migrate_data, migrate_data_obj.LibvirtLiveMigrateData):
9796 has_vpmem = False
9797 if migr_ctxt and migr_ctxt.old_resources:
9798 for resource in migr_ctxt.old_resources: 9798 ↛ 9804 (line 9798 didn't jump to line 9804 because the loop on line 9798 didn't complete)
9799 if ('metadata' in resource and 9799 ↛ 9798 (line 9799 didn't jump to line 9798 because the condition on line 9799 was always true)
9800 isinstance(resource.metadata,
9801 objects.LibvirtVPMEMDevice)):
9802 has_vpmem = True
9803 break
9804 has_mdevs = 'target_mdevs' in migrate_data
9805 power_management_possible = (
9806 'dst_numa_info' in migrate_data and
9807 migrate_data.dst_numa_info is not None)
9808 # No instance is booting at the source host, but the instance dir
9809 # must be deleted to prepare for the next block migration or the
9810 # next live migration without shared storage.
9811 # vpmem (and likewise mdevs and power-managed resources) must also
9812 # be cleaned up.
9813 do_cleanup = (not migrate_data.is_shared_instance_path or
9814 has_vpmem or has_mdevs or power_management_possible)
9815 destroy_disks = not (
9816 migrate_data.is_shared_block_storage or
9817 migrate_data.is_shared_instance_path)
9818 elif isinstance(migrate_data, migrate_data_obj.HyperVLiveMigrateData):
9819 # NOTE(claudiub): We need to cleanup any zombie Planned VM.
9820 do_cleanup = True
9821 destroy_disks = not migrate_data.is_shared_instance_path
9823 return (do_cleanup, destroy_disks)
9825 def _post_live_migration_remove_source_vol_connections(
9826 self, context, instance, source_bdms):
9827 """Disconnect volume connections from the source host during
9828 _post_live_migration.
9830 :param context: nova auth RequestContext
9831 :param instance: Instance object being live migrated
9832 :param source_bdms: BlockDeviceMappingList representing the attached
9833 volumes with connection_info set for the source host
9834 """
9835 # Detaching volumes.
9836 connector = None
9837 for bdm in source_bdms:
9838 if bdm.is_volume:
9839 # Detaching volumes is a call to an external API that can fail.
9840 # If it does, we need to handle it gracefully so that the call
9841 # to post_live_migration_at_destination - where we set instance
9842 # host and task state - still happens. We need to rethink the
9843 # current approach of setting instance host and task state
9844 # AFTER a whole bunch of things that could fail in unhandled
9845 # ways, but that is left as a TODO(artom).
9846 try:
9847 if bdm.attachment_id is None:
9848 # Prior to cinder v3.44:
9849 # We don't want to actually mark the volume detached,
9850 # or delete the bdm, just remove the connection from
9851 # this host.
9852 #
9853 # remove the volume connection without detaching from
9854 # hypervisor because the instance is not running
9855 # anymore on the current host
9856 if connector is None: 9856 ↛ 9859 (line 9856 didn't jump to line 9859 because the condition on line 9856 was always true)
9857 connector = self.driver.get_volume_connector(
9858 instance)
9859 self.volume_api.terminate_connection(context,
9860 bdm.volume_id,
9861 connector)
9862 else:
9863 # cinder v3.44 api flow - delete the old attachment
9864 # for the source host
9865 self.volume_api.attachment_delete(context,
9866 bdm.attachment_id)
9868 except Exception as e:
9869 if bdm.attachment_id is None:
9870 LOG.error('Connection for volume %s not terminated on '
9871 'source host %s during post_live_migration: '
9872 '%s', bdm.volume_id, self.host,
9873 str(e), instance=instance)
9874 else:
9875 LOG.error('Volume attachment %s not deleted on source '
9876 'host %s during post_live_migration: %s',
9877 bdm.attachment_id, self.host,
9878 str(e), instance=instance)
9880 # TODO(sean-k-mooney): add typing
9881 def _post_live_migration_update_host(
9882 self, ctxt, instance, dest, block_migration=False,
9883 migrate_data=None, source_bdms=None
9884 ):
9885 try:
9886 self._post_live_migration(
9887 ctxt, instance, dest, block_migration, migrate_data,
9888 source_bdms)
9889 except Exception:
9890 # Restore the instance object
9891 compute_node = None
9892 node_name = None
9893 try:
9894 # get node name of compute, where instance will be
9895 # running after migration, that is destination host
9896 compute_node = self._get_compute_info(ctxt, dest)
9897 node_name = compute_node.hypervisor_hostname
9898 except exception.ComputeHostNotFound:
9899 LOG.exception('Failed to get compute_info for %s', dest)
9901 # we can never rollback from post live migration and we can only
9902 # get here if the instance is running on the dest so we ensure
9903 # the instance.host is set correctly and reraise the original
9904 # exception unmodified.
9905 if instance.host != dest: 9905 ↛ 9916 (line 9905 didn't jump to line 9916 because the condition on line 9905 was always true)
9906 # apply saves the new fields while drop actually removes the
9907 # migration context from the instance, so migration persists.
9908 instance.apply_migration_context()
9909 instance.drop_migration_context()
9910 instance.host = dest
9911 instance.task_state = None
9912 instance.node = node_name
9913 instance.compute_id = compute_node and compute_node.id or None
9914 instance.progress = 0
9915 instance.save()
9916 raise
9918 @wrap_exception()
9919 @wrap_instance_fault
9920 def _post_live_migration(self, ctxt, instance, dest,
9921 block_migration=False, migrate_data=None,
9922 source_bdms=None):
9923 """Post operations for live migration.
9925 This method is called on the live migration source host and mainly
9926 updates the database record.
9928 :param ctxt: security context
9929 :param instance: instance object
9930 :param dest: destination host
9931 :param block_migration: if true, prepare for block migration
9932 :param migrate_data: if not None, it is a dict which has data
9933 required for live migration without shared storage
9934 :param source_bdms: BDMs prior to modification by the destination
9935 compute host. Set by _do_live_migration and not
9936 part of the callback interface, so this is never
9937 None
9939 """
9940 LOG.info('_post_live_migration() is started..',
9941 instance=instance)
9943 # NOTE(artom) The ordering and exception handling are important here.
9944 # We want to immediately activate the port bindings on the destination
9945 # host to minimize network downtime. This happens in
9946 # migrate_instance_start(). It's also crucial that we call
9947 # _post_live_migration_remove_source_vol_connections() to clean up
9948 # source volume connections and prevent potential data leaks. We
9949 # therefore activate the port bindings in a try block, and, regardless
9950 # of any exceptions during that process, clean up volume connections in
9951 # a finally block.
9952 try:
9953 # NOTE(artom) At this point in time we have not bound the ports to
9954 # the destination host yet (this happens in
9955 # migrate_instance_start() below). Therefore, the "old" source
9956 # network info that's still in the instance info cache is safe to
9957 # use here, since it'll be used below during
9958 # driver.post_live_migration_at_source() to unplug the VIFs on the
9959 # source.
9960 network_info = instance.get_network_info()
9962 self._notify_about_instance_usage(ctxt, instance,
9963 "live_migration._post.start",
9964 network_info=network_info,
9965 best_effort=True)
9966 compute_utils.notify_about_instance_action(
9967 ctxt, instance, self.host,
9968 action=fields.NotificationAction.LIVE_MIGRATION_POST,
9969 phase=fields.NotificationPhase.START, best_effort=True)
9971 migration = objects.Migration(
9972 source_compute=self.host, dest_compute=dest,
9973 )
9974 # For neutron, migrate_instance_start will activate the destination
9975 # host port bindings, if there are any created by conductor before
9976 # live migration started.
9977 self.network_api.migrate_instance_start(ctxt, instance, migration)
9978 finally:
9979 # Cleanup source host post live-migration
9980 block_device_info = self._get_instance_block_device_info(
9981 ctxt, instance, bdms=source_bdms)
9982 self.driver.post_live_migration(ctxt, instance, block_device_info,
9983 migrate_data)
9985 # Disconnect volumes from this (the source) host.
9986 self._post_live_migration_remove_source_vol_connections(
9987 ctxt, instance, source_bdms)
9989 destroy_vifs = False
9990 try:
9991 # It's possible that the vif type changed on the destination
9992 # host and is already bound and active, so we need to use the
9993 # stashed source vifs in migrate_data.vifs (if present) to unplug
9994 # on the source host.
9995 unplug_nw_info = network_info
9996 if migrate_data and 'vifs' in migrate_data:
9997 nw_info = []
9998 for migrate_vif in migrate_data.vifs:
9999 nw_info.append(migrate_vif.source_vif)
10000 unplug_nw_info = network_model.NetworkInfo.hydrate(nw_info)
10001 LOG.debug('Calling driver.post_live_migration_at_source '
10002 'with original source VIFs from migrate_data: %s',
10003 unplug_nw_info, instance=instance)
10004 self.driver.post_live_migration_at_source(ctxt, instance,
10005 unplug_nw_info)
10006 except NotImplementedError as ex:
10007 LOG.debug(ex, instance=instance)
10008 # For all hypervisors other than libvirt, there is a possibility
10009 # they are unplugging networks from source node in the cleanup
10010 # method
10011 destroy_vifs = True
10013 # Free instance allocations on source before claims are allocated on
10014 # destination node
10015 self.rt.free_pci_device_allocations_for_instance(ctxt, instance)
10016 # NOTE(danms): Save source node before calling post method on
10017 # destination, which will update it
10018 source_node = instance.node
10020 do_cleanup, destroy_disks = self._live_migration_cleanup_flags(
10021 migrate_data, migr_ctxt=instance.migration_context)
10023 if do_cleanup:
10024 LOG.debug('Calling driver.cleanup from _post_live_migration',
10025 instance=instance)
10026 self.driver.cleanup(ctxt, instance, unplug_nw_info,
10027 destroy_disks=destroy_disks,
10028 migrate_data=migrate_data,
10029 destroy_vifs=destroy_vifs)
10031 # Define domain at destination host, without doing it,
10032 # pause/suspend/terminate do not work.
10033 post_at_dest_success = True
10034 try:
10035 self.compute_rpcapi.post_live_migration_at_destination(ctxt,
10036 instance, block_migration, dest)
10037 except Exception as error:
10038 post_at_dest_success = False
10039 # If post_live_migration_at_destination() fails, we now have the
10040 # _post_live_migration_update_host() method that will handle
10041 # this case.
10042 LOG.exception("Post live migration at destination %s failed",
10043 dest, instance=instance, error=error)
10044 raise
10046 self.instance_events.clear_events_for_instance(instance)
10048 # NOTE(timello): make sure we update available resources on source
10049 # host even before next periodic task.
10050 self.update_available_resource(ctxt)
10052 self._update_scheduler_instance_info(ctxt, instance)
10053 self._notify_about_instance_usage(ctxt, instance,
10054 "live_migration._post.end",
10055 network_info=network_info)
10056 compute_utils.notify_about_instance_action(
10057 ctxt, instance, self.host,
10058 action=fields.NotificationAction.LIVE_MIGRATION_POST,
10059 phase=fields.NotificationPhase.END)
10060 if post_at_dest_success: 10060 ↛ 10064 (line 10060 didn't jump to line 10064 because the condition on line 10060 was always true)
10061 LOG.info('Migrating instance to %s finished successfully.',
10062 dest, instance=instance)
10064 self._clean_instance_console_tokens(ctxt, instance)
10065 if migrate_data and migrate_data.obj_attr_is_set('migration'):
10066 migrate_data.migration.status = 'completed'
10067 migrate_data.migration.save()
10068 self._delete_allocation_after_move(ctxt,
10069 instance,
10070 migrate_data.migration)
10071 else:
10072 # We didn't have data on a migration, which means we can't
10073 # look up to see if we had new-style migration-based
10074 # allocations. This should really only happen in cases of
10075 # a buggy virt driver. Log a warning so we know it happened.
10076 LOG.warning('Live migration ended with no migrate_data '
10077 'record. Unable to clean up migration-based '
10078 'allocations for node %s which is almost certainly '
10079 'not an expected situation.', source_node,
10080 instance=instance)
10082 def _consoles_enabled(self):
10083 """Return whether any console is enabled."""
10084 return (CONF.vnc.enabled or CONF.spice.enabled or
10085 CONF.serial_console.enabled or
10086 CONF.mks.enabled)
10088 def _clean_instance_console_tokens(self, ctxt, instance):
10089 """Clean console tokens stored for an instance."""
10090 # If the database backend isn't in use, don't bother trying to clean
10091 # tokens.
10092 if self._consoles_enabled():
10093 objects.ConsoleAuthToken.\
10094 clean_console_auths_for_instance(ctxt, instance.uuid)
10096 @wrap_exception()
10097 @wrap_instance_event(prefix='compute')
10098 @wrap_instance_fault
10099 def post_live_migration_at_destination(self, context, instance,
10100 block_migration):
10101 """Post operations for live migration.
10103 :param context: security context
10104 :param instance: Instance dict
10105 :param block_migration: if true, prepare for block migration
10107 """
10108 LOG.info('Post operation of migration started',
10109 instance=instance)
10111 # NOTE(tr3buchet): setup networks on destination host
10112 # this is called a second time because
10113 # multi_host does not create the bridge in
10114 # plug_vifs
10115 # NOTE(mriedem): This is a no-op for neutron.
10116 self.network_api.setup_networks_on_host(context, instance,
10117 self.host)
10118 migration = objects.Migration(
10119 source_compute=instance.host,
10120 dest_compute=self.host,
10121 migration_type=fields.MigrationType.LIVE_MIGRATION)
10122 self.network_api.migrate_instance_finish(
10123 context, instance, migration, provider_mappings=None)
10125 network_info = self.network_api.get_instance_nw_info(context, instance)
10126 self._notify_about_instance_usage(
10127 context, instance, "live_migration.post.dest.start",
10128 network_info=network_info)
10129 compute_utils.notify_about_instance_action(context, instance,
10130 self.host,
10131 action=fields.NotificationAction.LIVE_MIGRATION_POST_DEST,
10132 phase=fields.NotificationPhase.START)
10133 block_device_info = self._get_instance_block_device_info(context,
10134 instance)
10135 # Allocate the claimed PCI resources at destination.
10136 self.rt.allocate_pci_devices_for_instance(context, instance)
10138 try:
10139 self.driver.post_live_migration_at_destination(
10140 context, instance, network_info, block_migration,
10141 block_device_info)
10142 except Exception:
10143 with excutils.save_and_reraise_exception():
10144 instance.vm_state = vm_states.ERROR
10145 LOG.error('Unexpected error during post live migration at '
10146 'destination host.', instance=instance)
10147 finally:
10148 # Restore instance state and update host
10149 current_power_state = self._get_power_state(instance)
10150 compute_node = None
10151 node_name = None
10152 prev_host = instance.host
10153 try:
10154 compute_node = self._get_compute_info(context, self.host)
10155 node_name = compute_node.hypervisor_hostname
10156 except exception.ComputeHostNotFound:
10157 LOG.exception('Failed to get compute_info for %s', self.host)
10158 finally:
10159 # NOTE(artom) We need to apply the migration context here
10160 # regardless of whether the driver's
10161 # post_live_migration_at_destination succeeded or not: the
10162 # instance is on the destination, potentially with a new NUMA
10163 # topology and resource usage. We need to persist that.
10164 # NOTE(artom) Apply followed by drop looks weird, but apply
10165 # just saves the new fields while drop actually removes the
10166 # migration context from the instance.
10167 instance.apply_migration_context()
10168 instance.drop_migration_context()
10169 instance.host = self.host
10170 instance.power_state = current_power_state
10171 instance.task_state = None
10172 instance.node = node_name
10173 instance.compute_id = compute_node and compute_node.id or None
10174 instance.progress = 0
10175 instance.save(expected_task_state=task_states.MIGRATING)
10177 # NOTE(tr3buchet): tear down networks on source host (nova-net)
10178 # NOTE(mriedem): For neutron, this will delete any inactive source
10179 # host port bindings.
10180 try:
10181 self.network_api.setup_networks_on_host(context, instance,
10182 prev_host, teardown=True)
10183 except exception.PortBindingDeletionFailed as e:
10184 # Removing the inactive port bindings from the source host is not
10185 # critical so just log an error but don't fail.
10186 LOG.error('Network cleanup failed for source host %s during post '
10187 'live migration. You may need to manually clean up '
10188 'resources in the network service. Error: %s',
10189 prev_host, str(e))
10190 # NOTE(vish): this is necessary to update dhcp for nova-network
10191 # NOTE(mriedem): This is a no-op for neutron.
10192 self.network_api.setup_networks_on_host(context, instance, self.host)
10193 self._notify_about_instance_usage(
10194 context, instance, "live_migration.post.dest.end",
10195 network_info=network_info)
10196 compute_utils.notify_about_instance_action(context, instance,
10197 self.host,
10198 action=fields.NotificationAction.LIVE_MIGRATION_POST_DEST,
10199 phase=fields.NotificationPhase.END)
10201 def _remove_remote_volume_connections(self, context, dest, bdms, instance):
10202 """Rollback remote volume connections on the dest"""
10203 for bdm in bdms:
10204 try:
10205 # remove the connection on the destination host
10206 # NOTE(lyarwood): This actually calls the cinderv2
10207 # os-terminate_connection API if required.
10208 self.compute_rpcapi.remove_volume_connection(
10209 context, instance, bdm.volume_id, dest)
10210 except Exception:
10211 LOG.warning("Ignoring exception while attempting "
10212 "to rollback volume connections for "
10213 "volume %s on host %s.", bdm.volume_id,
10214 dest, instance=instance)
10216 def _rollback_volume_bdms(self, context, bdms, original_bdms, instance):
10217 """Rollback the connection_info and attachment_id for each bdm"""
10218 original_bdms_by_volid = {bdm.volume_id: bdm for bdm in original_bdms
10219 if bdm.is_volume}
10220 for bdm in bdms:
10221 try:
10222 original_bdm = original_bdms_by_volid[bdm.volume_id]
10223 # NOTE(lyarwood): Only delete the referenced attachment if it
10224 # is different to the original in order to avoid accidentally
10225 # removing the source host volume attachment after it has
10226 # already been rolled back by a failure in pre_live_migration.
10227 if (bdm.attachment_id and original_bdm.attachment_id and
10228 bdm.attachment_id != original_bdm.attachment_id):
10229 # NOTE(lyarwood): 3.44 cinder api flow. Delete the
10230 # attachment used by the bdm and reset it to that of
10231 # the original bdm.
10232 self.volume_api.attachment_delete(context,
10233 bdm.attachment_id)
10234 bdm.attachment_id = original_bdm.attachment_id
10235 # NOTE(lyarwood): Reset the connection_info to the original
10236 bdm.connection_info = original_bdm.connection_info
10237 bdm.save()
10238 except cinder_exception.ClientException:
10239 LOG.warning("Ignoring cinderclient exception when "
10240 "attempting to delete attachment %s for volume "
10241 "%s while rolling back volume bdms.",
10242 bdm.attachment_id, bdm.volume_id,
10243 instance=instance)
10244 except Exception:
10245 with excutils.save_and_reraise_exception():
10246 LOG.exception("Exception while attempting to rollback "
10247 "BDM for volume %s.", bdm.volume_id,
10248 instance=instance)
10250 @wrap_exception()
10251 @wrap_instance_fault
10252 def _rollback_live_migration(self, context, instance,
10253 dest, migrate_data=None,
10254 migration_status='failed',
10255 source_bdms=None,
10256 pre_live_migration=False):
10257 """Recovers Instance/volume state from migrating -> running.
10259 :param context: security context
10260 :param instance: nova.objects.instance.Instance object
10261 :param dest: destination host. This method is called on the live
10262 migration source host; this param specifies the host the instance
10263 was being migrated to.
10264 :param migrate_data:
10265 if not none, contains implementation specific data.
10266 :param migration_status:
10267 Contains the status we want to set for the migration object
10268 :param source_bdms: BDMs prior to modification by the destination
10269 compute host. Set by _do_live_migration and not
10270 part of the callback interface, so this is never
10271 None
10273 """
10274 # NOTE(gibi): We need to refresh pci_requests of the instance as it
10275 # might be changed by the conductor during scheduling based on the
10276 # selected destination host. If the instance has SRIOV ports with
10277 # resource request then the LiveMigrationTask._find_destination call
10278 # updated the instance.pci_requests.requests[].spec with the SRIOV PF
10279 # device name to be used on the destination host. As the migration is
10280 # rolling back to the source host now we don't want to persist the
10281 # destination host related changes in the DB.
10282 instance.pci_requests = \
10283 objects.InstancePCIRequests.get_by_instance_uuid(
10284 context, instance.uuid)
10286 if (isinstance(migrate_data, migrate_data_obj.LiveMigrateData) and
10287 migrate_data.obj_attr_is_set('migration')):
10288 migration = migrate_data.migration
10289 else:
10290 migration = None
10292 if migration:
10293 # Remove allocations created in Placement for the dest node.
10294 # If migration is None, the virt driver didn't pass it which is
10295 # a bug.
10296 self._revert_allocation(context, instance, migration)
10297 else:
10298 LOG.error('Unable to revert allocations during live migration '
10299 'rollback; compute driver did not provide migrate_data',
10300 instance=instance)
10302 # NOTE(tr3buchet): setup networks on source host (really it's re-setup
10303 # for nova-network)
10304 # NOTE(mriedem): This is a no-op for neutron.
10305 self.network_api.setup_networks_on_host(context, instance, self.host)
10307 # NOTE(erlon): We should make sure that rollback_live_migration_at_src
10308 # is not called in the pre_live_migration rollback as that will trigger
10309 # the src host to re-attach interfaces which were not detached
10310 # previously.
10311 if not pre_live_migration:
10312 self.driver.rollback_live_migration_at_source(context, instance,
10313 migrate_data)
10315 # NOTE(lyarwood): Fetch the current list of BDMs, disconnect any
10316 # connected volumes from the dest and delete any volume attachments
10317 # used by the destination host before rolling back to the original
10318 # still valid source host volume attachments.
10319 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
10320 context, instance.uuid)
10321 # TODO(lyarwood): Turn the following into a lookup method within
10322 # BlockDeviceMappingList.
10323 vol_bdms = [bdm for bdm in bdms if bdm.is_volume]
10324 self._remove_remote_volume_connections(context, dest, vol_bdms,
10325 instance)
10326 self._rollback_volume_bdms(context, vol_bdms, source_bdms, instance)
10328 self._notify_about_instance_usage(context, instance,
10329 "live_migration._rollback.start")
10330 compute_utils.notify_about_instance_action(context, instance,
10331 self.host,
10332 action=fields.NotificationAction.LIVE_MIGRATION_ROLLBACK,
10333 phase=fields.NotificationPhase.START,
10334 bdms=bdms)
10336 do_cleanup, destroy_disks = self._live_migration_cleanup_flags(
10337 migrate_data, migr_ctxt=instance.migration_context)
10339 if do_cleanup:
10340 self.compute_rpcapi.rollback_live_migration_at_destination(
10341 context, instance, dest, destroy_disks=destroy_disks,
10342 migrate_data=migrate_data)
10343 else:
10344 # The port binding profiles need to be cleaned up.
10345 with errors_out_migration_ctxt(migration):
10346 try:
10347 # This call will delete any inactive destination host
10348 # port bindings.
10349 self.network_api.setup_networks_on_host(
10350 context, instance, host=dest, teardown=True)
10351 except exception.PortBindingDeletionFailed as e:
10352 # Removing the inactive port bindings from the destination
10353 # host is not critical so just log an error but don't fail.
10354 LOG.error(
10355 'Network cleanup failed for destination host %s '
10356 'during live migration rollback. You may need to '
10357 'manually clean up resources in the network service. '
10358 'Error: %s', dest, str(e))
10359 except Exception:
10360 with excutils.save_and_reraise_exception():
10361 LOG.exception(
10362 'An error occurred while cleaning up networking '
10363 'during live migration rollback.',
10364 instance=instance)
10366 # NOTE(luyao): We drop move_claim and migration_context after cleanup
10367 # is complete, to ensure the specific resources claimed on destination
10368 # are released safely.
10369 # TODO(artom) drop_move_claim_at_destination() is new in RPC 5.3, only
10370 # call it if we performed a NUMA-aware live migration (which implies us
10371 # being able to send RPC 5.3). To check this, we can use the
10372 # src_supports_numa_live_migration flag, as it will be set if and only
10373 # if:
10374 # - dst_supports_numa_live_migration made its way to the source
10375 # (meaning both dest and source are new and conductor can speak
10376 # RPC 5.3)
10377 # - src_supports_numa_live_migration was set by the source driver and
10378 # passed the send-RPC-5.3 check.
10379 # This check can be removed in RPC 6.0.
10380 if ('src_supports_numa_live_migration' in migrate_data and
10381 migrate_data.src_supports_numa_live_migration):
10382 LOG.debug('Calling destination to drop move claim.',
10383 instance=instance)
10384 self.compute_rpcapi.drop_move_claim_at_destination(context,
10385 instance, dest)
10387 # NOTE(luyao): We only update instance info after rollback operations
10388 # are complete
10389 instance.task_state = None
10390 instance.progress = 0
10391 instance.drop_migration_context()
10392 instance.save(expected_task_state=[task_states.MIGRATING])
10394 self._notify_about_instance_usage(context, instance,
10395 "live_migration._rollback.end")
10396 compute_utils.notify_about_instance_action(context, instance,
10397 self.host,
10398 action=fields.NotificationAction.LIVE_MIGRATION_ROLLBACK,
10399 phase=fields.NotificationPhase.END,
10400 bdms=bdms)
10402 # NOTE(luyao): we have cleaned up everything and brought the instance
10403 # back to normal status; now set the migration status to 'failed'
10404 self._set_migration_status(migration, migration_status)
10406 @wrap_exception()
10407 @wrap_instance_fault
10408 def drop_move_claim_at_destination(self, context, instance):
10409 """Called by the source of a live migration during rollback to ask the
10410 destination to drop the MoveClaim object that was created for the live
10411 migration on the destination.
10412 """
10413 nodename = self._get_nodename(instance)
10414 LOG.debug('Dropping live migration resource claim on destination '
10415 'node %s', nodename, instance=instance)
10416 self.rt.drop_move_claim(
10417 context, instance, nodename, flavor=instance.flavor)
10419 @wrap_exception()
10420 @wrap_instance_event(prefix='compute')
10421 @wrap_instance_fault
10422 def rollback_live_migration_at_destination(self, context, instance,
10423 destroy_disks,
10424 migrate_data):
10425 """Clean up the image directory that was created by pre_live_migration.
10427 :param context: security context
10428 :param instance: a nova.objects.instance.Instance object sent over rpc
10429 :param destroy_disks: whether to destroy volumes or not
10430 :param migrate_data: contains migration info
10431 """
10432 network_info = self.network_api.get_instance_nw_info(context, instance)
10433 self._notify_about_instance_usage(
10434 context, instance, "live_migration.rollback.dest.start",
10435 network_info=network_info)
10436 compute_utils.notify_about_instance_action(
10437 context, instance, self.host,
10438 action=fields.NotificationAction.LIVE_MIGRATION_ROLLBACK_DEST,
10439 phase=fields.NotificationPhase.START)
10440 try:
10441 # NOTE(tr3buchet): tear down networks on dest host (nova-net)
10442 # NOTE(mriedem): For neutron, this call will delete any
10443 # destination host port bindings.
10444 # TODO(mriedem): We should eventually remove this call from
10445 # this method (rollback_live_migration_at_destination) since this
10446 # method is only called conditionally based on whether or not the
10447 # instance is running on shared storage. _rollback_live_migration
10448 # already calls this method for neutron if we are running on
10449 # shared storage.
10450 self.network_api.setup_networks_on_host(context, instance,
10451 self.host, teardown=True)
10452 except exception.PortBindingDeletionFailed as e:
10453 # Removing the inactive port bindings from the destination
10454 # host is not critical so just log an error but don't fail.
10455 LOG.error(
10456 'Network cleanup failed for destination host %s '
10457 'during live migration rollback. You may need to '
10458 'manually clean up resources in the network service. '
10459 'Error: %s', self.host, str(e))
10460 except Exception:
10461 with excutils.save_and_reraise_exception():
10462 # NOTE(tdurakov): even if teardown networks fails driver
10463 # should try to rollback live migration on destination.
10464 LOG.exception('An error occurred while deallocating network.',
10465 instance=instance)
10466 finally:
10467 # always run this even if setup_networks_on_host fails
10468 # NOTE(vish): The mapping is passed in so the driver can disconnect
10469 # from remote volumes if necessary
10470 block_device_info = self._get_instance_block_device_info(context,
10471 instance)
10472 # free any instance PCI claims done on destination during
10473 # check_can_live_migrate_destination()
10474 self.rt.free_pci_device_claims_for_instance(context, instance)
10476 # NOTE(luyao): Apply migration_context temporarily since it's
10477 # on destination host, we rely on instance object to cleanup
10478 # specific resources like vpmem
10479 with instance.mutated_migration_context():
10480 self.driver.rollback_live_migration_at_destination(
10481 context, instance, network_info, block_device_info,
10482 destroy_disks=destroy_disks, migrate_data=migrate_data)
10484 self._notify_about_instance_usage(
10485 context, instance, "live_migration.rollback.dest.end",
10486 network_info=network_info)
10487 compute_utils.notify_about_instance_action(
10488 context, instance, self.host,
10489 action=fields.NotificationAction.LIVE_MIGRATION_ROLLBACK_DEST,
10490 phase=fields.NotificationPhase.END)
10492 def _require_nw_info_update(self, context, instance):
10493 """Detect whether there is a mismatch in binding:host_id, or
10494 binding_failed or unbound binding:vif_type for any of the instance's
10495 ports.
10496 """
10497 # Only update port bindings if compute manager does manage port
10498 # bindings instead of the compute driver. For example IronicDriver
10499 # manages the port binding for baremetal instance ports, hence,
10500 # external intervention with the binding is not desired.
10501 if self.driver.manages_network_binding_host_id():
10502 return False
10504 search_opts = {'device_id': instance.uuid,
10505 'fields': ['binding:host_id', 'binding:vif_type']}
10506 ports = self.network_api.list_ports(context, **search_opts)
10507 for p in ports['ports']:
10508 if p.get('binding:host_id') != self.host:
10509 return True
10510 vif_type = p.get('binding:vif_type')
10511 if (vif_type == network_model.VIF_TYPE_UNBOUND or
10512 vif_type == network_model.VIF_TYPE_BINDING_FAILED):
10513 return True
10514 return False
10516 @periodic_task.periodic_task(
10517 spacing=CONF.heal_instance_info_cache_interval
10518 if CONF.heal_instance_info_cache_interval != 0
10519 else -1)
10520 def _heal_instance_info_cache(self, context):
10521 """Called periodically. On every call, try to update the
10522 info_cache's network information for another instance by
10523 calling out to the network API.
10525 This is implemented by keeping a cache of uuids of instances
10526 that live on this host. On each call, we pop one off of a
10527 list, pull the DB record, and try the call to the network API.
10528 If anything errors, don't fail, as it's possible the instance
10529 has been deleted, etc.
10530 """
10531 instance_uuids = getattr(self, '_instance_uuids_to_heal', [])
10532 instance = None
10534 LOG.debug('Starting heal instance info cache')
10536 if not instance_uuids:
10537 # The list of instances to heal is empty so rebuild it
10538 LOG.debug('Rebuilding the list of instances to heal')
10539 db_instances = objects.InstanceList.get_by_host(
10540 context, self.host, expected_attrs=[], use_slave=True)
10541 for inst in db_instances:
10542 # We don't want to refresh the cache for instances
10543 # which are building or deleting so don't put them
10544 # in the list. If they are building they will get
10545 # added to the list next time we build it.
10546 if (inst.vm_state == vm_states.BUILDING):
10547 LOG.debug('Skipping network cache update for instance '
10548 'because it is Building.', instance=inst)
10549 continue
10550 if (inst.task_state == task_states.DELETING):
10551 LOG.debug('Skipping network cache update for instance '
10552 'because it is being deleted.', instance=inst)
10553 continue
10555 if not instance:
10556 # Save the first one we find so we don't
10557 # have to get it again
10558 instance = inst
10559 else:
10560 instance_uuids.append(inst['uuid'])
10562 self._instance_uuids_to_heal = instance_uuids
10563 else:
10564 # Find the next valid instance on the list
10565 while instance_uuids: 10565 ↛ 10589 (line 10565 didn't jump to line 10589 because the condition on line 10565 was always true)
10566 try:
10567 inst = objects.Instance.get_by_uuid(
10568 context, instance_uuids.pop(0),
10569 expected_attrs=['system_metadata', 'info_cache',
10570 'flavor'],
10571 use_slave=True)
10572 except exception.InstanceNotFound:
10573 # Instance is gone. Try to grab another.
10574 continue
10576 # Check the instance hasn't been migrated
10577 if inst.host != self.host:
10578 LOG.debug('Skipping network cache update for instance '
10579 'because it has been migrated to another '
10580 'host.', instance=inst)
10581 # Check the instance isn't being deleted
10582 elif inst.task_state == task_states.DELETING:
10583 LOG.debug('Skipping network cache update for instance '
10584 'because it is being deleted.', instance=inst)
10585 else:
10586 instance = inst
10587 break
10589 if instance:
10590 # We have an instance now to refresh
10591 try:
10592 # Fix potential mismatch in port binding if evacuation failed
10593 # after reassigning the port binding to the dest host but
10594 # before the instance host is changed.
10595 # Do this only when instance has no pending task.
10596 if instance.task_state is None and \
10597 self._require_nw_info_update(context, instance):
10598 LOG.info("Updating ports in neutron", instance=instance)
10599 self.network_api.setup_instance_network_on_host(
10600 context, instance, self.host)
10601 # Call the network API to get instance info; this will
10602 # force an update to the instance's info_cache
10603 self.network_api.get_instance_nw_info(
10604 context, instance, force_refresh=True)
10605 LOG.debug('Updated the network info_cache for instance',
10606 instance=instance)
10607 except exception.InstanceNotFound:
10608 # Instance is gone.
10609 LOG.debug('Instance no longer exists. Unable to refresh',
10610 instance=instance)
10611 return
10612 except exception.InstanceInfoCacheNotFound:
10613 # InstanceInfoCache is gone.
10614 LOG.debug('InstanceInfoCache no longer exists. '
10615 'Unable to refresh', instance=instance)
10616 except Exception:
10617 LOG.error('An error occurred while refreshing the network '
10618 'cache.', instance=instance, exc_info=True)
10619 else:
10620 LOG.debug("Didn't find any instances for network info cache "
10621 "update.")
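# NOTE(editor): Illustrative sketch, not part of this module. The healing
# task above amortises its work: it keeps a list of instance UUIDs for
# this host, refreshes exactly one per periodic run, and rebuilds the list
# once it is exhausted, treating missing instances as non-fatal. A
# condensed stand-alone version of that round-robin pattern;
# list_instance_uuids() and refresh_info_cache() are hypothetical
# stand-ins for the DB and network API calls.
class InfoCacheHealer:
    def __init__(self, list_instance_uuids, refresh_info_cache):
        self._list_instance_uuids = list_instance_uuids
        self._refresh_info_cache = refresh_info_cache
        self._to_heal = []

    def run_once(self):
        """Refresh at most one instance per call."""
        if not self._to_heal:
            # the list is exhausted, rebuild it from the source of truth
            self._to_heal = list(self._list_instance_uuids())
        while self._to_heal:
            uuid = self._to_heal.pop(0)
            try:
                self._refresh_info_cache(uuid)
                return uuid
            except LookupError:
                # instance vanished in the meantime, try the next one
                continue
        return None

# usage sketch:
#   healer = InfoCacheHealer(db_list_uuids, refresh_one)
#   healer.run_once()   # called from the periodic task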
10623 @periodic_task.periodic_task
10624 def _poll_rebooting_instances(self, context):
10625 if CONF.reboot_timeout > 0:
10626 filters = {'task_state':
10627 [task_states.REBOOTING,
10628 task_states.REBOOT_STARTED,
10629 task_states.REBOOT_PENDING],
10630 'host': self.host}
10631 rebooting = objects.InstanceList.get_by_filters(
10632 context, filters, expected_attrs=[], use_slave=True)
10634 to_poll = []
10635 for instance in rebooting:
10636 if timeutils.is_older_than(instance.updated_at, 10636 ↛ 10635 (line 10636 didn't jump to line 10635 because the condition on line 10636 was always true)
10637 CONF.reboot_timeout):
10638 to_poll.append(instance)
10640 self.driver.poll_rebooting_instances(CONF.reboot_timeout, to_poll)
10642 @periodic_task.periodic_task
10643 def _poll_rescued_instances(self, context):
10644 if CONF.rescue_timeout > 0:
10645 filters = {'vm_state': vm_states.RESCUED,
10646 'host': self.host}
10647 rescued_instances = objects.InstanceList.get_by_filters(
10648 context, filters, expected_attrs=["system_metadata"],
10649 use_slave=True)
10651 to_unrescue = []
10652 for instance in rescued_instances:
10653 if timeutils.is_older_than(instance.launched_at,
10654 CONF.rescue_timeout):
10655 to_unrescue.append(instance)
10657 for instance in to_unrescue:
10658 self.compute_api.unrescue(context, instance)
10660 @periodic_task.periodic_task
10661 def _poll_unconfirmed_resizes(self, context):
10662 if CONF.resize_confirm_window == 0:
10663 return
10665 migrations = objects.MigrationList.get_unconfirmed_by_dest_compute(
10666 context, CONF.resize_confirm_window, self.host,
10667 use_slave=True)
10669 migrations_info = dict(migration_count=len(migrations),
10670 confirm_window=CONF.resize_confirm_window)
10672 if migrations_info["migration_count"] > 0: 10672 ↛ 10677 (line 10672 didn't jump to line 10677 because the condition on line 10672 was always true)
10673 LOG.info("Found %(migration_count)d unconfirmed migrations "
10674 "older than %(confirm_window)d seconds",
10675 migrations_info)
10677 def _set_migration_to_error(migration, reason, **kwargs):
10678 LOG.warning("Setting migration %(migration_id)s to error: "
10679 "%(reason)s",
10680 {'migration_id': migration.id, 'reason': reason},
10681 **kwargs)
10682 migration.status = 'error'
10683 migration.save()
10685 for migration in migrations:
10686 instance_uuid = migration.instance_uuid
10687 LOG.info("Automatically confirming migration "
10688 "%(migration_id)s for instance %(instance_uuid)s",
10689 {'migration_id': migration.id,
10690 'instance_uuid': instance_uuid})
10691 expected_attrs = ['metadata', 'system_metadata']
10692 try:
10693 instance = objects.Instance.get_by_uuid(context,
10694 instance_uuid, expected_attrs=expected_attrs,
10695 use_slave=True)
10696 except exception.InstanceNotFound:
10697 reason = (_("Instance %s not found") %
10698 instance_uuid)
10699 _set_migration_to_error(migration, reason)
10700 continue
10701 if instance.vm_state == vm_states.ERROR:
10702 reason = _("In ERROR state")
10703 _set_migration_to_error(migration, reason,
10704 instance=instance)
10705 continue
10706 # race condition: an instance in DELETING state should not have its
10707 # migration set to error, otherwise an instance that is being
10708 # deleted while still in RESIZED state
10709 # will not be able to confirm the resize
10710 if instance.task_state in [task_states.DELETING,
10711 task_states.SOFT_DELETING]:
10712 msg = ("Instance being deleted or soft deleted during resize "
10713 "confirmation. Skipping.")
10714 LOG.debug(msg, instance=instance)
10715 continue
10717 # race condition: This condition is hit when this method is
10718 # called between the save of the migration record with a status of
10719 # finished and the save of the instance object with a state of
10720 # RESIZED. The migration record should not be set to error.
10721 if instance.task_state == task_states.RESIZE_FINISH:
10722 msg = ("Instance still resizing during resize "
10723 "confirmation. Skipping.")
10724 LOG.debug(msg, instance=instance)
10725 continue
10727 vm_state = instance.vm_state
10728 task_state = instance.task_state
10729 if vm_state != vm_states.RESIZED or task_state is not None:
10730 reason = (_("In states %(vm_state)s/%(task_state)s, not "
10731 "RESIZED/None") %
10732 {'vm_state': vm_state,
10733 'task_state': task_state})
10734 _set_migration_to_error(migration, reason,
10735 instance=instance)
10736 continue
10737 try:
10738 self.compute_api.confirm_resize(context, instance,
10739 migration=migration)
10740 except Exception as e:
10741 LOG.info("Error auto-confirming resize: %s. "
10742 "Will retry later.", e, instance=instance)
10744 @periodic_task.periodic_task(spacing=CONF.shelved_poll_interval)
10745 def _poll_shelved_instances(self, context):
10747 if CONF.shelved_offload_time <= 0:
10748 return
10750 filters = {'vm_state': vm_states.SHELVED,
10751 'task_state': None,
10752 'host': self.host}
10753 shelved_instances = objects.InstanceList.get_by_filters(
10754 context, filters=filters, expected_attrs=['system_metadata'],
10755 use_slave=True)
10757 to_gc = []
10758 for instance in shelved_instances:
10759 sys_meta = instance.system_metadata
10760 shelved_at = timeutils.parse_strtime(sys_meta['shelved_at'])
10761 if timeutils.is_older_than(shelved_at, CONF.shelved_offload_time):
10762 to_gc.append(instance)
10764 cyclient = cyborg.get_client(context)
10765 for instance in to_gc:
10766 try:
10767 instance.task_state = task_states.SHELVING_OFFLOADING
10768 instance.save(expected_task_state=(None,))
10769 accel_uuids = []
10770 if instance.flavor.extra_specs.get('accel:device_profile'):
10771 # TODO(brinzhang): After cyborg support batch query ARQs
10772 # for more than one instances, we will improve efficiency
10773 # with this implementation.
10774 accel_uuids = cyclient.get_arq_uuids_for_instance(instance)
10775 self.shelve_offload_instance(
10776 context, instance, clean_shutdown=False,
10777 accel_uuids=accel_uuids)
10778 except Exception:
10779 LOG.exception('Periodic task failed to offload instance.',
10780 instance=instance)
10782 @periodic_task.periodic_task
10783 def _instance_usage_audit(self, context):
10784 if not CONF.instance_usage_audit:
10785 return
10787 begin, end = utils.last_completed_audit_period()
10788 if objects.TaskLog.get(context, 'instance_usage_audit', begin, end, 10788 ↛ 10790 (line 10788 didn't jump to line 10790 because the condition on line 10788 was never true)
10789 self.host):
10790 return
10792 instances = objects.InstanceList.get_active_by_window_joined(
10793 context, begin, end, host=self.host,
10794 expected_attrs=['system_metadata', 'info_cache', 'metadata',
10795 'flavor'],
10796 use_slave=True)
10797 num_instances = len(instances)
10798 errors = 0
10799 successes = 0
10800 LOG.info("Running instance usage audit for host %(host)s "
10801 "from %(begin_time)s to %(end_time)s. "
10802 "%(number_instances)s instances.",
10803 {'host': self.host,
10804 'begin_time': begin,
10805 'end_time': end,
10806 'number_instances': num_instances})
10807 start_time = time.time()
10808 task_log = objects.TaskLog(context)
10809 task_log.task_name = 'instance_usage_audit'
10810 task_log.period_beginning = begin
10811 task_log.period_ending = end
10812 task_log.host = self.host
10813 task_log.task_items = num_instances
10814 task_log.message = 'Instance usage audit started...'
10815 task_log.begin_task()
10816 for instance in instances:
10817 try:
10818 compute_utils.notify_usage_exists(
10819 self.notifier, context, instance, self.host,
10820 ignore_missing_network_data=False)
10821 successes += 1
10822 except Exception:
10823 LOG.exception('Failed to generate usage '
10824 'audit for instance '
10825 'on host %s', self.host,
10826 instance=instance)
10827 errors += 1
10828 task_log.errors = errors
10829 task_log.message = (
10830 'Instance usage audit ran for host %s, %s instances in %s seconds.'
10831 % (self.host, num_instances, time.time() - start_time))
10832 task_log.end_task()
10834 def _get_host_volume_bdms(self, context, use_slave=False):
10835 """Return all block device mappings on a compute host."""
10836 compute_host_bdms = []
10837 instances = objects.InstanceList.get_by_host(context, self.host,
10838 use_slave=use_slave)
10839 for instance in instances:
10840 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
10841 context, instance.uuid, use_slave=use_slave)
10842 instance_bdms = [bdm for bdm in bdms if bdm.is_volume]
10843 compute_host_bdms.append(dict(instance=instance,
10844 instance_bdms=instance_bdms))
10846 return compute_host_bdms
10848 def _update_volume_usage_cache(self, context, vol_usages):
10849 """Updates the volume usage cache table with a list of stats."""
10850 for usage in vol_usages:
10851 # Allow switching of greenthreads between queries.
10852 greenthread.sleep(0)
10853 vol_usage = objects.VolumeUsage(context)
10854 vol_usage.volume_id = usage['volume']
10855 vol_usage.instance_uuid = usage['instance'].uuid
10856 vol_usage.project_id = usage['instance'].project_id
10857 vol_usage.user_id = usage['instance'].user_id
10858 vol_usage.availability_zone = usage['instance'].availability_zone
10859 vol_usage.curr_reads = usage['rd_req']
10860 vol_usage.curr_read_bytes = usage['rd_bytes']
10861 vol_usage.curr_writes = usage['wr_req']
10862 vol_usage.curr_write_bytes = usage['wr_bytes']
10863 vol_usage.save()
10864 self.notifier.info(context, 'volume.usage', vol_usage.to_dict())
10865 compute_utils.notify_about_volume_usage(context, vol_usage,
10866 self.host)
10868 @periodic_task.periodic_task(spacing=CONF.volume_usage_poll_interval)
10869 def _poll_volume_usage(self, context):
10870 if CONF.volume_usage_poll_interval == 0:
10871 return
10873 compute_host_bdms = self._get_host_volume_bdms(context,
10874 use_slave=True)
10875 if not compute_host_bdms:
10876 return
10878 LOG.debug("Updating volume usage cache")
10879 try:
10880 vol_usages = self.driver.get_all_volume_usage(context,
10881 compute_host_bdms)
10882 except NotImplementedError:
10883 return
10885 self._update_volume_usage_cache(context, vol_usages)
10887 @periodic_task.periodic_task(spacing=CONF.sync_power_state_interval,
10888 run_immediately=True)
10889 def _sync_power_states(self, context):
10890 """Align power states between the database and the hypervisor.
10892 To sync power state data we make a driver call to get the number of
10893 virtual machines known by the hypervisor and compare it with the
10894 number of virtual machines known by the database; we then proceed in a
10895 lazy loop, one database record at a time, checking if the hypervisor
10896 has the same power state as is in the database.
10897 """
10898 db_instances = objects.InstanceList.get_by_host(context, self.host,
10899 expected_attrs=[],
10900 use_slave=True)
10902 try:
10903 num_vm_instances = self.driver.get_num_instances()
10904 except exception.VirtDriverNotReady as e:
10905 # If the virt driver is not ready, like ironic-api not being up
10906 # yet in the case of ironic, just log it and exit.
10907 LOG.info('Skipping _sync_power_states periodic task due to: %s', e)
10908 return
10910 num_db_instances = len(db_instances)
10912 if num_vm_instances != num_db_instances:
10913 LOG.warning("While synchronizing instance power states, found "
10914 "%(num_db_instances)s instances in the database "
10915 "and %(num_vm_instances)s instances on the "
10916 "hypervisor.",
10917 {'num_db_instances': num_db_instances,
10918 'num_vm_instances': num_vm_instances})
10920 def _sync(db_instance):
10921 # NOTE(melwitt): This must be synchronized as we query state from
10922 # two separate sources, the driver and the database.
10923 # They are set (in stop_instance) and read, in sync.
10924 @utils.synchronized(db_instance.uuid)
10925 def query_driver_power_state_and_sync():
10926 self._query_driver_power_state_and_sync(context, db_instance)
10928 try:
10929 query_driver_power_state_and_sync()
10930 except Exception:
10931 LOG.exception("Periodic sync_power_state task had an "
10932 "error while processing an instance.",
10933 instance=db_instance)
10935 self._syncs_in_progress.pop(db_instance.uuid)
10937 for db_instance in db_instances:
10938 # process syncs asynchronously - don't want instance locking to
10939 # block entire periodic task thread
10940 uuid = db_instance.uuid
10941 if uuid in self._syncs_in_progress: 10941 ↛ 10942 (line 10941 didn't jump to line 10942 because the condition on line 10941 was never true)
10942 LOG.debug('Sync already in progress for %s', uuid)
10943 else:
10944 LOG.debug('Triggering sync for uuid %s', uuid)
10945 self._syncs_in_progress[uuid] = True
10946 nova.utils.pass_context(self._sync_power_pool.spawn_n,
10947 _sync,
10948 db_instance)
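# NOTE(editor): Illustrative sketch, not part of this module.
# _sync_power_states fans the per-instance work out to a pool and guards
# each instance twice: a "sync in progress" marker prevents the same
# instance from being queued twice, and a per-UUID lock
# (utils.synchronized in nova) keeps the sync from racing with concurrent
# state changes such as stop_instance. A condensed stand-alone version of
# that pattern using only the stdlib; sync_one() is a hypothetical
# stand-in for _query_driver_power_state_and_sync().
import threading
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor

locks = defaultdict(threading.Lock)      # one lock per instance UUID
syncs_in_progress = {}
pool = ThreadPoolExecutor(max_workers=4)

def sync_one(uuid):
    pass  # stand-in for querying the driver and reconciling the DB record

def _sync(uuid):
    try:
        with locks[uuid]:                # serialise with other state changes
            sync_one(uuid)
    finally:
        syncs_in_progress.pop(uuid, None)

def trigger_sync(uuid):
    if uuid in syncs_in_progress:        # already queued or running, skip
        return
    syncs_in_progress[uuid] = True
    pool.submit(_sync, uuid)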
10950 def _query_driver_power_state_and_sync(self, context, db_instance):
10951 if db_instance.task_state is not None:
10952 LOG.info("During sync_power_state the instance has a "
10953 "pending task (%(task)s). Skip.",
10954 {'task': db_instance.task_state}, instance=db_instance)
10955 return
10956 # No pending tasks. Now try to figure out the real vm_power_state.
10957 try:
10958 vm_instance = self.driver.get_info(db_instance)
10959 vm_power_state = vm_instance.state
10960 except exception.InstanceNotFound:
10961 vm_power_state = power_state.NOSTATE
10962 # Note(maoy): the above get_info call might take a long time,
10963 # for example, because of a broken libvirt driver.
10964 try:
10965 self._sync_instance_power_state(context,
10966 db_instance,
10967 vm_power_state,
10968 use_slave=True)
10969 except exception.InstanceNotFound:
10970 # NOTE(hanlind): If the instance gets deleted during sync,
10971 # silently ignore.
10972 pass
10974 def _stop_unexpected_shutdown_instance(self, context, vm_state,
10975 db_instance, orig_db_power_state):
10976 # this is an exceptional case; make sure our data is up
10977 # to date before slamming through a power off
10978 vm_instance = self.driver.get_info(db_instance,
10979 use_cache=False)
10980 vm_power_state = vm_instance.state
10982 # if it still looks off, go ahead and call stop()
10983 if vm_power_state in (power_state.SHUTDOWN, 10983 ↛ exit (line 10983 didn't return from function '_stop_unexpected_shutdown_instance' because the condition on line 10983 was always true)
10984 power_state.CRASHED):
10986 LOG.warning("Instance shutdown by itself. Calling the "
10987 "stop API. Current vm_state: %(vm_state)s, "
10988 "current task_state: %(task_state)s, "
10989 "original DB power_state: %(db_power_state)s, "
10990 "current VM power_state: %(vm_power_state)s",
10991 {'vm_state': vm_state,
10992 'task_state': db_instance.task_state,
10993 'db_power_state': orig_db_power_state,
10994 'vm_power_state': vm_power_state},
10995 instance=db_instance)
10996 try:
10997 # Note(maoy): here we call the API instead of
10998 # brutally updating the vm_state in the database
10999 # to allow all the hooks and checks to be performed.
11000 if db_instance.shutdown_terminate:
11001 self.compute_api.delete(context, db_instance)
11002 else:
11003 self.compute_api.stop(context, db_instance)
11004 except Exception:
11005 # Note(maoy): there is no need to propagate the error
11006 # because the same power_state will be retrieved next
11007 # time and retried.
11008 # For example, there might be another task scheduled.
11009 LOG.exception("error during stop() in sync_power_state.",
11010 instance=db_instance)
11012 def _sync_instance_power_state(self, context, db_instance, vm_power_state,
11013 use_slave=False):
11014 """Align instance power state between the database and hypervisor.
11016 If the instance is not found on the hypervisor, but is in the database,
11017 then a stop() API will be called on the instance.
11018 """
11020 # We re-query the DB to get the latest instance info to minimize
11021 # (not eliminate) race condition.
11022 db_instance.refresh(use_slave=use_slave)
11023 db_power_state = db_instance.power_state
11024 vm_state = db_instance.vm_state
11026 if self.host != db_instance.host: 11026 ↛ 11034 (line 11026 didn't jump to line 11034 because the condition on line 11026 was never true)
11027 # on the sending end of nova-compute _sync_power_state
11028 # may have yielded to the greenthread performing a live
11029 # migration; this in turn has changed the resident-host
11030 # for the VM; However, the instance is still active, it
11031 # is just in the process of migrating to another host.
11032 # This implies that the compute source must relinquish
11033 # control to the compute destination.
11034 LOG.info("During the sync_power process the "
11035 "instance has moved from "
11036 "host %(src)s to host %(dst)s",
11037 {'src': db_instance.host,
11038 'dst': self.host},
11039 instance=db_instance)
11040 return
11041 elif db_instance.task_state is not None: 11041 ↛ 11047 (line 11041 didn't jump to line 11047 because the condition on line 11041 was never true)
11042 # on the receiving end of nova-compute, it could happen
11043 # that the DB instance already report the new resident
11044 # but the actual VM has not showed up on the hypervisor
11045 # yet. In this case, let's allow the loop to continue
11046 # and run the state sync in a later round
11047 LOG.info("During sync_power_state the instance has a "
11048 "pending task (%(task)s). Skip.",
11049 {'task': db_instance.task_state},
11050 instance=db_instance)
11051 return
11053 orig_db_power_state = db_power_state
11054 if vm_power_state != db_power_state:
11055 LOG.info('During _sync_instance_power_state the DB '
11056 'power_state (%(db_power_state)s) does not match '
11057 'the vm_power_state from the hypervisor '
11058 '(%(vm_power_state)s). Updating power_state in the '
11059 'DB to match the hypervisor.',
11060 {'db_power_state': db_power_state,
11061 'vm_power_state': vm_power_state},
11062 instance=db_instance)
11063 # power_state is always updated from hypervisor to db
11064 db_instance.power_state = vm_power_state
11065 db_instance.save()
11066 db_power_state = vm_power_state
11068 # Note(maoy): Now resolve the discrepancy between vm_state and
11069 # vm_power_state. We go through all possible vm_states.
11070 if vm_state in (vm_states.BUILDING, 11070 ↛ 11076 (line 11070 didn't jump to line 11076 because the condition on line 11070 was never true)
11071 vm_states.RESCUED,
11072 vm_states.RESIZED,
11073 vm_states.SUSPENDED,
11074 vm_states.ERROR):
11075 # TODO(maoy): we ignore these vm_state for now.
11076 pass
11077 elif vm_state == vm_states.ACTIVE:
11078 # The only rational power state should be RUNNING
11079 if vm_power_state in (power_state.SHUTDOWN,
11080 power_state.CRASHED):
11081 self._stop_unexpected_shutdown_instance(
11082 context, vm_state, db_instance, orig_db_power_state)
11083 elif vm_power_state == power_state.SUSPENDED:
11084 LOG.warning("Instance is suspended unexpectedly. Calling "
11085 "the stop API.", instance=db_instance)
11086 try:
11087 self.compute_api.stop(context, db_instance)
11088 except Exception:
11089 LOG.exception("error during stop() in sync_power_state.",
11090 instance=db_instance)
11091 elif vm_power_state == power_state.PAUSED:
11092 # Note(maoy): a VM may get into the paused state not only
11093 # because of user requests via API calls, but also
11094 # due to (temporary) external instrumentation.
11095 # Before the virt layer can reliably report the reason,
11096 # we simply ignore the state discrepancy. In many cases,
11097 # the VM state will go back to running after the external
11098 # instrumentation is done. See bug 1097806 for details.
11099 LOG.warning("Instance is paused unexpectedly. Ignore.",
11100 instance=db_instance)
11101 elif vm_power_state == power_state.NOSTATE:
11102 # Occasionally, depending on the status of the hypervisor,
11103 # which could be restarting for example, an instance may
11104 # not be found. Therefore just log the condition.
11105 LOG.warning("Instance is unexpectedly not found. Ignore.",
11106 instance=db_instance)
11107 elif vm_state == vm_states.STOPPED:
11108 if vm_power_state not in (power_state.NOSTATE,
11109 power_state.SHUTDOWN,
11110 power_state.CRASHED):
11111 LOG.warning("Instance is not stopped. Calling "
11112 "the stop API. Current vm_state: %(vm_state)s,"
11113 " current task_state: %(task_state)s, "
11114 "original DB power_state: %(db_power_state)s, "
11115 "current VM power_state: %(vm_power_state)s",
11116 {'vm_state': vm_state,
11117 'task_state': db_instance.task_state,
11118 'db_power_state': orig_db_power_state,
11119 'vm_power_state': vm_power_state},
11120 instance=db_instance)
11121 try:
11122 # NOTE(russellb) Force the stop, because normally the
11123 # compute API would not allow an attempt to stop a stopped
11124 # instance.
11125 self.compute_api.force_stop(context, db_instance)
11126 except Exception:
11127 LOG.exception("error during stop() in sync_power_state.",
11128 instance=db_instance)
11129 elif vm_state == vm_states.PAUSED:
11130 if vm_power_state in (power_state.SHUTDOWN,
11131 power_state.CRASHED):
11132 LOG.warning("Paused instance shutdown by itself. Calling "
11133 "the stop API.", instance=db_instance)
11134 try:
11135 self.compute_api.force_stop(context, db_instance)
11136 except Exception:
11137 LOG.exception("error during stop() in sync_power_state.",
11138 instance=db_instance)
11139 elif vm_state in (vm_states.SOFT_DELETED,
11140 vm_states.DELETED):
11141 if vm_power_state not in (power_state.NOSTATE,
11142 power_state.SHUTDOWN):
11143 # Note(maoy): this should be taken care of periodically in
11144 # _cleanup_running_deleted_instances().
11145 LOG.warning("Instance is not (soft-)deleted.",
11146 instance=db_instance)
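# --- Editor's sketch (illustrative only; not part of manager.py) ---
# The branching above reconciles the DB vm_state with the power state the
# hypervisor reports. A minimal standalone version of the same decision
# table, using plain strings in place of nova's vm_states/power_state
# constants (an assumption made for brevity):
def decide_power_sync_action(vm_state, vm_power_state):
    """Return 'stop', 'force_stop', 'warn' or 'ignore' for a state pair."""
    if vm_state == 'active':
        if vm_power_state in ('shutdown', 'crashed', 'suspended'):
            return 'stop'        # guest died or was suspended unexpectedly
        if vm_power_state in ('paused', 'nostate'):
            return 'warn'        # likely transient; just log it
    elif vm_state == 'stopped':
        if vm_power_state not in ('nostate', 'shutdown', 'crashed'):
            return 'force_stop'  # guest is running but should be stopped
    elif vm_state == 'paused':
        if vm_power_state in ('shutdown', 'crashed'):
            return 'force_stop'
    elif vm_state in ('soft-delete', 'deleted'):
        if vm_power_state not in ('nostate', 'shutdown'):
            return 'warn'        # the periodic cleanup task will reap it
    return 'ignore'

assert decide_power_sync_action('active', 'shutdown') == 'stop'
assert decide_power_sync_action('stopped', 'running') == 'force_stop'
# --- end sketch ---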
11148 @periodic_task.periodic_task
11149 def _reclaim_queued_deletes(self, context):
11150 """Reclaim instances that are queued for deletion."""
11151 interval = CONF.reclaim_instance_interval
11152 if interval <= 0:
11153 LOG.debug("CONF.reclaim_instance_interval <= 0, skipping...")
11154 return
11156 filters = {'vm_state': vm_states.SOFT_DELETED,
11157 'task_state': None,
11158 'host': self.host}
11159 instances = objects.InstanceList.get_by_filters(
11160 context, filters,
11161 expected_attrs=objects.instance.INSTANCE_DEFAULT_FIELDS,
11162 use_slave=True)
11163 for instance in instances:
11164 if self._deleted_old_enough(instance, interval):
11165 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
11166 context, instance.uuid)
11167 LOG.info('Reclaiming deleted instance', instance=instance)
11168 try:
11169 self._delete_instance(context, instance, bdms)
11170 except Exception as e:
11171 LOG.warning("Periodic reclaim failed to delete "
11172 "instance: %s",
11173 e, instance=instance)
11175 def _get_nodename(self, instance, refresh=False):
11176 """Helper method to get the name of the first available node
11177 on this host. This method should not be used with any operations
11178 on ironic instances since it does not handle multiple nodes.
11179 """
11180 node = self.driver.get_available_nodes(refresh=refresh)[0]
11181 LOG.debug("No node specified, defaulting to %s", node,
11182 instance=instance)
11183 return node
11185 def _update_available_resource_for_node(self, context, nodename,
11186 startup=False):
11188 try:
11189 self.rt.update_available_resource(context, nodename,
11190 startup=startup)
11191 except exception.ComputeHostNotFound:
11192 LOG.warning("Compute node '%s' not found in "
11193 "update_available_resource.", nodename)
11194 except exception.ReshapeFailed:
11195 # We're only supposed to get here on startup, if a reshape was
11196 # needed, was attempted, and failed. We want to kill the service.
11197 with excutils.save_and_reraise_exception():
11198 LOG.critical("Resource provider data migration failed "
11199 "fatally during startup for node %s.", nodename)
11200 except exception.ReshapeNeeded:
11201 # This exception should only find its way here if the virt driver's
11202 # update_provider_tree raised it incorrectly: either
11203 # a) After the resource tracker already caught it once and
11204 # reinvoked update_provider_tree with allocations. At this point
11205 # the driver is just supposed to *do* the reshape, so if it raises
11206 # ReshapeNeeded, it's a bug, and we want to kill the compute
11207 # service.
11208 # b) On periodic rather than startup (we only allow reshapes to
11209 # happen on startup). In this case we'll just make the logs red and
11210 # go again at the next periodic interval, where the same thing may
11211 # or may not happen again. Depending on the previous and intended
11212 # shape of the providers/inventories, this may not actually cause
11213 # any immediately visible symptoms (in terms of scheduling, etc.)
11214 # If this becomes a problem, we may wish to make it pop immediately
11215 # (e.g. disable the service).
11216 with excutils.save_and_reraise_exception():
11217 LOG.exception("ReshapeNeeded exception is unexpected here!")
11218 except exception.PlacementPciException:
11219 # If we are at startup and the Placement PCI inventory handling
11220 # failed then probably there is a configuration error. Propagate
11221 # the error up to kill the service.
11222 if startup:
11223 raise
11224 # If we are not at startup then we can assume that the
11225 # configuration was correct at startup so the error is probably
11226 # transient. Anyhow we cannot kill the service any more so just
11227 # log the error and continue.
11228 LOG.exception(
11229 "Error updating PCI resources for node %(node)s.",
11230 {'node': nodename})
11231 except exception.InvalidConfiguration as e:
11232 if startup:
11233 # If this happens during startup, we need to let it raise to
11234 # abort our service startup.
11235 raise
11236 else:
11237 LOG.error("Error updating resources for node %s: %s",
11238 nodename, e)
11239 except Exception:
11240 LOG.exception("Error updating resources for node %(node)s.",
11241 {'node': nodename})
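# --- Editor's sketch (illustrative only; not part of manager.py) ---
# The except chain above follows a single rule: errors that point at a bad
# configuration or a failed data migration are fatal at startup but are only
# logged on the periodic path. A generic version of that pattern; the
# exception class and function names here are hypothetical:
import logging

sketch_log = logging.getLogger(__name__)

class FatalConfigError(Exception):
    """Stands in for configuration-type failures that should abort startup."""

def update_node(update_fn, nodename, startup=False):
    try:
        update_fn(nodename)
    except FatalConfigError:
        if startup:
            # abort service startup so the operator notices immediately
            raise
        # after startup, assume the config was once valid; log and carry on
        sketch_log.exception("Error updating resources for node %s", nodename)
    except Exception:
        # anything else is logged and retried on the next periodic run
        sketch_log.exception("Error updating resources for node %s", nodename)
# --- end sketch ---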
11243 @periodic_task.periodic_task(spacing=CONF.update_resources_interval)
11244 def update_available_resource(self, context, startup=False):
11245 """See driver.get_available_resource()
11247 Periodic process that keeps the compute host's understanding of
11248 resource availability and usage in sync with the underlying hypervisor.
11250 :param context: security context
11251 :param startup: True if this is being called when the nova-compute
11252 service is starting, False otherwise.
11253 """
11254 try:
11255 nodenames = set(self.driver.get_available_nodes())
11256 except exception.VirtDriverNotReady:
11257 LOG.warning("Virt driver is not ready.")
11258 return
11260 compute_nodes_in_db = self._get_compute_nodes_in_db(context,
11261 nodenames,
11262 use_slave=True,
11263 startup=startup)
11265 self.rt.clean_compute_node_cache(compute_nodes_in_db)
11267 # Delete orphan compute nodes not reported by the driver but still in the db
11268 for cn in compute_nodes_in_db:
11269 if cn.hypervisor_hostname not in nodenames:
11270 # if the node could be migrated, we don't delete
11271 # the compute node database records
11272 if not self.driver.is_node_deleted(cn.hypervisor_hostname):
11273 LOG.warning(
11274 "Found orphan compute node %(id)s "
11275 "hypervisor host is %(hh)s, "
11276 "nodes are %(nodes)s. "
11277 "We are not deleting this as the driver "
11278 "says this node has not been deleted.",
11279 {'id': cn.id, 'hh': cn.hypervisor_hostname,
11280 'nodes': nodenames})
11281 continue
11283 LOG.info("Deleting orphan compute node %(id)s "
11284 "hypervisor host is %(hh)s, "
11285 "nodes are %(nodes)s",
11286 {'id': cn.id, 'hh': cn.hypervisor_hostname,
11287 'nodes': nodenames})
11288 try:
11289 cn.destroy()
11290 except exception.ObjectActionError:
11291 # NOTE(mgoddard): it's possible that another compute
11292 # service took ownership of this compute node since we
11293 # queried it due to a rebalance, and this will cause the
11294 # deletion to fail. Ignore the error in that case.
11295 LOG.info("Ignoring failure to delete orphan compute node "
11296 "%(id)s on hypervisor host %(hh)s due to "
11297 "possible node rebalance",
11298 {'id': cn.id, 'hh': cn.hypervisor_hostname})
11299 self.rt.remove_node(cn.hypervisor_hostname)
11300 self.reportclient.invalidate_resource_provider(cn.uuid)
11301 else:
11302 self.rt.remove_node(cn.hypervisor_hostname)
11303 # Delete the corresponding resource provider in placement,
11304 # along with any associated allocations.
11305 try:
11306 self.reportclient.delete_resource_provider(
11307 context, cn, cascade=True)
11308 except keystone_exception.ClientException as e:
11309 LOG.error(
11310 "Failed to delete compute node resource provider "
11311 "for compute node %s: %s", cn.uuid, str(e))
11313 for nodename in nodenames:
11314 self._update_available_resource_for_node(context, nodename,
11315 startup=startup)
11317 def _get_compute_nodes_in_db(self, context, nodenames, use_slave=False,
11318 startup=False):
11319 try:
11320 return objects.ComputeNodeList.get_all_by_host(context, self.host,
11321 use_slave=use_slave)
11322 except exception.NotFound:
11323 # If the driver is not reporting any nodenames we should not
11324 # expect there to be compute nodes so we just return in that case.
11325 # For example, this could be an ironic compute and it is not
11326 # managing any nodes yet.
11327 if nodenames:
11328 if startup:
11329 LOG.warning(
11330 "No compute node record found for host %s. If this is "
11331 "the first time this service is starting on this "
11332 "host, then you can ignore this warning.", self.host)
11333 else:
11334 LOG.error("No compute node record for host %s", self.host)
11335 return []
11337 @periodic_task.periodic_task(
11338 spacing=CONF.running_deleted_instance_poll_interval,
11339 run_immediately=True)
11340 def _cleanup_running_deleted_instances(self, context):
11341 """Cleanup any instances which are erroneously still running after
11342 having been deleted.
11344 Valid actions to take are:
11346 1. noop - do nothing
11347 2. log - log which instances are erroneously running
11348 3. reap - shutdown and cleanup any erroneously running instances
11349 4. shutdown - power off *and disable* any erroneously running
11350 instances
11352 The use-case for this cleanup task is: for various reasons, it may be
11353 possible for the database to show an instance as deleted but for that
11354 instance to still be running on a host machine (see bug
11355 https://bugs.launchpad.net/nova/+bug/911366).
11357 This cleanup task is a cross-hypervisor utility for finding these
11358 zombied instances and either logging the discrepancy (likely what you
11359 should do in production), or automatically reaping the instances (more
11360 appropriate for dev environments).
11361 """
11362 action = CONF.running_deleted_instance_action
11364 if action == "noop":
11365 return
11367 # NOTE(sirp): admin contexts don't ordinarily return deleted records
11368 with utils.temporary_mutation(context, read_deleted="yes"):
11370 try:
11371 instances = self._running_deleted_instances(context)
11372 except exception.VirtDriverNotReady:
11373 # Since this task runs immediately on startup, if the
11374 # hypervisor is not yet ready handle it gracefully.
11375 LOG.debug('Unable to check for running deleted instances '
11376 'at this time since the hypervisor is not ready.')
11377 return
11379 for instance in instances:
11380 if action == "log":
11381 LOG.warning("Detected instance with name label "
11382 "'%s' which is marked as "
11383 "DELETED but still present on host.",
11384 instance.name, instance=instance)
11386 elif action == 'shutdown':
11387 LOG.info("Powering off instance with name label "
11388 "'%s' which is marked as "
11389 "DELETED but still present on host.",
11390 instance.name, instance=instance)
11391 try:
11392 self.driver.power_off(context, instance)
11393 except Exception:
11394 LOG.warning("Failed to power off instance",
11395 instance=instance, exc_info=True)
11397 elif action == 'reap':
11398 LOG.info("Destroying instance with name label "
11399 "'%s' which is marked as "
11400 "DELETED but still present on host.",
11401 instance.name, instance=instance)
11402 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
11403 context, instance.uuid, use_slave=True)
11404 self.instance_events.clear_events_for_instance(instance)
11405 try:
11406 self._shutdown_instance(context, instance, bdms,
11407 notify=False)
11408 self._cleanup_volumes(context, instance, bdms,
11409 detach=False)
11410 except Exception as e:
11411 LOG.warning("Periodic cleanup failed to delete "
11412 "instance: %s",
11413 e, instance=instance)
11414 else:
11415 raise Exception(_("Unrecognized value '%s'"
11416 " for CONF.running_deleted_"
11417 "instance_action") % action)
11419 def _running_deleted_instances(self, context):
11420 """Returns a list of instances nova thinks is deleted,
11421 but the hypervisor thinks is still running.
11422 """
11423 timeout = CONF.running_deleted_instance_timeout
11424 filters = {'deleted': True,
11425 'soft_deleted': False}
11426 instances = self._get_instances_on_driver(context, filters)
11427 return [i for i in instances if self._deleted_old_enough(i, timeout)]
11429 def _deleted_old_enough(self, instance, timeout):
11430 deleted_at = instance.deleted_at
11431 if deleted_at:
11432 deleted_at = deleted_at.replace(tzinfo=None)
11433 return (not deleted_at or timeutils.is_older_than(deleted_at, timeout))
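# --- Editor's sketch (illustrative only; not part of manager.py) ---
# _deleted_old_enough treats a missing deleted_at as "old enough" and
# otherwise asks timeutils.is_older_than whether the timestamp is more than
# `timeout` seconds in the past. A plain-datetime equivalent:
import datetime

def deleted_old_enough(deleted_at, timeout):
    if not deleted_at:
        return True
    age = datetime.datetime.utcnow() - deleted_at.replace(tzinfo=None)
    return age > datetime.timedelta(seconds=timeout)

an_hour_ago = datetime.datetime.utcnow() - datetime.timedelta(hours=1)
assert deleted_old_enough(an_hour_ago, timeout=600)
assert deleted_old_enough(None, timeout=600)
# --- end sketch ---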
11435 @contextlib.contextmanager
11436 def _error_out_instance_on_exception(self, context, instance,
11437 instance_state=vm_states.ACTIVE):
11438 """Context manager to set instance.vm_state after some operation raises
11440 Used to handle NotImplementedError and InstanceFaultRollback errors
11441 and reset the instance vm_state and task_state. The vm_state is set
11442 to the $instance_state parameter and task_state is set to None.
11443 For all other types of exceptions, the vm_state is set to ERROR and
11444 the task_state is left unchanged (although most callers will have the
11445 @reverts_task_state decorator which will set the task_state to None).
11447 Re-raises the original exception *except* in the case of
11448 InstanceFaultRollback in which case the wrapped `inner_exception` is
11449 re-raised.
11451 :param context: The nova auth request context for the operation.
11452 :param instance: The instance to update. The vm_state will be set by
11453 this context manager when an exception is raised.
11454 :param instance_state: For NotImplementedError and
11455 InstanceFaultRollback this is the vm_state to set the instance to
11456 when handling one of those types of exceptions. By default the
11457 instance will be set to ACTIVE, but the caller should control this
11458 in case there have been no changes to the running state of the
11459 instance. For example, resizing a stopped server where prep_resize
11460 fails early and does not change the power state of the guest should
11461 not set the instance status to ACTIVE but remain STOPPED.
11462 This parameter is ignored for all other types of exceptions and the
11463 instance vm_state is set to ERROR.
11464 """
11465 # NOTE(mriedem): Why doesn't this method just save off the
11466 # original instance.vm_state here rather than use a parameter? Or use
11467 # instance_state=None as an override but default to the current
11468 # vm_state when rolling back.
11469 instance_uuid = instance.uuid
11470 try:
11471 yield
11472 except (NotImplementedError, exception.InstanceFaultRollback) as error:
11473 # Use reraise=False to determine if we want to raise the original
11474 # exception or something else.
11475 with excutils.save_and_reraise_exception(reraise=False) as ctxt:
11476 LOG.info("Setting instance back to %(state)s after: %(error)s",
11477 {'state': instance_state, 'error': error},
11478 instance_uuid=instance_uuid)
11479 self._instance_update(context, instance,
11480 vm_state=instance_state,
11481 task_state=None)
11482 if isinstance(error, exception.InstanceFaultRollback):
11483 # Raise the wrapped exception.
11484 raise error.inner_exception
11485 # Else re-raise the NotImplementedError.
11486 ctxt.reraise = True
11487 except Exception:
11488 LOG.exception('Setting instance vm_state to ERROR',
11489 instance_uuid=instance_uuid)
11490 with excutils.save_and_reraise_exception():
11491 # NOTE(mriedem): Why don't we pass clean_task_state=True here?
11492 self._set_instance_obj_error_state(instance)
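# --- Editor's sketch (illustrative only; not part of manager.py) ---
# The context manager above maps the exception type to the vm_state it
# restores: known, recoverable failures roll back to the caller-chosen state,
# everything else marks the instance as errored. A cut-down generic version
# with the nova instance object replaced by a plain dict (an assumption):
import contextlib

class FaultRollback(Exception):
    def __init__(self, inner):
        super().__init__(str(inner))
        self.inner_exception = inner

@contextlib.contextmanager
def error_out_on_exception(instance, rollback_state='active'):
    try:
        yield
    except (NotImplementedError, FaultRollback) as error:
        # recoverable: restore the requested state and clear the task
        instance['vm_state'] = rollback_state
        instance['task_state'] = None
        if isinstance(error, FaultRollback):
            raise error.inner_exception
        raise
    except Exception:
        # unexpected: flag the instance as errored and re-raise
        instance['vm_state'] = 'error'
        raise

inst = {'vm_state': 'stopped', 'task_state': 'resize_prep'}
try:
    with error_out_on_exception(inst, rollback_state='stopped'):
        raise NotImplementedError('driver does not support this')
except NotImplementedError:
    assert inst == {'vm_state': 'stopped', 'task_state': None}
# --- end sketch ---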
11494 def _process_instance_event(self, instance, event):
11495 _event = self.instance_events.pop_instance_event(instance, event)
11496 if _event:
11497 LOG.debug('Processing event %(event)s',
11498 {'event': event.key}, instance=instance)
11499 _event.send(event)
11500 else:
11501 # If it's a network-vif-unplugged event and the instance is being
11502 # deleted or live migrated then we don't need to make this a
11503 # warning as it's expected. There are other expected things which
11504 # could trigger this event like detaching an interface, but we
11505 # don't have a task state for that.
11506 # TODO(mriedem): We have other move operations and things like
11507 # hard reboot (probably rebuild as well) which trigger this event
11508 # but nothing listens for network-vif-unplugged. We should either
11509 # handle those other known cases or consider just not logging a
11510 # warning if we get this event and the instance is undergoing some
11511 # task state transition.
11512 if (event.name == 'network-vif-unplugged' and
11513 instance.task_state in (
11514 task_states.DELETING, task_states.MIGRATING)):
11515 LOG.debug('Received event %s for instance with task_state %s.',
11516 event.key, instance.task_state, instance=instance)
11517 else:
11518 LOG.warning('Received unexpected event %(event)s for '
11519 'instance with vm_state %(vm_state)s and '
11520 'task_state %(task_state)s.',
11521 {'event': event.key,
11522 'vm_state': instance.vm_state,
11523 'task_state': instance.task_state},
11524 instance=instance)
11526 def _process_instance_vif_deleted_event(self, context, instance,
11527 deleted_vif_id):
11528 # If an attached port is deleted by neutron, it needs to
11529 # be detached from the instance, and the info cache needs
11530 # to be updated.
11531 network_info = instance.info_cache.network_info
11532 for index, vif in enumerate(network_info):
11533 if vif['id'] == deleted_vif_id:
11534 LOG.info('Neutron deleted interface %(intf)s; '
11535 'detaching it from the instance and '
11536 'deleting it from the info cache',
11537 {'intf': vif['id']},
11538 instance=instance)
11539 profile = vif.get('profile', {}) or {} # profile can be None
11540 rps = profile.get('allocation')
11541 if rps:
11542 if isinstance(rps, dict):
11543 # if extended resource request extension is enabled
11544 # then we have a dict of providers, flatten it for the
11545 # log.
11546 rps = ','.join(rps.values())
11547 LOG.error(
11548 'The bound port %(port_id)s was deleted in Neutron but '
11549 'the resource allocations on the resource providers '
11550 '%(rp_uuid)s are leaked until the server '
11551 '%(server_uuid)s is deleted.',
11552 {'port_id': vif['id'],
11553 'rp_uuid': rps,
11554 'server_uuid': instance.uuid})
11556 del network_info[index]
11557 neutron.update_instance_cache_with_nw_info(
11558 self.network_api, context, instance, nw_info=network_info)
11559 try:
11560 self.driver.detach_interface(context, instance, vif)
11561 except NotImplementedError:
11562 # Not all virt drivers support attach/detach of interfaces
11563 # yet (like Ironic), so just ignore this.
11564 pass
11565 except exception.NovaException as ex:
11566 # If the instance was deleted before the interface was
11567 # detached, just log it at debug.
11568 log_level = (logging.DEBUG
11569 if isinstance(ex, exception.InstanceNotFound)
11570 else logging.WARNING)
11571 LOG.log(log_level,
11572 "Detach interface failed, "
11573 "port_id=%(port_id)s, reason: %(msg)s",
11574 {'port_id': deleted_vif_id, 'msg': ex},
11575 instance=instance)
11576 break
11578 @wrap_instance_event(prefix='compute')
11579 @wrap_instance_fault
11580 def extend_volume(self, context, instance, extended_volume_id):
11582 # If an attached volume is extended by cinder, it needs to
11583 # be extended by the virt driver so the host can detect its
11584 # new size, and the bdm needs to be updated.
11585 LOG.debug('Handling volume-extended event for volume %(vol)s',
11586 {'vol': extended_volume_id}, instance=instance)
11588 try:
11589 bdm = objects.BlockDeviceMapping.get_by_volume_and_instance(
11590 context, extended_volume_id, instance.uuid)
11591 except exception.NotFound:
11592 LOG.warning('Extend volume failed, '
11593 'volume %(vol)s is not attached to instance.',
11594 {'vol': extended_volume_id},
11595 instance=instance)
11596 return
11598 LOG.info('Cinder extended volume %(vol)s; '
11599 'extending it to detect new size',
11600 {'vol': extended_volume_id},
11601 instance=instance)
11602 volume = self.volume_api.get(context, bdm.volume_id)
11604 if bdm.connection_info is None:
11605 LOG.warning('Extend volume failed, '
11606 'attached volume %(vol)s has no connection_info',
11607 {'vol': extended_volume_id},
11608 instance=instance)
11609 return
11611 connection_info = jsonutils.loads(bdm.connection_info)
11612 bdm.volume_size = volume['size']
11613 bdm.save()
11615 if not self.driver.capabilities.get('supports_extend_volume', False):
11616 raise exception.ExtendVolumeNotSupported()
11618 try:
11619 self.driver.extend_volume(context, connection_info, instance,
11620 bdm.volume_size * units.Gi)
11621 except Exception as ex:
11622 LOG.warning('Extend volume failed, '
11623 'volume_id=%(volume_id)s, reason: %(msg)s',
11624 {'volume_id': extended_volume_id, 'msg': ex},
11625 instance=instance)
11626 raise
11628 @staticmethod
11629 def _is_state_valid_for_power_update_event(instance, target_power_state):
11630 """Check if the current state of the instance allows it to be
11631 a candidate for the power-update event.
11633 :param instance: The nova instance object.
11634 :param target_power_state: The desired target power state; this should
11635 either be "POWER_ON" or "POWER_OFF".
11636 :returns Boolean: True if the instance can be subjected to the
11637 power-update event.
11638 """
11639 if ((target_power_state == external_event_obj.POWER_ON and
11640 instance.task_state is None and
11641 instance.vm_state == vm_states.STOPPED and
11642 instance.power_state == power_state.SHUTDOWN) or
11643 (target_power_state == external_event_obj.POWER_OFF and
11644 instance.task_state is None and
11645 instance.vm_state == vm_states.ACTIVE and
11646 instance.power_state == power_state.RUNNING)):
11647 return True
11648 return False
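# --- Editor's sketch (illustrative only; not part of manager.py) ---
# The check above accepts exactly two combinations, and only when no task is
# in flight: POWER_ON for a stopped/shutdown instance and POWER_OFF for an
# active/running one. An equivalent lookup using plain strings:
_VALID_POWER_UPDATE = {
    'POWER_ON': ('stopped', 'shutdown'),
    'POWER_OFF': ('active', 'running'),
}

def is_valid_power_update(target, task_state, vm_state, power_state_name):
    expected = _VALID_POWER_UPDATE.get(target)
    return (expected is not None and
            task_state is None and
            (vm_state, power_state_name) == expected)

assert is_valid_power_update('POWER_ON', None, 'stopped', 'shutdown')
assert not is_valid_power_update('POWER_ON', 'powering-on', 'stopped', 'shutdown')
# --- end sketch ---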
11650 @wrap_exception()
11651 @reverts_task_state
11652 @wrap_instance_event(prefix='compute')
11653 @wrap_instance_fault
11654 def power_update(self, context, instance, target_power_state):
11655 """Power update of an instance prompted by an external event.
11656 :param context: The API request context.
11657 :param instance: The nova instance object.
11658 :param target_power_state: The desired target power state;
11659 this should either be "POWER_ON" or
11660 "POWER_OFF".
11661 """
11663 @utils.synchronized(instance.uuid)
11664 def do_power_update():
11665 LOG.debug('Handling power-update event with target_power_state %s '
11666 'for instance', target_power_state, instance=instance)
11667 if not self._is_state_valid_for_power_update_event(
11668 instance, target_power_state):
11669 pow_state = fields.InstancePowerState.from_index(
11670 instance.power_state)
11671 LOG.info('The power-update %(tag)s event for instance '
11672 '%(uuid)s is a no-op since the instance is in '
11673 'vm_state %(vm_state)s, task_state '
11674 '%(task_state)s and power_state '
11675 '%(power_state)s.',
11676 {'tag': target_power_state, 'uuid': instance.uuid,
11677 'vm_state': instance.vm_state,
11678 'task_state': instance.task_state,
11679 'power_state': pow_state})
11680 return
11681 LOG.debug("Trying to %s instance",
11682 target_power_state, instance=instance)
11683 if target_power_state == external_event_obj.POWER_ON:
11684 action = fields.NotificationAction.POWER_ON
11685 notification_name = "power_on."
11686 instance.task_state = task_states.POWERING_ON
11687 else:
11688 # It's POWER_OFF
11689 action = fields.NotificationAction.POWER_OFF
11690 notification_name = "power_off."
11691 instance.task_state = task_states.POWERING_OFF
11692 instance.progress = 0
11694 try:
11695 # Note that the task_state is set here rather than the API
11696 # because this is a best effort operation and deferring
11697 # updating the task_state until we get to the compute service
11698 # avoids error handling in the API and needing to account for
11699 # older compute services during rolling upgrades from Stein.
11700 # If we lose a race, UnexpectedTaskStateError is handled
11701 # below.
11702 instance.save(expected_task_state=[None])
11703 self._notify_about_instance_usage(context, instance,
11704 notification_name + "start")
11705 compute_utils.notify_about_instance_action(context, instance,
11706 self.host, action=action,
11707 phase=fields.NotificationPhase.START)
11708 # UnexpectedTaskStateError raised from the driver will be
11709 # handled below and not result in a fault, error notification
11710 # or failure of the instance action. Other driver errors like
11711 # NotImplementedError will record a fault, send an error
11712 # notification and mark the instance action as failed.
11713 self.driver.power_update_event(instance, target_power_state)
11714 self._notify_about_instance_usage(context, instance,
11715 notification_name + "end")
11716 compute_utils.notify_about_instance_action(context, instance,
11717 self.host, action=action,
11718 phase=fields.NotificationPhase.END)
11719 except exception.UnexpectedTaskStateError as e:
11720 # Handling the power-update event is best effort and if we lost
11721 # a race with some other action happening to the instance we
11722 # just log it and return rather than fail the action.
11723 LOG.info("The power-update event was possibly preempted: %s ",
11724 e.format_message(), instance=instance)
11725 return
11726 do_power_update()
11728 @wrap_exception()
11729 def external_instance_event(self, context, instances, events):
11730 # NOTE(danms): Some event types are handled by the manager, such
11731 # as when we're asked to update the instance's info_cache. If it's
11732 # not one of those, look for some thread(s) waiting for the event and
11733 # unblock them if so.
11734 for event in events:
11735 instance = [inst for inst in instances
11736 if inst.uuid == event.instance_uuid][0]
11737 LOG.debug('Received event %(event)s',
11738 {'event': event.key},
11739 instance=instance)
11740 if event.name == 'network-changed':
11741 try:
11742 LOG.debug('Refreshing instance network info cache due to '
11743 'event %s.', event.key, instance=instance)
11744 self.network_api.get_instance_nw_info(
11745 context, instance, refresh_vif_id=event.tag)
11746 except exception.NotFound as e:
11747 LOG.info('Failed to process external instance event '
11748 '%(event)s due to: %(error)s',
11749 {'event': event.key, 'error': str(e)},
11750 instance=instance)
11751 elif event.name == 'network-vif-deleted':
11752 try:
11753 self._process_instance_vif_deleted_event(context,
11754 instance,
11755 event.tag)
11756 except exception.NotFound as e:
11757 LOG.info('Failed to process external instance event '
11758 '%(event)s due to: %(error)s',
11759 {'event': event.key, 'error': str(e)},
11760 instance=instance)
11761 elif event.name == 'volume-extended':
11762 self.extend_volume(context, instance, event.tag)
11763 elif event.name == 'power-update':
11764 self.power_update(context, instance, event.tag)
11765 else:
11766 self._process_instance_event(instance, event)
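# --- Editor's sketch (illustrative only; not part of manager.py) ---
# The per-event branching above routes each external event by name, falling
# back to waking up any thread waiting on the event. The same idea as a
# dispatch table; the handlers here are hypothetical stand-ins:
def dispatch_external_event(event, handlers, default_handler):
    handler = handlers.get(event['name'], default_handler)
    return handler(event)

handled = []
handlers = {
    'network-changed': lambda e: handled.append(('refresh-cache', e['tag'])),
    'volume-extended': lambda e: handled.append(('extend-volume', e['tag'])),
}
dispatch_external_event({'name': 'volume-extended', 'tag': 'vol-1'},
                        handlers, lambda e: handled.append(('wait', None)))
assert handled == [('extend-volume', 'vol-1')]
# --- end sketch ---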
11768 @periodic_task.periodic_task(spacing=CONF.image_cache.manager_interval,
11769 external_process_ok=True)
11770 def _run_image_cache_manager_pass(self, context):
11771 """Run a single pass of the image cache manager."""
11773 if not self.driver.capabilities.get("has_imagecache", False):
11774 return
11776 # Determine what other nodes use this storage
11777 storage_users.register_storage_use(CONF.instances_path, CONF.host)
11778 nodes = storage_users.get_storage_users(CONF.instances_path)
11780 # Filter all_instances to only include those nodes which share this
11781 # storage path.
11782 # TODO(mikal): this should be further refactored so that the cache
11783 # cleanup code doesn't know what those instances are, just a remote
11784 # count, and then this logic should be pushed up the stack.
11785 filters = {'deleted': False,
11786 'soft_deleted': True,
11787 'host': nodes}
11788 filtered_instances = objects.InstanceList.get_by_filters(context,
11789 filters, expected_attrs=[], use_slave=True)
11791 self.driver.manage_image_cache(context, filtered_instances)
11793 def cache_images(self, context, image_ids):
11794 """Ask the virt driver to pre-cache a set of base images.
11796 :param context: The RequestContext
11797 :param image_ids: The image IDs to be cached
11798 :return: A dict, keyed by image-id where the values are one of:
11799 'cached' if the image was downloaded,
11800 'existing' if the image was already in the cache,
11801 'unsupported' if the virt driver does not support caching,
11802 'error' if the virt driver raised an exception.
11803 """
11805 results = {}
11807 LOG.info('Caching %i image(s) by request', len(image_ids))
11808 for image_id in image_ids:
11809 try:
11810 cached = self.driver.cache_image(context, image_id)
11811 if cached:
11812 results[image_id] = 'cached'
11813 else:
11814 results[image_id] = 'existing'
11815 except NotImplementedError:
11816 LOG.warning('Virt driver does not support image pre-caching;'
11817 ' ignoring request')
11818 # NOTE(danms): Yes, technically we could short-circuit here to
11819 # avoid trying the rest of the images, but it's very cheap to
11820 # just keep hitting the NotImplementedError to keep the logic
11821 # clean.
11822 results[image_id] = 'unsupported'
11823 except Exception as e:
11824 results[image_id] = 'error'
11825 LOG.error('Failed to cache image %(image_id)s: %(err)s',
11826 {'image_id': image_id,
11827 'err': e})
11829 return results
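# --- Editor's sketch (illustrative only; not part of manager.py) ---
# cache_images returns one status string per requested image. A caller-side
# example of summarizing a hypothetical result dict of that shape:
import collections

results = {'img-1': 'cached', 'img-2': 'existing', 'img-3': 'error'}
summary = collections.Counter(results.values())
assert summary == {'cached': 1, 'existing': 1, 'error': 1}
failed = sorted(i for i, status in results.items() if status == 'error')
assert failed == ['img-3']
# --- end sketch ---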
11831 @periodic_task.periodic_task(spacing=CONF.instance_delete_interval)
11832 def _run_pending_deletes(self, context):
11833 """Retry any pending instance file deletes."""
11834 LOG.debug('Cleaning up deleted instances')
11835 filters = {'deleted': True,
11836 'soft_deleted': False,
11837 'host': CONF.host,
11838 'cleaned': False}
11839 attrs = ['system_metadata']
11840 with utils.temporary_mutation(context, read_deleted='yes'):
11841 instances = objects.InstanceList.get_by_filters(
11842 context, filters, expected_attrs=attrs, use_slave=True)
11843 LOG.debug('There are %d instances to clean', len(instances))
11845 for instance in instances:
11846 attempts = int(instance.system_metadata.get('clean_attempts', '0'))
11847 LOG.debug('Instance has had %(attempts)s of %(max)s '
11848 'cleanup attempts',
11849 {'attempts': attempts,
11850 'max': CONF.maximum_instance_delete_attempts},
11851 instance=instance)
11852 if attempts < CONF.maximum_instance_delete_attempts:
11853 success = self.driver.delete_instance_files(instance)
11855 instance.system_metadata['clean_attempts'] = str(attempts + 1)
11856 if success:
11857 instance.cleaned = True
11858 with utils.temporary_mutation(context, read_deleted='yes'):
11859 instance.save()
11861 @periodic_task.periodic_task(spacing=CONF.instance_delete_interval)
11862 def _cleanup_incomplete_migrations(self, context):
11863 """Cleanup on failed resize/revert-resize operation and
11864 failed rollback live migration operation.
11866 During a resize/revert-resize operation, or after a failed rollback
11867 live migration operation, if the instance gets deleted then instance
11868 files might remain on either the source or destination compute node and
11869 other specific resources might not be cleaned up because of a race
11870 condition.
11871 """
11872 LOG.debug('Cleaning up deleted instances with incomplete migration ')
11873 migration_filters = {'host': CONF.host,
11874 'status': 'error'}
11875 migrations = objects.MigrationList.get_by_filters(context,
11876 migration_filters)
11878 if not migrations:
11879 return
11881 inst_uuid_from_migrations = set([migration.instance_uuid for migration
11882 in migrations])
11884 inst_filters = {'deleted': True, 'soft_deleted': False,
11885 'uuid': inst_uuid_from_migrations}
11886 attrs = ['info_cache', 'security_groups', 'system_metadata']
11887 with utils.temporary_mutation(context, read_deleted='yes'):
11888 instances = objects.InstanceList.get_by_filters(
11889 context, inst_filters, expected_attrs=attrs, use_slave=True)
11891 for instance in instances:
11892 if instance.host == CONF.host:
11893 continue
11894 for migration in migrations:
11895 if instance.uuid != migration.instance_uuid:
11896 continue
11897 self.driver.delete_instance_files(instance)
11898 # we are not sure whether the migration_context was applied
11899 # during the incomplete migration, so we need to apply/revert
11900 # the migration_context to get instance object content matching
11901 # the current host.
11902 revert = (True if migration.source_compute == CONF.host
11903 else False)
11904 with instance.mutated_migration_context(revert=revert):
11905 self.driver.cleanup_lingering_instance_resources(instance)
11907 try:
11908 migration.status = 'failed'
11909 migration.save()
11910 except exception.MigrationNotFound:
11911 LOG.warning("Migration %s is not found.",
11912 migration.id,
11913 instance=instance)
11914 break
11916 @messaging.expected_exceptions(exception.InstanceQuiesceNotSupported,
11917 exception.QemuGuestAgentNotEnabled,
11918 exception.NovaException,
11919 NotImplementedError)
11920 @wrap_exception()
11921 def quiesce_instance(self, context, instance):
11922 """Quiesce an instance on this host."""
11923 context = context.elevated()
11924 image_meta = objects.ImageMeta.from_instance(instance)
11925 self.driver.quiesce(context, instance, image_meta)
11927 def _wait_for_snapshots_completion(self, context, mapping):
11928 for mapping_dict in mapping:
11929 if mapping_dict.get('source_type') == 'snapshot':
11931 def _wait_snapshot():
11932 snapshot = self.volume_api.get_snapshot(
11933 context, mapping_dict['snapshot_id'])
11934 if snapshot.get('status') != 'creating':
11935 raise loopingcall.LoopingCallDone()
11937 timer = loopingcall.FixedIntervalLoopingCall(_wait_snapshot)
11938 timer.start(interval=0.5).wait()
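# --- Editor's sketch (illustrative only; not part of manager.py) ---
# The helper above polls each snapshot every 0.5s with a
# FixedIntervalLoopingCall until it leaves the 'creating' state. A plain-loop
# equivalent with a simulated status source (names are hypothetical):
import time

def wait_for_snapshot(get_status, interval=0.5, max_tries=120):
    for _ in range(max_tries):
        if get_status() != 'creating':
            return
        time.sleep(interval)
    raise TimeoutError('snapshot did not finish creating in time')

statuses = iter(['creating', 'creating', 'available'])
wait_for_snapshot(lambda: next(statuses), interval=0)
# --- end sketch ---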
11940 @messaging.expected_exceptions(exception.InstanceQuiesceNotSupported,
11941 exception.QemuGuestAgentNotEnabled,
11942 exception.NovaException,
11943 NotImplementedError)
11944 @wrap_exception()
11945 def unquiesce_instance(self, context, instance, mapping=None):
11946 """Unquiesce an instance on this host.
11948 If snapshots' image mapping is provided, it waits until snapshots are
11949 completed before unquiescing.
11950 """
11951 context = context.elevated()
11952 if mapping:
11953 try:
11954 self._wait_for_snapshots_completion(context, mapping)
11955 except Exception as error:
11956 LOG.exception("Exception while waiting completion of "
11957 "volume snapshots: %s",
11958 error, instance=instance)
11959 image_meta = objects.ImageMeta.from_instance(instance)
11960 self.driver.unquiesce(context, instance, image_meta)
11962 @periodic_task.periodic_task(spacing=CONF.instance_delete_interval)
11963 def _cleanup_expired_console_auth_tokens(self, context):
11964 """Remove all expired console auth tokens.
11966 Console authorization tokens and their connection data are stored
11967 in the database when a user asks for a console connection to an
11968 instance. After a time they expire. We periodically remove any expired
11969 tokens from the database.
11970 """
11971 objects.ConsoleAuthToken.clean_expired_console_auths(context)
11973 def _claim_pci_for_instance_vifs(self, ctxt, instance):
11974 """Claim PCI devices for the instance's VIFs on the compute node
11976 :param ctxt: Context
11977 :param instance: Instance object
11978 :return: <port ID: PciDevice> mapping for the VIFs that yielded a
11979 PCI claim on the compute node
11980 """
11981 pci_req_id_to_port_id = {}
11982 pci_reqs = []
11983 port_id_to_pci_dev = {}
11985 for vif in instance.get_network_info():
11986 pci_req = pci_req_module.get_instance_pci_request_from_vif(
11987 ctxt,
11988 instance,
11989 vif)
11990 if pci_req:
11991 pci_req_id_to_port_id[pci_req.request_id] = vif['id']
11992 pci_reqs.append(pci_req)
11994 if pci_reqs:
11995 # Create PCI requests and claim against PCI resource tracker
11996 # NOTE(adrianc): We claim against the same requests as on the
11997 # source node.
11998 vif_pci_requests = objects.InstancePCIRequests(
11999 requests=pci_reqs,
12000 instance_uuid=instance.uuid)
12002 # if we are called during live migration with NUMA topology
12003 # support, the PCI claim needs to consider the destination NUMA
12004 # topology, which is then stored in the migration_context
12005 dest_topo = None
12006 if instance.migration_context:
12007 dest_topo = instance.migration_context.new_numa_topology
12009 claimed_pci_devices_objs = self.rt.claim_pci_devices(
12010 ctxt, vif_pci_requests, dest_topo)
12012 # Update VIFMigrateData profile with the newly claimed PCI
12013 # device
12014 for pci_dev in claimed_pci_devices_objs:
12015 LOG.debug("PCI device: %s Claimed on destination node",
12016 pci_dev.address)
12017 port_id = pci_req_id_to_port_id[pci_dev.request_id]
12018 port_id_to_pci_dev[port_id] = pci_dev
12020 return port_id_to_pci_dev
12022 def _update_migrate_vifs_profile_with_pci(self,
12023 migrate_vifs,
12024 port_id_to_pci_dev):
12025 """Update migrate vifs profile with the claimed PCI devices
12027 :param migrate_vifs: list of VIFMigrateData objects
12028 :param port_id_to_pci_dev: a <port_id: PciDevice> mapping
12029 :return: None.
12030 """
12031 for mig_vif in migrate_vifs:
12032 port_id = mig_vif.port_id
12033 if port_id not in port_id_to_pci_dev:
12034 continue
12036 pci_dev = port_id_to_pci_dev[port_id]
12037 profile = copy.deepcopy(mig_vif.source_vif['profile'])
12038 profile['pci_slot'] = pci_dev.address
12039 profile['pci_vendor_info'] = ':'.join([pci_dev.vendor_id,
12040 pci_dev.product_id])
12041 if profile.get('card_serial_number'):
12042 # Assume it is there since Nova makes sure that PCI devices
12043 # tagged as remote-managed have a serial in PCI VPD.
12044 profile['card_serial_number'] = pci_dev.card_serial_number
12045 if profile.get('pf_mac_address'):
12046 profile['pf_mac_address'] = pci_dev.sriov_cap['pf_mac_address']
12047 if profile.get('vf_num'):
12048 profile['vf_num'] = pci_dev.sriov_cap['vf_num']
12050 if pci_dev.mac_address:
12051 profile['device_mac_address'] = pci_dev.mac_address
12053 mig_vif.profile = profile
12054 LOG.debug("Updating migrate VIF profile for port %(port_id)s:"
12055 "%(profile)s", {'port_id': port_id,
12056 'profile': profile})
12059# TODO(sbauza): Remove this proxy class in the X release once we drop the 5.x
12060# support.
12061# NOTE(sbauza): This proxy class will support the existing <=5.13 RPC calls
12062# from any RPC client but will also make sure that the new 6.0 RPC calls will
12063# be supported.
12064class _ComputeV5Proxy(object):
12066 target = messaging.Target(version='5.13')
12068 def __init__(self, manager):
12069 self.manager = manager
12071 def __getattr__(self, name):
12072 # NOTE(sbauza): Proxying all the other methods but the V5 ones.
12073 return getattr(self.manager, name)
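# --- Editor's sketch (illustrative only; not part of manager.py) ---
# _ComputeV5Proxy overrides only the methods whose 5.x signatures need
# adapting and forwards everything else to the wrapped manager through
# __getattr__. The delegation pattern in miniature (class names hypothetical):
class Proxy(object):
    def __init__(self, target):
        self._target = target

    def __getattr__(self, name):
        # only invoked when the attribute is not found on the proxy itself
        return getattr(self._target, name)

    def ping(self, *args, **kwargs):
        # adapted signature: accept and drop a legacy positional argument
        return self._target.ping()

class Real(object):
    def ping(self):
        return 'pong'

    def status(self):
        return 'ok'

proxy = Proxy(Real())
assert proxy.ping('legacy-arg') == 'pong'  # overridden, old signature accepted
assert proxy.status() == 'ok'              # transparently forwarded
# --- end sketch ---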
12075 # 5.0 support for block_migration argument
12076 def pre_live_migration(self, context, instance, block_migration, disk,
12077 migrate_data):
12078 return self.manager.pre_live_migration(context, instance, disk,
12079 migrate_data)
12081 # 5.1 support for legacy request_spec argument
12082 def prep_resize(self, context, image, instance, instance_type,
12083 request_spec, filter_properties, node,
12084 clean_shutdown, migration, host_list):
12085 if not isinstance(request_spec, objects.RequestSpec):
12086 # Prior to compute RPC API 5.1 conductor would pass a legacy dict
12087 # version of the request spec to compute and since Stein compute
12088 # could be sending that back to conductor on reschedule, so if we
12089 # got a dict convert it to an object.
12090 # TODO(mriedem): We can drop this compat code when we only support
12091 # compute RPC API >=6.0.
12092 request_spec = objects.RequestSpec.from_primitives(
12093 context, request_spec, filter_properties)
12094 # We don't have to set the new flavor on the request spec because
12095 # if we got here it was due to a reschedule from the compute and
12096 # the request spec would already have the new flavor in it from the
12097 # else block below.
12098 self.manager.prep_resize(context, image, instance, instance_type,
12099 request_spec, filter_properties, node,
12100 clean_shutdown, migration, host_list)
12102 # 5.2 support for optional request_spec argument
12103 def resize_instance(self, context, instance, image,
12104 migration, instance_type, clean_shutdown,
12105 request_spec=None):
12106 self.manager.resize_instance(context, instance, image,
12107 migration, instance_type, clean_shutdown,
12108 request_spec)
12110 # 5.2 support for optional request_spec argument
12111 def finish_resize(self, context, disk_info, image, instance,
12112 migration, request_spec=None):
12113 self.manager.finish_resize(context, disk_info, image, instance,
12114 migration, request_spec)
12116 # 5.2 support for optional request_spec argument
12117 def revert_resize(self, context, instance, migration, request_spec=None):
12118 self.manager.revert_resize(context, instance, migration, request_spec)
12120 # 5.2 support for optional request_spec argument
12121 def finish_revert_resize(
12122 self, context, instance, migration, request_spec=None):
12123 self.manager.finish_revert_resize(context, instance, migration,
12124 request_spec)
12126 # 5.2 support for optional request_spec argument
12127 # 5.13 support for optional accel_uuids argument
12128 def unshelve_instance(self, context, instance, image, filter_properties,
12129 node, request_spec=None, accel_uuids=None):
12130 self.manager.unshelve_instance(context, instance, image,
12131 filter_properties, node, request_spec,
12132 accel_uuids or [])
12134 # 5.3 support for optional migration and limits arguments
12135 def check_can_live_migrate_destination(self, ctxt, instance,
12136 block_migration, disk_over_commit,
12137 migration=None, limits=None):
12138 return self.manager.check_can_live_migrate_destination(
12139 ctxt, instance, block_migration, disk_over_commit,
12140 migration, limits)
12142 # 5.11 support for optional accel_uuids argument
12143 def build_and_run_instance(self, context, instance, image, request_spec,
12144 filter_properties, admin_password=None,
12145 injected_files=None, requested_networks=None,
12146 security_groups=None, block_device_mapping=None,
12147 node=None, limits=None, host_list=None, accel_uuids=None):
12148 self.manager.build_and_run_instance(
12149 context, instance, image, request_spec,
12150 filter_properties, accel_uuids, admin_password,
12151 injected_files, requested_networks,
12152 security_groups, block_device_mapping,
12153 node, limits, host_list)
12155 # 5.12 support for optional accel_uuids argument
12156 def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
12157 injected_files, new_pass, orig_sys_metadata,
12158 bdms, recreate, on_shared_storage,
12159 preserve_ephemeral, migration,
12160 scheduled_node, limits, request_spec,
12161 accel_uuids=None):
12162 self.manager.rebuild_instance(
12163 context, instance, orig_image_ref, image_ref,
12164 injected_files, new_pass, orig_sys_metadata,
12165 bdms, recreate, on_shared_storage,
12166 preserve_ephemeral, migration,
12167 scheduled_node, limits, request_spec,
12168 accel_uuids, False, None)
12170 # 5.13 support for optional accel_uuids argument
12171 def shelve_instance(self, context, instance, image_id,
12172 clean_shutdown, accel_uuids=None):
12173 self.manager.shelve_instance(context, instance, image_id,
12174 clean_shutdown, accel_uuids)
12176 # 5.13 support for optional accel_uuids argument
12177 def shelve_offload_instance(self, context, instance, clean_shutdown,
12178 accel_uuids=None):
12179 self.manager.shelve_offload_instance(
12180 context, instance, clean_shutdown, accel_uuids)
12182 # 6.0 drop unused request_spec argument
12183 def prep_snapshot_based_resize_at_dest(
12184 self, ctxt, instance, flavor, nodename, migration, limits,
12185 request_spec):
12186 return self.manager.prep_snapshot_based_resize_at_dest(
12187 ctxt, instance, flavor, nodename, migration, limits)
12189 # 6.0 drop unused request_spec argument
12190 def finish_snapshot_based_resize_at_dest(
12191 self, ctxt, instance, migration, snapshot_id, request_spec):
12192 self.manager.finish_snapshot_based_resize_at_dest(
12193 ctxt, instance, migration, snapshot_id)
12195 # 6.0 drop unused instance argument
12196 def check_instance_shared_storage(self, ctxt, instance, data):
12197 return self.manager.check_instance_shared_storage(ctxt, data)